[CLAUDE] Docs: set up RAG Framework v1.3 governance + eval framework

- docs/governance/README.md: Path B delegation stub → AI_INFRA canonical Phase/BC vocabulary documented (9 phase + 10 BC SOLUTION_ERP-specific)
- .claude/rag.json: add _decision_log block (10 rationale entries) + add .claude/agents/**/*.md to corpus_paths (fix Case D harvest gap)
- eval/evaluator.md: inline executor spec v1.0 (Spec A strict)
- eval/golden-set-solution_erp.jsonl: 14-entry golden set v1.1 (5 gotcha + 3 pattern + 3 decision + 3 negative)
- eval/runs/2026-05-26-baseline-v1.0-failed.json: v1.0 attempt recall@5=0.455 FAIL — root cause diagnosis Case A/C/D
- eval/runs/2026-05-26-baseline-v1.1-pending.json: v1.1 attempt pending CLI restart for accurate numbers
- eval/trial-state-lock.json: 2-section split (quality_gate + drift_monitor) per v1.3 §6.2, 4-week milestones 2026-05-26 → 2026-06-23

CRITICAL lesson: bootstrap.py --project flag overrides collection name only. Use --config D:\...\SOLUTION_ERP\.claude\rag.json for correct project root. Old projects.json had root_path=AI_INFRA for solution_erp (Anti #24) — FIXED.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 13:14:23 +07:00
parent c506919d7d
commit b223466ded
7 changed files with 342 additions and 2 deletions
--- a/eval/runs/2026-05-26-baseline-v1.0-failed.json
+++ b/eval/runs/2026-05-26-baseline-v1.0-failed.json
@@ -0,0 +1,47 @@
+{
+  "run_date": "2026-05-26",
+  "golden_set_version": "v1.0",
+  "spec": "A",
+  "status": "FAIL",
+  "recall_at_5": 0.4545,
+  "hits": 5,
+  "positive_queries": 11,
+  "avg_top1_rerank_hits_only": 0.860,
+  "pass_gate": false,
+  "gate_threshold": 0.7,
+  "results": [
+    {"id":"q01","query":"gotcha #39 act_runner TCP timeout manual checkout bypass","hit":true,"top1_source":"docs/architecture.md","top1_rerank":0.887,"case":null},
+    {"id":"q02","query":"gotcha #41 paths-ignore docs-only CI skip path filter","hit":true,"top1_source":"docs/architecture.md","top1_rerank":0.910,"case":null},
+    {"id":"q03","query":"gotcha #44 silent 403 class-level Authorize policy endpoint","hit":true,"top1_source":"docs/changelog/sessions/2026-05-08-1945-s18-pe-v2-polish-clone-b.md","top1_rerank":0.859,"case":null},
+    {"id":"q04","query":"gotcha #17 EF migration 3-file rule Designer Snapshot commit","hit":false,"top1_source":"docs/STATUS.md","top1_rerank":0.488,"case":"A","note":"Expected ef-core-migration skill or gotchas.md. STATUS.md matched but rerank < 0.7. Short chunk density issue."},
+    {"id":"q05","query":"gotcha #25 IIS WebSocket SignalR negotiate module exclusion","hit":false,"top1_source":null,"top1_rerank":null,"case":"C","note":"0 results. Content exists in docs/gotchas.md ### 25 but query uses '#25' notation vs '### 25.' format. Also 'module exclusion' wrong term — actual is 'applicationHost webSocket section lock'."},
+    {"id":"q06","query":"CQRS MediatR Features.cs Command Validator Handler single file","hit":false,"top1_source":null,"top1_rerank":null,"case":"C","note":"0 results. Content exists in docs/rules.md §2.2 but 'Features.cs' not mentioned, 'single file' vs Vietnamese 'cùng 1 file'. Language + term mismatch."},
+    {"id":"q07","query":"Smart Friend adversarial reviewer quality ceiling independent","hit":false,"top1_source":null,"top1_rerank":null,"case":"D","note":"0 results. .claude/agents/reviewer.md contains Smart Friend guard but agents/*.md NOT in corpus_paths. Harvest gap — add agents to corpus."},
+    {"id":"q08","query":"PE V2 ApprovalWorkflow Steps Levels OR-of-N ApproverUserId","hit":true,"top1_source":".claude/agent-memory/investigator/MEMORY.md","top1_rerank":0.824,"case":null},
+    {"id":"q09","query":"Implementer isolation worktree DROPPED Windows MAX_PATH Dropbox","hit":false,"top1_source":null,"top1_rerank":null,"case":"D","note":"0 results. .claude/agents/implementer.md contains worktree decision but agents/*.md NOT in corpus_paths. Harvest gap — add agents to corpus."},
+    {"id":"q10","query":"sub-agent model inherit 1M Opus context parent spawn","hit":false,"top1_source":"docs/HANDOFF.md","top1_rerank":0.641,"case":"A","note":"Rerank 0.641 borderline < 0.7 threshold. HANDOFF.md has content but rerank filtered out. Rephrase with more specific anchor."},
+    {"id":"q11","query":"ApprovalWorkflow V1 V2 dual schema backward compatible fallback","hit":true,"top1_source":".claude/agent-memory/investigator/MEMORY.md","top1_rerank":0.824,"case":null},
+    {"id":"q12","query":"GraphQL subscription realtime resolver Apollo","hit":true,"top1_source":null,"top1_rerank":null,"case":null,"note":"CORRECT EXCLUSION — 0 results as expected"},
+    {"id":"q13","query":"Redis cache distributed session eviction TTL","hit":true,"top1_source":null,"top1_rerank":null,"case":null,"note":"CORRECT EXCLUSION — 0 results as expected"},
+    {"id":"q14","query":"Kubernetes Helm chart microservice deployment","hit":true,"top1_source":null,"top1_rerank":null,"case":null,"note":"CORRECT EXCLUSION — 0 results as expected"}
+  ],
+  "_diagnosis": {
+    "root_cause_summary": "DIFFERENT from AI_INFRA Anti #9 keyword stacking. SOLUTION_ERP v1.0 fails due to: (1) Corpus gap — .claude/agents/*.md NOT indexed [q07, q09 Case D]; (2) Query term mismatch — wrong notation/terms and English query terms vs Vietnamese content [q05, q06 Case C]; (3) Top-1 rerank below the 0.7 threshold — q04 at 0.488 (short-chunk density), q10 at 0.641 (borderline) [q04, q10 Case A].",
+    "case_breakdown": {
+      "case_A": ["q04 (EF 3-file rule)", "q10 (sub-agent model inherit)"],
+      "case_B": [],
+      "case_C": ["q05 (gotcha #25 IIS WebSocket)", "q06 (CQRS MediatR)"],
+      "case_D": ["q07 (Smart Friend reviewer)", "q09 (Implementer worktree DROPPED)"]
+    },
+    "fix_actions": {
+      "corpus_fix": "Add .claude/agents/**/*.md to corpus_paths in rag.json → re-bootstrap → fixes q07 + q09",
+      "query_rephrase": "v1.1 rephrase q04/q05/q06/q10 with: Vietnamese keyword anchors + correct notation (### 25 not #25) + drop absent terms (Features.cs, single file)"
+    }
+  },
+  "_lessons": [
+    "Anti #9 keyword stacking was AI_INFRA problem — SOLUTION_ERP has different failure mode: corpus gap + language mismatch",
+    "Notation matters: gotcha query must use '25. IIS' not '#25 IIS' to match actual docs/gotchas.md format",
+    "Vietnamese corpus requires Vietnamese keywords OR canonical English terms (ApprovalWorkflow, NOT 'approval flow')",
+    ".claude/agents/*.md files are valuable content — should be in corpus_paths"
+  ]
+}
--- a/eval/runs/2026-05-26-baseline-v1.1-pending.json
+++ b/eval/runs/2026-05-26-baseline-v1.1-pending.json
@@ -0,0 +1,33 @@
+{
+  "run_date": "2026-05-26",
+  "golden_set_version": "v1.1",
+  "spec": "A",
+  "status": "PENDING_RELOAD",
+  "note": "v1.1 baseline attempted after re-bootstrap (2949 chunks, correct SOLUTION_ERP root_path). Results unexpectedly worse than v1.0 — MCP server likely needs CLI restart to reload Qdrant/BM25 cache after bootstrap. Re-run needed.",
+  "recall_at_5_tentative": 0.3636,
+  "hits_tentative": 4,
+  "positive_queries": 11,
+  "pass_gate": false,
+  "results_tentative": [
+    {"id":"q01","hit":true,"top1_source":"docs/architecture.md","top1_rerank":0.887},
+    {"id":"q02","hit":true,"top1_source":"docs/architecture.md","top1_rerank":0.910},
+    {"id":"q03","hit":true,"top1_source":"docs/changelog/sessions/2026-05-08-1945-s18-pe-v2-polish-clone-b.md","top1_rerank":0.859},
+    {"id":"q04","hit":false,"note":"0 results — pending reload verify"},
+    {"id":"q05","hit":false,"note":"0 results — pending reload verify"},
+    {"id":"q06","hit":false,"note":"0 results — pending reload verify"},
+    {"id":"q07","hit":false,"note":"0 results — pending reload verify"},
+    {"id":"q08","hit":true,"top1_source":".claude/agent-memory/investigator/MEMORY.md","top1_rerank":0.824},
+    {"id":"q09","hit":false,"note":"0 results — pending reload verify"},
+    {"id":"q10","hit":false,"note":"0 results — pending reload verify"},
+    {"id":"q11","hit":false,"note":"0 results — pending reload verification, BUT a direct BM25 search returns 3 hits in investigator MEMORY.md — points to a pipeline issue"},
+    {"id":"q12","hit":true,"note":"CORRECT EXCLUSION"},
+    {"id":"q13","hit":true,"note":"CORRECT EXCLUSION"},
+    {"id":"q14","hit":true,"note":"CORRECT EXCLUSION"}
+  ],
+  "_diagnosis": {
+    "bm25_confirmed": "BM25 search 'ApprovalWorkflow V1 V2' → 3 hits investigator MEMORY.md (direct SQLite query). Data IS indexed.",
+    "qdrant_confirmed": "Qdrant 2949 points green. Source paths all SOLUTION_ERP correct.",
+    "likely_cause": "MCP server likely caches Qdrant collection discovery or the vector index. After bootstrap.py cleared and replaced the collection, the MCP server may still be using a stale embedding cache or connection. CLI restart needed.",
+    "action": "After CLI restart, re-run 14 queries as v1.1 official baseline."
+  }
+}