Project Files
eval / cases / hard.jsonl
{"id":"gate-ambiguous-short-followup-1","component":"gate","input":{"prompt":"and the other one?","files":["small-project-note.txt","small-atlas-note.txt"],"threshold":0.7},"expected":{"decision":"ambiguous"}}
{"id":"gate-unanswerable-external-fact-1","component":"gate","input":{"prompt":"who won the most recent super bowl?","files":["large-architecture-doc.md"],"threshold":0.7},"expected":{"decision":"likely-unanswerable"}}
{"id":"rewrite-quoted-span-1","component":"rewrite","input":{"prompt":"summarize the tradeoff described in \"Platform Architecture Review\" for the session service database choice","count":4},"expected":{"minRewrites":3,"mustIncludeLabels":["original","keywords","quoted-span"]}}
{"id":"evidence-dedupe-cross-file-1","component":"evidence","input":{"entries":[{"content":"The launch date is June 12, 2026.","score":0.82,"sourceIdentifier":"file-project","sourceName":"small-project-note.txt"},{"content":"The launch date is June 12, 2026.","score":0.81,"sourceIdentifier":"file-atlas","sourceName":"small-atlas-note.txt"},{"content":"Project owner: Maya Chen.","score":0.78,"sourceIdentifier":"file-project","sourceName":"small-project-note.txt"}],"threshold":0.9,"maxEvidenceBlocks":4},"expected":{"dedupedLength":3}}
{"id":"rerank-completeness-priority-1","component":"rerank","input":{"prompt":"what database is used by the session service and what tradeoff is mentioned?","topK":3,"strategy":"heuristic-v1","entries":[{"content":"# Analytics\nThe analytics pipeline writes events into ClickHouse for aggregate dashboards.","score":0.84,"sourceIdentifier":"file-a-3","sourceName":"large-architecture-doc.md"},{"content":"# Session Service\nThe session service uses PostgreSQL for durable session state.","score":0.81,"sourceIdentifier":"file-a-2","sourceName":"large-architecture-doc.md"},{"content":"# Session Service\nThe session service uses PostgreSQL for durable session state. The tradeoff is higher write latency in exchange for consistency during failover.","score":0.79,"sourceIdentifier":"file-a-1","sourceName":"large-architecture-doc.md"}]},"expected":{"topSourceName":"large-architecture-doc.md","topIncludes":"higher write latency","lastIncludes":"ClickHouse","minTopLexicalOverlap":0.45}}
{"id":"rerank-diversity-penalty-1","component":"rerank","input":{"prompt":"what database is used by the session service?","topK":3,"strategy":"heuristic-v1","entries":[{"content":"# Session Service\nThe session service uses PostgreSQL for durable session state.","score":0.86,"sourceIdentifier":"file-a-1","sourceName":"large-architecture-doc.md"},{"content":"# Session Service\nThe session service uses PostgreSQL for durable session state and prioritizes consistency.","score":0.85,"sourceIdentifier":"file-a-2","sourceName":"large-architecture-doc.md"},{"content":"# Storage Overview\nRedis is used only for ephemeral cache warming and not for durable sessions.","score":0.7,"sourceIdentifier":"file-a-3","sourceName":"large-architecture-doc.md"}]},"expected":{"topIncludes":"PostgreSQL","lastIncludes":"Redis","minTopLexicalOverlap":0.5,"maxSecondDiversityPenalty":0.2}}
{"id":"hybrid-quoted-span-1","component":"hybrid","input":{"prompt":"what tradeoff is described in \"Platform Architecture Review\" for the session service database choice?","semanticEntries":[{"content":"Analytics uses ClickHouse for aggregate dashboards.","score":0.88,"sourceIdentifier":"file-a-sem-1","sourceName":"large-architecture-doc.md"},{"content":"The session service uses PostgreSQL for durable session state.","score":0.79,"sourceIdentifier":"file-a-sem-2","sourceName":"large-architecture-doc.md"}],"documents":[{"fileName":"large-architecture-doc.md","content":"# Platform Architecture Review\nSession Service\nThe session service uses PostgreSQL for durable session state.\n\nTradeoffs\nHigher write latency is accepted in exchange for consistency during failover.\n\nAnalytics\nClickHouse powers aggregate dashboards."}],"lexicalCandidateCount":4,"hybridCandidateCount":6,"semanticWeight":0.65,"lexicalWeight":0.35},"expected":{"mustContain":"Higher write latency","minLexicalEntries":2,"minHybridEntries":3}}