fix(tll): tighten shouldUseTLL regex to reduce false-positives (review #9)

moralespanitz · moralespanitz · commit 9eacd794ca1c · 2026-05-05T22:22:55.000-05:00
The original gate was a single alternation regex that fired on any
single occurrence of `first|last|before|after|then|later|track|...`.
That over-fired on plain factual queries that incidentally contained
one of those tokens — `what is my first name`, `the model used before
GPT-4`, `track my spending` — pulling in unrelated TLL chain memories
on the augmented retrieval path.

Replaced the gate with a two-tier check:

  1. ORDERING_TERMS_RE — a curated set of single-token signals
     (first/last/before/after/then/later/earlier/previous/next/prior).
     Only fires TLL when TWO co-occur, e.g. "what aspects did I
     discuss BEFORE and AFTER X".
  2. SEQUENCE_PATTERNS — phrase-level structural signals
     (`in [what|the] [chronological|reverse] order`, `when did` /
     `after did`, `since when`, `over time`, `evolution of`,
     `history|timeline of`, `originally`/`initially`,
     `progression of`, `how X evolve(d)/shifted/changed`,
     `brought up`). A single phrase hit is enough.

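Removed `track`, `sequence`, and bare `order` from the gate — they
were the largest false-positive contributors. `evolution`, `history`,
`timeline`, and `progression` now match only when followed by `of`,
and `chronological` only inside an `in ... order` phrase.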

Updated `src/services/__tests__/tll-retrieval.test.ts`:
  - Positive list rewritten to canonical EO/MSR/TR shapes that hit
    one of the structural patterns or co-occurring ordering terms.
  - Negative list now includes the false-positive shapes the loose
    regex used to match (the three reviewer-cited ones plus a handful
    of single-ordering-term factual queries).

41/41 unit tests pass against the updated gate.
diff --git a/src/services/__tests__/tll-retrieval.test.ts b/src/services/__tests__/tll-retrieval.test.ts
@@ -52,44 +52,61 @@ function makeTllRepo(chainResult: string[]): {
 }
 
 describe('shouldUseTLL', () => {
+  // Canonical EO/MSR/TR question shapes — each fires either via two
+  // ordering terms or via a single structural sequence phrase. The
+  // pre-tightened regex over-fired on single-token ordering hits like
+  // "what is my first name" / "the model used before GPT-4"; the
+  // updated gate trades a few rare single-word matches for sharper
+  // precision on these canonical shapes.
   const positiveQueries = [
-    'in what order did the events happen',
-    'what came first in the sequence',
-    'what was the last meeting about',
-    'what happened before the merger',
-    'what changed after the launch',
+    // SEQUENCE_PATTERNS hits
+    'in what order did I bring up X',
+    'in chronological order list the events',
     'when did the user move to Berlin',
+    'since when has X been deprecated',
+    'how preferences shifted over time',
     'show the evolution of the project',
-    'list events in chronological order',
-    'reconstruct the sequence',
-    'build me a timeline of changes',
     'what is the history of this codebase',
-    'how preferences shifted over time',
+    'build me a timeline of changes',
+    'when the topic was brought up',
     'what did the user originally say',
     'what did they initially mention',
-    'first this then that',
-    'and later they switched',
-    'when the topic was brought up',
-    'track the progression of opinion',
     'show progression of editor choice',
+    'how did the architecture evolve',
+    'how have my preferences shifted',
+    // Two ordering terms (co-occurrence)
+    'first this then that',
+    'what aspects did I discuss before vs after the launch',
+    'first the migration, then the rollback',
+    'what came earlier and what came later',
   ];
 
   it.each(positiveQueries)('returns true for ordering query: %s', (q) => {
     expect(shouldUseTLL(q)).toBe(true);
   });
 
   it('matches case-insensitively', () => {
-    expect(shouldUseTLL('What Is The HISTORY?')).toBe(true);
-    expect(shouldUseTLL('TIMELINE please')).toBe(true);
+    expect(shouldUseTLL('What Is The HISTORY OF this?')).toBe(true);
+    expect(shouldUseTLL('TIMELINE OF events please')).toBe(true);
   });
 
   const negativeQueries = [
+    // Non-temporal shapes (existing coverage)
     'what is X',
     'list all the entities',
     'explain why this is a tool',
     'who is the current owner',
     'tell me about the project',
     'summarize the discussion',
+    // False-positive shapes that the prior loose regex incorrectly
+    // matched (review #9). These must stay false under the new gate.
+    'what is my first name',
+    'the model used before GPT-4',
+    'track my spending',
+    'we then moved on to lunch',
+    'what is the next step',
+    'is this the previous version',
+    'what came last in the queue',
   ];
 
   it.each(negativeQueries)('returns false for non-temporal query: %s', (q) => {
diff --git a/src/services/tll-retrieval.ts b/src/services/tll-retrieval.ts
@@ -29,11 +29,57 @@ import type { TllRepository } from '../db/repository-tll.js';
  */
 export const TLL_ENTITY_LOOKUP_SEED_LIMIT = 10;
 
-const ORDERING_QUERY_RE =
-  /\b(order|first|last|before|after|when did|evolution|chronological|sequence|timeline|history|over time|originally|initially|then|later|brought up|track|progression|how did .* evolve|in what order)\b/i;
+/**
+ * Single-token ordering signals. Matched in isolation these are too
+ * weak to gate TLL — "what is my FIRST name", "the model used BEFORE
+ * GPT-4", "we then moved on" all contain one of these but are not
+ * EO/MSR/TR queries. We require either two of them to co-occur, or
+ * one of the structural sequence patterns below, before firing.
+ */
+const ORDERING_TERMS_RE =
+  /\b(first|last|before|after|then|later|earlier|previous|next|prior)\b/gi;
 
+/**
+ * Structural sequence patterns. Each one is a phrase whose presence
+ * unambiguously indicates an ordering / temporal-reasoning question.
+ * Single-pattern hit is enough to gate TLL.
+ *
+ * Curated to keep precision high: "track my spending" and "what is my
+ * first name" must not match any pattern here. Add new patterns
+ * conservatively — a leak here will silently re-introduce the
+ * false-positive class this fix addresses.
+ */
+const SEQUENCE_PATTERNS: readonly RegExp[] = [
+  /\bin (what |the )?(chronological |reverse )?order\b/i,
+  /\b(when|after) did\b/i,
+  /\bsince when\b/i,
+  /\bover time\b/i,
+  /\bevolution of\b/i,
+  /\b(history|timeline) of\b/i,
+  /\bbrought up\b/i,
+  /\b(originally|initially)\b/i,
+  /\bprogression of\b/i,
+  /\bhow .{1,80}(evolved?|shifted?|changed)\b/i,
+  /\bwhat .{1,80}(originally|initially)\b/i,
+];
+
+/**
+ * Returns true if the query has the shape of an event-ordering / temporal
+ * question and should trigger TLL chain expansion. The gate is
+ * intentionally conservative: TLL only augments retrieval and is not the
+ * primary retrieval path, and an over-eager gate pulled irrelevant chain
+ * memories into plain-fact queries that happened to contain "first",
+ * "before", "track", etc.
+ *
+ * Two ordering-term matches in one query (e.g. "what did I discuss
+ * BEFORE and AFTER X") are a strong-enough signal on their own; one
+ * structural sequence phrase (e.g. "in what order", "evolution of",
+ * "since when") is also enough. A single ordering term alone is not.
+ */
 export function shouldUseTLL(query: string): boolean {
-  return ORDERING_QUERY_RE.test(query);
+  const orderingMatches = (query.match(ORDERING_TERMS_RE) ?? []).length;
+  if (orderingMatches >= 2) return true;
+  return SEQUENCE_PATTERNS.some((re) => re.test(query));
 }
 
 /**