From dc61c563ba67a24522295f7726837462c7f35d90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 13:49:10 +0100 Subject: [PATCH 01/13] Add src/regex.ts core module + aggregate() regex filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - isRegexQuery(): detects /pattern/flags token in query string - buildApiQuery(): derives a safe GitHub API query from a regex query - top-level alternation A|B|C → "A OR B OR C" (GitHub OR operator) - otherwise: longest word-char literal sequence outside [...], escapes, metacharacters - qualifier tokens (filename:, language:, path:) preserved unchanged - <3 char result → warn (caller must use --regex-hint) - invalid regex → null filter + warn - aggregate(): new optional regexFilter parameter filters CodeMatch entries by testing at least one TextMatch.fragment against the RegExp Closes #111 Part of #110 --- src/aggregate.test.ts | 77 ++++++++++++++ src/aggregate.ts | 7 ++ src/regex.test.ts | 130 +++++++++++++++++++++++ src/regex.ts | 239 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 453 insertions(+) create mode 100644 src/regex.test.ts create mode 100644 src/regex.ts diff --git a/src/aggregate.test.ts b/src/aggregate.test.ts index bc7bd87..1a314c5 100644 --- a/src/aggregate.test.ts +++ b/src/aggregate.test.ts @@ -137,3 +137,80 @@ describe("aggregate", () => { expect(groups).toHaveLength(0); }); }); + +// ─── aggregate with regexFilter ─────────────────────────────────────────────── + +function makeMatchWithFragments(repo: string, path: string, fragments: string[]): CodeMatch { + return { + path, + repoFullName: repo, + htmlUrl: `https://github.com/${repo}/blob/main/${path}`, + archived: false, + textMatches: fragments.map((fragment) => ({ fragment, matches: [] })), + }; +} + +describe("aggregate — regexFilter", () => { + it("keeps matches where at least one fragment satisfies the regex", () => { + const matches: CodeMatch[] = [ + makeMatchWithFragments("myorg/repoA", "src/a.ts", ["import axios from 'axios'"]), + makeMatchWithFragments("myorg/repoA", "src/b.ts", ["const x = 1"]), + ]; + + const groups = aggregate(matches, new Set(), new Set(), false, /axios/); + expect(groups).toHaveLength(1); + expect(groups[0].matches).toHaveLength(1); + expect(groups[0].matches[0].path).toBe("src/a.ts"); + }); + + it("excludes the whole repo when no fragment matches the regex", () => { + const matches: CodeMatch[] = [ + makeMatchWithFragments("myorg/repoA", "src/a.ts", ["const x = 1"]), + makeMatchWithFragments("myorg/repoA", "src/b.ts", ["const y = 2"]), + ]; + + const groups = aggregate(matches, new Set(), new Set(), false, /axios/); + expect(groups).toHaveLength(0); + }); + + it("matches against any fragment in a multi-fragment match", () => { + const matches: CodeMatch[] = [ + makeMatchWithFragments("myorg/repoA", "src/a.ts", [ + "unrelated line", + 'import axios from "axios"', + ]), + ]; + + const groups = aggregate(matches, new Set(), new Set(), false, /axios/); + expect(groups).toHaveLength(1); + }); + + it("keeps all matches when regexFilter is undefined (backward compat)", () => { + const matches: CodeMatch[] = [ + makeMatchWithFragments("myorg/repoA", "src/a.ts", ["const x = 1"]), + makeMatchWithFragments("myorg/repoB", "src/b.ts", ["const y = 2"]), + ]; + + const groups = aggregate(matches, new Set(), new Set()); + expect(groups).toHaveLength(2); + }); + + it("respects regex flags (case-insensitive)", () => { + const matches: CodeMatch[] = [ + makeMatchWithFragments("myorg/repoA", "src/a.ts", ["import AXIOS from 'axios'"]), + ]; + + const groups = aggregate(matches, new Set(), new Set(), false, /axios/i); + expect(groups).toHaveLength(1); + }); + + it("works with matches having no textMatches (empty fragments array)", () => { + const matches: CodeMatch[] = [ + makeMatch("myorg/repoA", "src/a.ts"), // textMatches: [] + ]; + + // No fragments → regex can never match → repo excluded + const groups = aggregate(matches, new Set(), new Set(), false, /axios/); + expect(groups).toHaveLength(0); + }); +}); diff --git a/src/aggregate.ts b/src/aggregate.ts index 3fe7b91..84fb31f 100644 --- a/src/aggregate.ts +++ b/src/aggregate.ts @@ -34,11 +34,18 @@ export function aggregate( excludedRepos: Set, excludedExtractRefs: Set, includeArchived = false, + regexFilter?: RegExp, ): RepoGroup[] { const map = new Map(); for (const m of matches) { if (excludedRepos.has(m.repoFullName)) continue; if (!includeArchived && m.archived) continue; + // Fix: when a regex filter is active, only keep matches where at least one + // text_match fragment satisfies the pattern — see issue #111 + if (regexFilter !== undefined) { + const hasMatch = m.textMatches.some((tm) => regexFilter.test(tm.fragment)); + if (!hasMatch) continue; + } const list = map.get(m.repoFullName) ?? []; list.push(m); map.set(m.repoFullName, list); diff --git a/src/regex.test.ts b/src/regex.test.ts new file mode 100644 index 0000000..ae62477 --- /dev/null +++ b/src/regex.test.ts @@ -0,0 +1,130 @@ +import { describe, expect, it } from "bun:test"; +import { buildApiQuery, isRegexQuery } from "./regex.ts"; + +// ─── isRegexQuery ───────────────────────────────────────────────────────────── + +describe("isRegexQuery", () => { + it("returns true for a bare regex token", () => { + expect(isRegexQuery("/from.*axios/")).toBe(true); + }); + + it("returns true for a regex with flags", () => { + expect(isRegexQuery("/pattern/i")).toBe(true); + }); + + it("returns true for a query mixing qualifiers and regex", () => { + expect(isRegexQuery('filename:package.json /["\'"]axios["\'"]:/')); + }); + + it("returns false for a plain text query", () => { + expect(isRegexQuery("from axios")).toBe(false); + }); + + it("returns false for an empty string", () => { + expect(isRegexQuery("")).toBe(false); + }); + + it("returns false for a qualifier-only query", () => { + expect(isRegexQuery("filename:package.json")).toBe(false); + }); +}); + +// ─── buildApiQuery ──────────────────────────────────────────────────────────── + +describe("buildApiQuery — plain text passthrough", () => { + it("returns input unchanged when no regex token", () => { + const r = buildApiQuery("plain text query"); + expect(r.apiQuery).toBe("plain text query"); + expect(r.regexFilter).toBeNull(); + expect(r.warn).toBeUndefined(); + }); +}); + +describe("buildApiQuery — longest literal extraction", () => { + it("/from.*['\"]axios/ → axios", () => { + const r = buildApiQuery("/from.*['\"]axios/"); + expect(r.apiQuery).toBe("axios"); + expect(r.regexFilter).toEqual(/from.*['"]axios/); + expect(r.warn).toBeUndefined(); + }); + + it("/useState/ → useState (trivial literal)", () => { + const r = buildApiQuery("/useState/"); + expect(r.apiQuery).toBe("useState"); + expect(r.regexFilter).toEqual(/useState/); + }); + + it("/require\\(['\"']old-lib['\"]\\)/ → old-lib", () => { + const r = buildApiQuery("/require\\(['\"]old-lib['\"]\\)/"); + expect(r.apiQuery).toBe("old-lib"); + expect(r.regexFilter).not.toBeNull(); + }); +}); + +describe("buildApiQuery — top-level alternation → OR", () => { + it("/TODO|FIXME|HACK/ → TODO OR FIXME OR HACK", () => { + const r = buildApiQuery("/TODO|FIXME|HACK/"); + expect(r.apiQuery).toBe("TODO OR FIXME OR HACK"); + expect(r.regexFilter).toEqual(/TODO|FIXME|HACK/); + expect(r.warn).toBeUndefined(); + }); +}); + +describe("buildApiQuery — partial alternation falls back to longest literal", () => { + it("/(import|require).*someLongLib/ → someLongLib", () => { + const r = buildApiQuery("/(import|require).*someLongLib/"); + // The alternation is nested inside (...) — not top-level — so we fall back + // to the longest contiguous literal sequence: "someLongLib". + expect(r.apiQuery).toBe("someLongLib"); + expect(r.regexFilter).not.toBeNull(); + }); +}); + +describe("buildApiQuery — qualifier preservation", () => { + it('filename:package.json /[\'"]axios[\'"]:\\s*"/ → filename:package.json axios', () => { + const r = buildApiQuery("filename:package.json /['\"]axios['\"]:/"); + expect(r.apiQuery).toBe("filename:package.json axios"); + expect(r.regexFilter).not.toBeNull(); + expect(r.warn).toBeUndefined(); + }); + + it("preserves language: qualifier", () => { + const r = buildApiQuery("language:TypeScript /useState/"); + expect(r.apiQuery).toBe("language:TypeScript useState"); + }); + + it("preserves path: qualifier", () => { + const r = buildApiQuery("path:src/ /useState/"); + expect(r.apiQuery).toBe("path:src/ useState"); + }); +}); + +describe("buildApiQuery — flags", () => { + it("/pattern/i → compiles with i flag", () => { + const r = buildApiQuery("/pattern/i"); + expect(r.apiQuery).toBe("pattern"); + expect(r.regexFilter?.flags).toContain("i"); + }); + + it("/pattern/gi → g flag stripped, i kept", () => { + const r = buildApiQuery("/pattern/gi"); + expect(r.regexFilter?.flags).not.toContain("g"); + expect(r.regexFilter?.flags).toContain("i"); + }); +}); + +describe("buildApiQuery — warn cases", () => { + it("/[~^]?[0-9]+\\.[0-9]+/ → empty term + warn", () => { + const r = buildApiQuery("/[~^]?[0-9]+\\.[0-9]+/"); + expect(r.apiQuery).toBe(""); + expect(r.regexFilter).not.toBeNull(); + expect(r.warn).toBeDefined(); + }); + + it("/[/ (invalid regex) → empty term + warn + null filter", () => { + const r = buildApiQuery("/[/"); + expect(r.apiQuery).toBe(""); + expect(r.regexFilter).toBeNull(); + expect(r.warn).toBeDefined(); + }); +}); diff --git a/src/regex.ts b/src/regex.ts new file mode 100644 index 0000000..846aeb5 --- /dev/null +++ b/src/regex.ts @@ -0,0 +1,239 @@ +// ─── Regex query helpers ────────────────────────────────────────────────────── +// +// The GitHub REST API (/search/code) does not support /pattern/ regex syntax. +// These helpers detect regex queries, derive a safe literal term to send to the +// API (casting a wide net), and return a compiled RegExp for local post-filtering. + +/** GitHub search qualifier tokens to preserve unchanged (e.g. filename:foo.json). */ +const QUALIFIER_RE = /^(?:filename|language|path|extension|repo|org|NOT):?\S*/; + +/** + * Returns true if `q` contains a `/pattern/` or `/pattern/flags` token. + * A leading qualifier like `filename:package.json /regex/` is also matched. + */ +export function isRegexQuery(q: string): boolean { + return extractRegexToken(q) !== null; +} + +/** + * Given a raw query string (possibly mixing GitHub qualifiers and a /regex/flags + * token), returns: + * + * - `apiQuery` — the query safe to send to the GitHub REST API + * - `regexFilter` — the compiled RegExp to apply locally on `TextMatch.fragment` + * - `warn` — set when no exploitable literal term could be extracted; + * the caller should require `--regex-hint` before proceeding. + * + * When `q` contains no regex token the input is returned unchanged and + * `regexFilter` is `null`. + */ +export function buildApiQuery(q: string): { + apiQuery: string; + regexFilter: RegExp | null; + warn?: string; +} { + const token = extractRegexToken(q); + + // Plain-text query — nothing to do. + if (token === null) { + return { apiQuery: q, regexFilter: null }; + } + + // Compile the regex (strip the `g` flag — GitHub doesn't return all occurrences). + const { pattern, flags, raw } = token; + const safeFlags = flags.replace("g", ""); + let regexFilter: RegExp | null = null; + try { + regexFilter = new RegExp(pattern, safeFlags); + } catch { + // Fix: invalid regex pattern — warn and return empty query — see issue #111 + return { + apiQuery: "", + regexFilter: null, + warn: `Invalid regex pattern: /${pattern}/${flags}`, + }; + } + + // Separate qualifier tokens from the regex token. + const qualifiers = q + .replace(raw, "") + .trim() + .split(/\s+/) + .filter((t) => t.length > 0 && QUALIFIER_RE.test(t)) + .join(" "); + + // Derive the API search term from the regex pattern. + const { term, warn } = extractApiTerm(pattern); + const apiQuery = [qualifiers, term].filter(Boolean).join(" ").trim(); + + return { apiQuery, regexFilter, warn }; +} + +// ─── Internal helpers ───────────────────────────────────────────────────────── + +interface RegexToken { + /** Raw string as it appears in the query, e.g. "/from.*axios/i" */ + raw: string; + /** Pattern string without delimiters */ + pattern: string; + /** Flags string (may be empty) */ + flags: string; +} + +/** + * Extracts the first `/pattern/flags` token from a query string. + * Returns `null` when no regex token is found. + */ +function extractRegexToken(q: string): RegexToken | null { + // Match /pattern/flags where pattern doesn't contain unescaped newlines. + // The trailing flags are optional letters (gimsuy). + const m = q.match(/(?:^|\s)(\/(?:[^/\\]|\\.)+\/[gimsuy]*)/); + if (!m || !m[1]) return null; + const raw = m[1].trim(); + const lastSlash = raw.lastIndexOf("/"); + const pattern = raw.slice(1, lastSlash); + const flags = raw.slice(lastSlash + 1); + return { raw, pattern, flags }; +} + +/** + * Derive a literal API search term from a regex pattern. + * + * Strategy (in order): + * 1. If the pattern is a **top-level alternation** `A|B|C` (branches not + * nested inside `(...)` or `[...]`) → join branches with ` OR `. + * 2. Otherwise → extract all unescaped literal sequences, pick the longest one. + * 3. If the best term is shorter than 3 characters → return `warn`. + */ +function extractApiTerm(pattern: string): { term: string; warn?: string } { + // 1. Top-level alternation detection. + const branches = splitTopLevelAlternation(pattern); + if (branches.length > 1) { + // Each branch must yield a meaningful literal to use the OR strategy; + // otherwise fall through to the longest-literal approach. + const branchTerms = branches.map((b) => longestLiteralSequence(b)); + if (branchTerms.every((t) => t.length >= 1)) { + return { term: branchTerms.join(" OR ") }; + } + } + + // 2. Longest literal sequence. + const term = longestLiteralSequence(pattern); + if (term.length < 3) { + return { + term: "", + warn: + "No meaningful search term could be extracted from the regex pattern. " + + "Use --regex-hint to specify the term to send to the GitHub API.", + }; + } + return { term }; +} + +/** + * Split a regex pattern on top-level `|` characters — i.e. `|` that are not + * inside `(...)`, `[...]`, or preceded by a backslash. + */ +function splitTopLevelAlternation(pattern: string): string[] { + const branches: string[] = []; + let depth = 0; // tracks unescaped ( nesting + let inClass = false; // tracks [...] + let current = ""; + + for (let i = 0; i < pattern.length; i++) { + const ch = pattern[i]; + const prev = i > 0 ? pattern[i - 1] : ""; + + if (prev === "\\") { + current += ch; + continue; + } + + if (ch === "[" && !inClass) { + inClass = true; + current += ch; + } else if (ch === "]" && inClass) { + inClass = false; + current += ch; + } else if (ch === "(" && !inClass) { + depth++; + current += ch; + } else if (ch === ")" && !inClass) { + depth = Math.max(0, depth - 1); + current += ch; + } else if (ch === "|" && depth === 0 && !inClass) { + branches.push(current); + current = ""; + } else { + current += ch; + } + } + branches.push(current); + return branches; +} + +/** + * Extract the longest contiguous sequence of characters useful as a GitHub + * search term from a regex pattern fragment. + * + * Only `[a-zA-Z0-9_-]` characters are accumulated — punctuation and special + * characters that are valid regex literals (e.g. `\(`) are intentionally + * excluded because they produce poor search terms. + * Character classes `[...]` are skipped entirely. + * Uses `>=` when updating `best` so that later (more specific) sequences of + * equal length are preferred over earlier structural ones (e.g. `old-lib` + * is preferred over `require` in `/require\(['"]old-lib['"]\)/`). + */ +function longestLiteralSequence(pattern: string): string { + let best = ""; + let current = ""; + let i = 0; + + while (i < pattern.length) { + const ch = pattern[i]; + + // Skip entire character class [...] — its contents are never good search terms. + if (ch === "[") { + if (current.length >= best.length) best = current; + current = ""; + i++; // skip `[` + // Handle negation `[^` and literal `]` at the very start of the class. + if (i < pattern.length && pattern[i] === "^") i++; + if (i < pattern.length && pattern[i] === "]") i++; + // Advance until unescaped `]`. + while (i < pattern.length && pattern[i] !== "]") { + if (pattern[i] === "\\") i++; // skip escaped char inside class + i++; + } + i++; // skip closing `]` + continue; + } + + // Handle escape sequences. + if (ch === "\\") { + const next = pattern[i + 1] ?? ""; + // Only accumulate if the escaped char is itself a word character or hyphen. + if (/[a-zA-Z0-9_-]/.test(next)) { + current += next; + } else { + // Escaped punctuation (\(, \), \., …) — not a useful search char → break. + if (current.length >= best.length) best = current; + current = ""; + } + i += 2; + continue; + } + + // Only accumulate characters that make a useful GitHub search term. + if (/[a-zA-Z0-9_-]/.test(ch)) { + current += ch; + } else { + if (current.length >= best.length) best = current; + current = ""; + } + i++; + } + + if (current.length >= best.length) best = current; + return best; +} From 5264a37f1240d0e451cc1bf00ac425e7ad55cc0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 15:31:38 +0100 Subject: [PATCH 02/13] Address PR #113 review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - regex.ts: strip y (sticky) flag alongside g to prevent stateful RegExp.test() across independent fragments - regex.ts: preserve all non-regex tokens (free-text and qualifiers alike) in buildApiQuery — previously only qualifier-prefixed tokens were kept, silently dropping terms like 'useFeatureFlag NOT deprecated' - regex.ts: fix splitTopLevelAlternation escape detection — replace prev==='\\' check with an 'escaped' boolean so that \\|foo correctly identifies | as top-level (the double backslash escapes a backslash, not the pipe) - aggregate.ts: reset regexFilter.lastIndex = 0 before each .test() call to avoid false negatives with global/sticky regexes - regex.test.ts: add .toBe(true) to the missing assertion (line 16) - regex.test.ts: fix test description for qualifier-preservation case (remove non-existent :\s*\" from name) - regex.test.ts: add tests for y-flag stripping, free-text preservation, and escaped-backslash-before-pipe corner case --- src/aggregate.ts | 7 ++++++- src/regex.test.ts | 25 +++++++++++++++++++++++-- src/regex.ts | 40 ++++++++++++++++++++++++---------------- 3 files changed, 53 insertions(+), 19 deletions(-) diff --git a/src/aggregate.ts b/src/aggregate.ts index 84fb31f..fd66230 100644 --- a/src/aggregate.ts +++ b/src/aggregate.ts @@ -43,7 +43,12 @@ export function aggregate( // Fix: when a regex filter is active, only keep matches where at least one // text_match fragment satisfies the pattern — see issue #111 if (regexFilter !== undefined) { - const hasMatch = m.textMatches.some((tm) => regexFilter.test(tm.fragment)); + const hasMatch = m.textMatches.some((tm) => { + // Fix: reset lastIndex before each call — a global/sticky regex is + // stateful and would produce false negatives on subsequent fragments. + regexFilter.lastIndex = 0; + return regexFilter.test(tm.fragment); + }); if (!hasMatch) continue; } const list = map.get(m.repoFullName) ?? []; diff --git a/src/regex.test.ts b/src/regex.test.ts index ae62477..1cf34cc 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -13,7 +13,7 @@ describe("isRegexQuery", () => { }); it("returns true for a query mixing qualifiers and regex", () => { - expect(isRegexQuery('filename:package.json /["\'"]axios["\'"]:/')); + expect(isRegexQuery('filename:package.json /["\'"]axios["\'"]:/')).toBe(true); }); it("returns false for a plain text query", () => { @@ -68,6 +68,15 @@ describe("buildApiQuery — top-level alternation → OR", () => { expect(r.regexFilter).toEqual(/TODO|FIXME|HACK/); expect(r.warn).toBeUndefined(); }); + + it("/\\\\|foo/ — escaped backslash before | → | is top-level → falls back to longest literal 'foo'", () => { + // Pattern \\|foo: \\ is an escaped backslash (matches literal \), | is top-level. + // splitTopLevelAlternation gives ["\\", "foo"]; "\\" yields no useful literal + // so branchTerms fails the every->=1 check and we fall back to longestLiteralSequence. + const r = buildApiQuery("/\\\\|foo/"); + expect(r.apiQuery).toBe("foo"); + expect(r.regexFilter).not.toBeNull(); + }); }); describe("buildApiQuery — partial alternation falls back to longest literal", () => { @@ -81,13 +90,19 @@ describe("buildApiQuery — partial alternation falls back to longest literal", }); describe("buildApiQuery — qualifier preservation", () => { - it('filename:package.json /[\'"]axios[\'"]:\\s*"/ → filename:package.json axios', () => { + it("filename:package.json /['\"axios['\"]:/ → filename:package.json axios", () => { const r = buildApiQuery("filename:package.json /['\"]axios['\"]:/"); expect(r.apiQuery).toBe("filename:package.json axios"); expect(r.regexFilter).not.toBeNull(); expect(r.warn).toBeUndefined(); }); + it("preserves free-text terms alongside the regex token", () => { + const r = buildApiQuery("useFeatureFlag NOT deprecated /pattern/i"); + expect(r.apiQuery).toBe("useFeatureFlag NOT deprecated pattern"); + expect(r.regexFilter).not.toBeNull(); + }); + it("preserves language: qualifier", () => { const r = buildApiQuery("language:TypeScript /useState/"); expect(r.apiQuery).toBe("language:TypeScript useState"); @@ -111,6 +126,12 @@ describe("buildApiQuery — flags", () => { expect(r.regexFilter?.flags).not.toContain("g"); expect(r.regexFilter?.flags).toContain("i"); }); + + it("/pattern/iy → y (sticky) flag stripped, i kept", () => { + const r = buildApiQuery("/pattern/iy"); + expect(r.regexFilter?.flags).not.toContain("y"); + expect(r.regexFilter?.flags).toContain("i"); + }); }); describe("buildApiQuery — warn cases", () => { diff --git a/src/regex.ts b/src/regex.ts index 846aeb5..74643d9 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -4,9 +4,6 @@ // These helpers detect regex queries, derive a safe literal term to send to the // API (casting a wide net), and return a compiled RegExp for local post-filtering. -/** GitHub search qualifier tokens to preserve unchanged (e.g. filename:foo.json). */ -const QUALIFIER_RE = /^(?:filename|language|path|extension|repo|org|NOT):?\S*/; - /** * Returns true if `q` contains a `/pattern/` or `/pattern/flags` token. * A leading qualifier like `filename:package.json /regex/` is also matched. @@ -39,9 +36,10 @@ export function buildApiQuery(q: string): { return { apiQuery: q, regexFilter: null }; } - // Compile the regex (strip the `g` flag — GitHub doesn't return all occurrences). + // Compile the regex (strip stateful flags `g` and `y` — GitHub doesn't return + // all occurrences and `y` (sticky) makes RegExp.test() stateful via lastIndex). const { pattern, flags, raw } = token; - const safeFlags = flags.replace("g", ""); + const safeFlags = flags.replace(/[gy]/g, ""); let regexFilter: RegExp | null = null; try { regexFilter = new RegExp(pattern, safeFlags); @@ -54,17 +52,18 @@ export function buildApiQuery(q: string): { }; } - // Separate qualifier tokens from the regex token. - const qualifiers = q - .replace(raw, "") - .trim() - .split(/\s+/) - .filter((t) => t.length > 0 && QUALIFIER_RE.test(t)) - .join(" "); - // Derive the API search term from the regex pattern. const { term, warn } = extractApiTerm(pattern); - const apiQuery = [qualifiers, term].filter(Boolean).join(" ").trim(); + + // Rebuild the API query by replacing the regex token with the derived term, + // preserving all other tokens (qualifiers and free-text terms alike). + const apiQuery = q + .trim() + .split(/\s+/) + .map((t) => (t === raw ? term : t)) + .filter((t) => t.length > 0) + .join(" ") + .trim(); return { apiQuery, regexFilter, warn }; } @@ -139,13 +138,22 @@ function splitTopLevelAlternation(pattern: string): string[] { let depth = 0; // tracks unescaped ( nesting let inClass = false; // tracks [...] let current = ""; + let escaped = false; // true when current char is escaped by a preceding backslash for (let i = 0; i < pattern.length; i++) { const ch = pattern[i]; - const prev = i > 0 ? pattern[i - 1] : ""; - if (prev === "\\") { + if (escaped) { + // Current character is escaped — treat as literal, never as a delimiter. + current += ch; + escaped = false; + continue; + } + + if (ch === "\\") { + // Next character will be escaped. current += ch; + escaped = true; continue; } From 47cd845cf44d98faf201992d97a1bd2daf818a76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 16:10:50 +0100 Subject: [PATCH 03/13] Address second round of PR #113 review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - regex.ts: raise branch length threshold from >= 1 to >= 3 in extractApiTerm — short branches like /a|bc/ now fall through to the longest-literal path which enforces the existing '< 3 chars → warn' rule, preventing extremely broad API queries such as 'a OR bc' - aggregate.ts: align regexFilter type with buildApiQuery return value; accept RegExp | null | undefined (via RegExp | null optional param) and update guard from !== undefined to != null so callers can pass the result of buildApiQuery directly without a null→undefined conversion - regex.test.ts: add test for short-branch alternation fallback (/a|bc/) - aggregate.test.ts: add test for null regexFilter treated as no-op --- src/aggregate.test.ts | 10 ++++++++++ src/aggregate.ts | 4 ++-- src/regex.test.ts | 9 +++++++++ src/regex.ts | 8 +++++--- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/aggregate.test.ts b/src/aggregate.test.ts index 1a314c5..92a0f83 100644 --- a/src/aggregate.test.ts +++ b/src/aggregate.test.ts @@ -195,6 +195,16 @@ describe("aggregate — regexFilter", () => { expect(groups).toHaveLength(2); }); + it("keeps all matches when regexFilter is null (backward compat — null treated as no filter)", () => { + const matches: CodeMatch[] = [ + makeMatchWithFragments("myorg/repoA", "src/a.ts", ["const x = 1"]), + makeMatchWithFragments("myorg/repoB", "src/b.ts", ["const y = 2"]), + ]; + + const groups = aggregate(matches, new Set(), new Set(), false, null); + expect(groups).toHaveLength(2); + }); + it("respects regex flags (case-insensitive)", () => { const matches: CodeMatch[] = [ makeMatchWithFragments("myorg/repoA", "src/a.ts", ["import AXIOS from 'axios'"]), diff --git a/src/aggregate.ts b/src/aggregate.ts index fd66230..33ef699 100644 --- a/src/aggregate.ts +++ b/src/aggregate.ts @@ -34,7 +34,7 @@ export function aggregate( excludedRepos: Set, excludedExtractRefs: Set, includeArchived = false, - regexFilter?: RegExp, + regexFilter?: RegExp | null, ): RepoGroup[] { const map = new Map(); for (const m of matches) { @@ -42,7 +42,7 @@ export function aggregate( if (!includeArchived && m.archived) continue; // Fix: when a regex filter is active, only keep matches where at least one // text_match fragment satisfies the pattern — see issue #111 - if (regexFilter !== undefined) { + if (regexFilter != null) { const hasMatch = m.textMatches.some((tm) => { // Fix: reset lastIndex before each call — a global/sticky regex is // stateful and would produce false negatives on subsequent fragments. diff --git a/src/regex.test.ts b/src/regex.test.ts index 1cf34cc..7d4453d 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -69,6 +69,15 @@ describe("buildApiQuery — top-level alternation → OR", () => { expect(r.warn).toBeUndefined(); }); + it("/a|bc/ — short branches (< 3 chars each) fall back to longest literal and warn", () => { + // branches: "a" (1 char) and "bc" (2 chars) — both < 3 → fall back to + // longestLiteralSequence("a|bc") → "bc" (2 chars) < 3 → warn + empty term + const r = buildApiQuery("/a|bc/"); + expect(r.warn).toBeDefined(); + expect(r.apiQuery).toBe(""); + expect(r.regexFilter).not.toBeNull(); + }); + it("/\\\\|foo/ — escaped backslash before | → | is top-level → falls back to longest literal 'foo'", () => { // Pattern \\|foo: \\ is an escaped backslash (matches literal \), | is top-level. // splitTopLevelAlternation gives ["\\", "foo"]; "\\" yields no useful literal diff --git a/src/regex.ts b/src/regex.ts index 74643d9..5a360cf 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -108,10 +108,12 @@ function extractApiTerm(pattern: string): { term: string; warn?: string } { // 1. Top-level alternation detection. const branches = splitTopLevelAlternation(pattern); if (branches.length > 1) { - // Each branch must yield a meaningful literal to use the OR strategy; - // otherwise fall through to the longest-literal approach. + // Each branch must yield a meaningful literal (>= 3 chars) to use the OR + // strategy — the same minimum enforced by the single-literal path below. + // Branches shorter than 3 chars (e.g. /a|bc/) fall through so the global + // "< 3 chars → warn + empty term" rule still applies. const branchTerms = branches.map((b) => longestLiteralSequence(b)); - if (branchTerms.every((t) => t.length >= 1)) { + if (branchTerms.every((t) => t.length >= 3)) { return { term: branchTerms.join(" OR ") }; } } From 4a52128d0ff0ec3a7d5d97c5e8b16d7a00115d2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 16:30:09 +0100 Subject: [PATCH 04/13] Fix extractRegexToken to cover all JS RegExp flags The flag character class now includes all current JS RegExp flags: g i m s u y (already present) + d (hasIndices, ES2022) and v (unicodeSets, ES2023). The comment is updated to list them explicitly. This ensures /pattern/s (dotAll), /pattern/d, and /pattern/v are tokenized correctly and the regex token is replaced in the API query rather than left as a raw /.../flags string unsafe for the GitHub API. Add test case for /pattern/s to assert the flag is recognized and the API query is correctly derived. Closes review comment on PR #113 --- src/regex.test.ts | 9 +++++++++ src/regex.ts | 6 ++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/regex.test.ts b/src/regex.test.ts index 7d4453d..cb39e17 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -130,6 +130,15 @@ describe("buildApiQuery — flags", () => { expect(r.regexFilter?.flags).toContain("i"); }); + it("/pattern/s → s (dotAll) flag recognized and preserved", () => { + // s is a valid JS RegExp flag (dotAll) — must be tokenized correctly + // so the /pattern/s token is replaced in the API query (not left as-is). + const r = buildApiQuery("/pattern/s"); + expect(r.apiQuery).toBe("pattern"); + expect(r.regexFilter).not.toBeNull(); + expect(r.regexFilter?.flags).toContain("s"); + }); + it("/pattern/gi → g flag stripped, i kept", () => { const r = buildApiQuery("/pattern/gi"); expect(r.regexFilter?.flags).not.toContain("g"); diff --git a/src/regex.ts b/src/regex.ts index 5a360cf..cc0c8cd 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -85,8 +85,10 @@ interface RegexToken { */ function extractRegexToken(q: string): RegexToken | null { // Match /pattern/flags where pattern doesn't contain unescaped newlines. - // The trailing flags are optional letters (gimsuy). - const m = q.match(/(?:^|\s)(\/(?:[^/\\]|\\.)+\/[gimsuy]*)/); + // The trailing flags are all current JS RegExp flag letters: + // g (global), i (ignoreCase), m (multiline), s (dotAll), + // u (unicode), y (sticky), d (hasIndices ES2022), v (unicodeSets ES2023). + const m = q.match(/(?:^|\s)(\/(?:[^/\\]|\\.)+\/[gimsuydev]*)/); if (!m || !m[1]) return null; const raw = m[1].trim(); const lastSlash = raw.lastIndexOf("/"); From 70e4f33c9fa73b4629d3c04e0f933887867dea3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 20:47:54 +0100 Subject: [PATCH 05/13] Address PR #113 round 4: end-of-token boundary, newline exclusion, lastIndex leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/regex.ts — extractRegexToken(): - Exclude \r and \n from the character class ([^/\\] → [^/\\\r\n]) so the comment ("doesn't contain unescaped newlines") matches reality. - Add (?=$|\s) end-of-token boundary after the flags so a word like /useState/iSomething is not mis-tokenised (the non-flag suffix stops the match before it can anchor to whitespace/end-of-string). src/regex.test.ts: - isRegexQuery('/useState/iSomething') → false (boundary regression) - buildApiQuery('/useState/iSomething') → passthrough, regexFilter null src/aggregate.ts — aggregate(): - Reset regexFilter.lastIndex = 0 after the some() loop so callers that reuse the same RegExp instance don't observe a stale lastIndex after aggregate() returns. src/aggregate.test.ts: - New test: verify lastIndex is 0 after aggregate() when using /g regex --- src/aggregate.test.ts | 11 +++++++++++ src/aggregate.ts | 3 +++ src/regex.test.ts | 13 +++++++++++++ src/regex.ts | 11 ++++++++--- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/aggregate.test.ts b/src/aggregate.test.ts index 92a0f83..58f1dbd 100644 --- a/src/aggregate.test.ts +++ b/src/aggregate.test.ts @@ -223,4 +223,15 @@ describe("aggregate — regexFilter", () => { const groups = aggregate(matches, new Set(), new Set(), false, /axios/); expect(groups).toHaveLength(0); }); + + it("restores lastIndex to 0 after filtering (does not leak state to caller)", () => { + // Regression: aggregate() must not leave a stale lastIndex on the RegExp + // instance so callers that reuse it get consistent results. + const matches: CodeMatch[] = [ + makeMatchWithFragments("myorg/repoA", "src/a.ts", ["import axios from 'axios'"]), + ]; + const regex = /axios/g; // global flag: lastIndex advances after a match + aggregate(matches, new Set(), new Set(), false, regex); + expect(regex.lastIndex).toBe(0); + }); }); diff --git a/src/aggregate.ts b/src/aggregate.ts index 33ef699..03e3904 100644 --- a/src/aggregate.ts +++ b/src/aggregate.ts @@ -49,6 +49,9 @@ export function aggregate( regexFilter.lastIndex = 0; return regexFilter.test(tm.fragment); }); + // Fix: restore lastIndex to 0 so callers that reuse the same RegExp + // instance don't observe a stale non-zero lastIndex after aggregate(). + regexFilter.lastIndex = 0; if (!hasMatch) continue; } const list = map.get(m.repoFullName) ?? []; diff --git a/src/regex.test.ts b/src/regex.test.ts index cb39e17..7649a36 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -27,6 +27,12 @@ describe("isRegexQuery", () => { it("returns false for a qualifier-only query", () => { expect(isRegexQuery("filename:package.json")).toBe(false); }); + + it("returns false when /pattern/flags is not end-bounded (e.g. /useState/iSomething)", () => { + // Regression: the token must be followed by whitespace or end-of-string; + // a suffix of non-flag characters must not be silently swallowed. + expect(isRegexQuery("/useState/iSomething")).toBe(false); + }); }); // ─── buildApiQuery ──────────────────────────────────────────────────────────── @@ -38,6 +44,13 @@ describe("buildApiQuery — plain text passthrough", () => { expect(r.regexFilter).toBeNull(); expect(r.warn).toBeUndefined(); }); + + it("/useState/iSomething is NOT treated as a regex token (boundary regression)", () => { + // 'iSomething' is not a valid flag sequence — the token should not match. + const r = buildApiQuery("/useState/iSomething"); + expect(r.apiQuery).toBe("/useState/iSomething"); + expect(r.regexFilter).toBeNull(); + }); }); describe("buildApiQuery — longest literal extraction", () => { diff --git a/src/regex.ts b/src/regex.ts index cc0c8cd..f8fa2d0 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -84,11 +84,16 @@ interface RegexToken { * Returns `null` when no regex token is found. */ function extractRegexToken(q: string): RegexToken | null { - // Match /pattern/flags where pattern doesn't contain unescaped newlines. - // The trailing flags are all current JS RegExp flag letters: + // Match /pattern/flags where: + // - the pattern is a non-empty sequence that doesn't contain an unescaped + // forward slash, backslash, or newline (\r / \n) + // - the token ends at end-of-string or a whitespace boundary, so we don't + // accidentally match a prefix of a longer non-delimited word + // (e.g. /foo/iSomething must NOT be recognised as a regex token). + // The trailing flags cover all current JS RegExp flag letters: // g (global), i (ignoreCase), m (multiline), s (dotAll), // u (unicode), y (sticky), d (hasIndices ES2022), v (unicodeSets ES2023). - const m = q.match(/(?:^|\s)(\/(?:[^/\\]|\\.)+\/[gimsuydev]*)/); + const m = q.match(/(?:^|\s)(\/(?:[^/\\\r\n]|\\.)+\/[gimsuydev]*)(?=$|\s)/); if (!m || !m[1]) return null; const raw = m[1].trim(); const lastSlash = raw.lastIndexOf("/"); From f2fe87da25ccfb4cf449290644fab04ead52dbca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 21:03:34 +0100 Subject: [PATCH 06/13] Address PR #113 round 5: remove invalid 'e' flag, clarify parser comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/regex.ts — extractRegexToken(): - Remove 'e' from the allowed flag set: [gimsuydev]* -> [gimsuydv]* 'e' is not a valid JavaScript RegExp flag; its presence would cause /pattern/e to be tokenised as a regex and then fail compilation, instead of being passed through unchanged as a plain query token. src/regex.ts — splitTopLevelAlternation(): - Clarify truncated inline comments: depth: 'tracks unescaped ( nesting' → 'tracks unescaped \"(\" nesting depth outside character classes' inClass: 'tracks [...]' → 'tracks whether we are currently inside a character class [...]' src/regex.test.ts: - isRegexQuery('/pattern/e') → false (regression test) - buildApiQuery('/pattern/e') → passthrough, regexFilter null --- src/regex.test.ts | 10 ++++++++++ src/regex.ts | 9 +++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/regex.test.ts b/src/regex.test.ts index 7649a36..553e842 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -33,6 +33,10 @@ describe("isRegexQuery", () => { // a suffix of non-flag characters must not be silently swallowed. expect(isRegexQuery("/useState/iSomething")).toBe(false); }); + + it("returns false for /pattern/e ('e' is not a valid JS RegExp flag)", () => { + expect(isRegexQuery("/pattern/e")).toBe(false); + }); }); // ─── buildApiQuery ──────────────────────────────────────────────────────────── @@ -51,6 +55,12 @@ describe("buildApiQuery — plain text passthrough", () => { expect(r.apiQuery).toBe("/useState/iSomething"); expect(r.regexFilter).toBeNull(); }); + + it("/pattern/e is NOT treated as a regex token ('e' is not a valid JS RegExp flag)", () => { + const r = buildApiQuery("/pattern/e"); + expect(r.apiQuery).toBe("/pattern/e"); + expect(r.regexFilter).toBeNull(); + }); }); describe("buildApiQuery — longest literal extraction", () => { diff --git a/src/regex.ts b/src/regex.ts index f8fa2d0..20500a8 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -90,10 +90,11 @@ function extractRegexToken(q: string): RegexToken | null { // - the token ends at end-of-string or a whitespace boundary, so we don't // accidentally match a prefix of a longer non-delimited word // (e.g. /foo/iSomething must NOT be recognised as a regex token). - // The trailing flags cover all current JS RegExp flag letters: + // The trailing flags cover all valid JS RegExp flag letters: // g (global), i (ignoreCase), m (multiline), s (dotAll), // u (unicode), y (sticky), d (hasIndices ES2022), v (unicodeSets ES2023). - const m = q.match(/(?:^|\s)(\/(?:[^/\\\r\n]|\\.)+\/[gimsuydev]*)(?=$|\s)/); + // Note: 'e' is intentionally excluded — it is not a valid JS RegExp flag. + const m = q.match(/(?:^|\s)(\/(?:[^/\\\r\n]|\\.)+\/[gimsuydv]*)(?=$|\s)/); if (!m || !m[1]) return null; const raw = m[1].trim(); const lastSlash = raw.lastIndexOf("/"); @@ -144,8 +145,8 @@ function extractApiTerm(pattern: string): { term: string; warn?: string } { */ function splitTopLevelAlternation(pattern: string): string[] { const branches: string[] = []; - let depth = 0; // tracks unescaped ( nesting - let inClass = false; // tracks [...] + let depth = 0; // tracks unescaped '(' nesting depth outside character classes + let inClass = false; // tracks whether we are currently inside a character class [...] let current = ""; let escaped = false; // true when current char is escaped by a preceding backslash From e6bcd62d074fcf0b5209601f7ab33c526c301f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 21:12:12 +0100 Subject: [PATCH 07/13] Fix longestLiteralSequence: break sequence on regex special escapes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, escaped characters like \b, \d, \s, \w, \p, \u, \x and digit backreferences were treated as literal word characters and accumulated into the API search term. This produced wrong results such as 'buseStateb' for the pattern \buseState\b, where \b is a word- boundary assertion, not the letter 'b'. Fix: when the escaped character is a letter/digit but is a known regex escape or backreference (bBdDsSwWpPuUxX, 0-9), break the current sequence instead of accumulating it. Only escaped punctuation or\nunambiguous literal chars (e.g. \- or \_ in some dialects) are kept. Tests added: - /\buseState\b/ → 'useState' (not 'buseStateb') - /\d+\.\d+/ → empty + warn - /foobar\sxyz/ → 'foobar' --- src/regex.test.ts | 21 +++++++++++++++++++++ src/regex.ts | 12 +++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/regex.test.ts b/src/regex.test.ts index 553e842..439c68b 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -175,6 +175,27 @@ describe("buildApiQuery — flags", () => { }); }); +describe("buildApiQuery — special escape handling in longestLiteralSequence", () => { + it("/\\buseState\\b/ → useState (word-boundary escapes do not contaminate the term)", () => { + // Regression: \b is a regex assertion, not the letter 'b'. + // The sequence must be broken at \b so 'useState' is extracted, not 'buseStateb'. + const r = buildApiQuery("/\\buseState\\b/"); + expect(r.apiQuery).toBe("useState"); + expect(r.regexFilter).not.toBeNull(); + }); + + it("/\\d+\\.\\d+/ → empty term + warn (\\d and \\. are not literals)", () => { + const r = buildApiQuery("/\\d+\\.\\d+/"); + expect(r.apiQuery).toBe(""); + expect(r.warn).toBeDefined(); + }); + + it("/foobar\\sxyz/ → foobar (\\s breaks the sequence, longer prefix wins)", () => { + const r = buildApiQuery("/foobar\\sxyz/"); + expect(r.apiQuery).toBe("foobar"); + }); +}); + describe("buildApiQuery — warn cases", () => { it("/[~^]?[0-9]+\\.[0-9]+/ → empty term + warn", () => { const r = buildApiQuery("/[~^]?[0-9]+\\.[0-9]+/"); diff --git a/src/regex.ts b/src/regex.ts index 20500a8..ee82cd9 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -230,11 +230,17 @@ function longestLiteralSequence(pattern: string): string { // Handle escape sequences. if (ch === "\\") { const next = pattern[i + 1] ?? ""; - // Only accumulate if the escaped char is itself a word character or hyphen. - if (/[a-zA-Z0-9_-]/.test(next)) { + // Only accumulate if the escaped char is a word character or hyphen + // AND is not a common regex escape or backreference (\b, \d, \s, \w, + // \p, \u, \x, \1–\9, …). Those are non-literal and must break the + // current sequence (e.g. \buseState\b → 'useState', not 'buseStateb'). + const isWordLike = /[a-zA-Z0-9_-]/.test(next); + const isSpecialEscape = /[bBdDsSwWpPuUxX0-9]/.test(next); + if (isWordLike && !isSpecialEscape) { current += next; } else { - // Escaped punctuation (\(, \), \., …) — not a useful search char → break. + // Escaped punctuation or special escape — not a useful literal search + // char → break the current sequence. if (current.length >= best.length) best = current; current = ""; } From 686a9caf1f6de5eb7eed158e735ec7490b1b11dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 21:18:54 +0100 Subject: [PATCH 08/13] Fix longestLiteralSequence: add control-char escapes to isSpecialEscape MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit \n, \r, \t, \f, \v, \a, \e were missing from the isSpecialEscape guard. Without this fix, /foo\nbar/ would accumulate 'n' and extract 'foonbar' instead of the correct 'foobar'. All named escape mnemonics (n, r, t, f, v, a, e) are now treated as sequence-breaking non-literals, consistent with the existing handling of \b, \d, \s, \w, backreferences, etc. Tests added: - /foobar\nbar/ → 'foobar' (\n breaks, not 'foobarnbar') - /foobar\tbaz/ → 'foobar' (\t breaks) --- src/regex.test.ts | 11 +++++++++++ src/regex.ts | 8 +++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/regex.test.ts b/src/regex.test.ts index 439c68b..05aee5f 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -194,6 +194,17 @@ describe("buildApiQuery — special escape handling in longestLiteralSequence", const r = buildApiQuery("/foobar\\sxyz/"); expect(r.apiQuery).toBe("foobar"); }); + + it("/foobar\\nbar/ → foobar (\\n is a control-character escape, not the letter 'n')", () => { + // Regression: \n must break the sequence, not accumulate 'n' → 'foobarnbar'. + const r = buildApiQuery("/foobar\\nbar/"); + expect(r.apiQuery).toBe("foobar"); + }); + + it("/foobar\\tbaz/ → foobar (\\t control escape breaks the sequence)", () => { + const r = buildApiQuery("/foobar\\tbaz/"); + expect(r.apiQuery).toBe("foobar"); + }); }); describe("buildApiQuery — warn cases", () => { diff --git a/src/regex.ts b/src/regex.ts index ee82cd9..9c68992 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -232,10 +232,12 @@ function longestLiteralSequence(pattern: string): string { const next = pattern[i + 1] ?? ""; // Only accumulate if the escaped char is a word character or hyphen // AND is not a common regex escape or backreference (\b, \d, \s, \w, - // \p, \u, \x, \1–\9, …). Those are non-literal and must break the - // current sequence (e.g. \buseState\b → 'useState', not 'buseStateb'). + // \p, \u, \x, \1–9, …) or control-character escape (\n, \r, \t, \f, + // \v, \a, \e). Those are non-literal and must break the current sequence + // (e.g. \buseState\b → 'useState', not 'buseStateb'; + // /foo\nbar/ → 'foobar' not 'foonbar'). const isWordLike = /[a-zA-Z0-9_-]/.test(next); - const isSpecialEscape = /[bBdDsSwWpPuUxX0-9]/.test(next); + const isSpecialEscape = /[bBdDsSwWpPuUxX0-9nrtfvae]/.test(next); if (isWordLike && !isSpecialEscape) { current += next; } else { From 158b9cdd9fc54292bf34b4019883429a1a3537cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 21:28:44 +0100 Subject: [PATCH 09/13] Address PR #113 round 7: preserve caller lastIndex; include error in warn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/aggregate.ts — aggregate(): - Capture regexFilter.lastIndex before filtering and restore it afterward (instead of hard-coding 0). This ensures aggregate() has no observable side effects on the caller's RegExp instance: a caller that set lastIndex intentionally before calling aggregate() will find it unchanged on return. src/aggregate.test.ts: - Updated test: pre-seed a non-zero lastIndex and assert it is restored to that exact value after aggregate() returns (not forced to 0). src/regex.ts — buildApiQuery(): - Capture the thrown Error and include err.message in the warn string. Callers now see the engine's own diagnostic (e.g. 'Invalid regular expression: /[/: Unterminated character class') instead of a generic 'Invalid regex pattern …' that loses the root cause. src/regex.test.ts: - Extended the invalid-regex test to assert the raw token (/[/) appears in the warn string for easy identification. --- src/aggregate.test.ts | 14 +++++++++----- src/aggregate.ts | 9 ++++++--- src/regex.test.ts | 6 +++++- src/regex.ts | 8 +++++--- 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/aggregate.test.ts b/src/aggregate.test.ts index 58f1dbd..a72aaf4 100644 --- a/src/aggregate.test.ts +++ b/src/aggregate.test.ts @@ -224,14 +224,18 @@ describe("aggregate — regexFilter", () => { expect(groups).toHaveLength(0); }); - it("restores lastIndex to 0 after filtering (does not leak state to caller)", () => { - // Regression: aggregate() must not leave a stale lastIndex on the RegExp - // instance so callers that reuse it get consistent results. + it("restores lastIndex to its pre-call value after filtering (does not clobber caller state)", () => { + // Regression: aggregate() must restore lastIndex to whatever the caller had + // set before the call — not necessarily 0. const matches: CodeMatch[] = [ makeMatchWithFragments("myorg/repoA", "src/a.ts", ["import axios from 'axios'"]), ]; - const regex = /axios/g; // global flag: lastIndex advances after a match + const regex = /axios/g; + // Caller has already run a match, so lastIndex is non-zero. + regex.test("import axios from 'axios'"); + const savedIndex = regex.lastIndex; + expect(savedIndex).toBeGreaterThan(0); aggregate(matches, new Set(), new Set(), false, regex); - expect(regex.lastIndex).toBe(0); + expect(regex.lastIndex).toBe(savedIndex); }); }); diff --git a/src/aggregate.ts b/src/aggregate.ts index 03e3904..bc94f14 100644 --- a/src/aggregate.ts +++ b/src/aggregate.ts @@ -43,15 +43,18 @@ export function aggregate( // Fix: when a regex filter is active, only keep matches where at least one // text_match fragment satisfies the pattern — see issue #111 if (regexFilter != null) { + // Preserve the caller's lastIndex: aggregate() must not have observable + // side-effects on the passed-in RegExp instance. + const savedLastIndex = regexFilter.lastIndex; const hasMatch = m.textMatches.some((tm) => { // Fix: reset lastIndex before each call — a global/sticky regex is // stateful and would produce false negatives on subsequent fragments. regexFilter.lastIndex = 0; return regexFilter.test(tm.fragment); }); - // Fix: restore lastIndex to 0 so callers that reuse the same RegExp - // instance don't observe a stale non-zero lastIndex after aggregate(). - regexFilter.lastIndex = 0; + // Restore the caller's original lastIndex (rather than hard-coding 0), + // so aggregate() doesn't have observable side effects on its inputs. + regexFilter.lastIndex = savedLastIndex; if (!hasMatch) continue; } const list = map.get(m.repoFullName) ?? []; diff --git a/src/regex.test.ts b/src/regex.test.ts index 05aee5f..e476fa6 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -215,10 +215,14 @@ describe("buildApiQuery — warn cases", () => { expect(r.warn).toBeDefined(); }); - it("/[/ (invalid regex) → empty term + warn + null filter", () => { + it("/[/ (invalid regex) → warn includes the engine error message", () => { const r = buildApiQuery("/[/"); expect(r.apiQuery).toBe(""); expect(r.regexFilter).toBeNull(); + // The warn message must include the engine-provided reason (not just a + // generic message), so callers can surface a precise debugging hint. expect(r.warn).toBeDefined(); + // The raw token should appear in warn for easy identification. + expect(r.warn).toContain("/[/"); }); }); diff --git a/src/regex.ts b/src/regex.ts index 9c68992..8b9c875 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -43,12 +43,14 @@ export function buildApiQuery(q: string): { let regexFilter: RegExp | null = null; try { regexFilter = new RegExp(pattern, safeFlags); - } catch { - // Fix: invalid regex pattern — warn and return empty query — see issue #111 + } catch (err) { + // Fix: invalid regex — warn with the engine's own error message so callers + // can surface a precise reason (e.g. invalid flags vs bad syntax). + const reason = err instanceof Error ? err.message : String(err); return { apiQuery: "", regexFilter: null, - warn: `Invalid regex pattern: /${pattern}/${flags}`, + warn: `Invalid regex /${pattern}/${flags}: ${reason}`, }; } From 0a3a77e9062e0c405ec39a838a101414b1d78dbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 21:41:56 +0100 Subject: [PATCH 10/13] Address PR #113 round 8: fix \a/\e escapes, document y-stripping, fix test title MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/regex.ts — longestLiteralSequence(): - Remove 'a' and 'e' from isSpecialEscape. In JS without u/v, \a and \e are identity escapes that simply match the literal characters 'a' and 'e', so they should be accumulated like any other word char, not break the sequence. Only true non-literal regex escapes remain (b, B, d, D, s, S, w, W, p, P, u, U, x, X, 0-9, n, r, t, f, v). src/regex.ts — buildApiQuery(): - Expand the comment explaining why both g and y are stripped: g: GitHub returns at most a few fragments, not all occurrences. y: sticky makes RegExp.test() stateful via lastIndex. Both are intentional; all other flags are kept unchanged. src/regex.test.ts: - Fix test title: was missing ']' after first character class ('/'['\"axios['\"]:/') → correct ('/['\"']axios['\"]:/'), now matching the actual query string under test. --- src/regex.test.ts | 2 +- src/regex.ts | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/regex.test.ts b/src/regex.test.ts index e476fa6..6be62e5 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -122,7 +122,7 @@ describe("buildApiQuery — partial alternation falls back to longest literal", }); describe("buildApiQuery — qualifier preservation", () => { - it("filename:package.json /['\"axios['\"]:/ → filename:package.json axios", () => { + it("filename:package.json /['\"]axios['\"]:/ → filename:package.json axios", () => { const r = buildApiQuery("filename:package.json /['\"]axios['\"]:/"); expect(r.apiQuery).toBe("filename:package.json axios"); expect(r.regexFilter).not.toBeNull(); diff --git a/src/regex.ts b/src/regex.ts index 8b9c875..8130366 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -36,8 +36,11 @@ export function buildApiQuery(q: string): { return { apiQuery: q, regexFilter: null }; } - // Compile the regex (strip stateful flags `g` and `y` — GitHub doesn't return - // all occurrences and `y` (sticky) makes RegExp.test() stateful via lastIndex). + // Compile the regex. Strip stateful flags: + // g (global) — GitHub returns at most a few fragments, not all occurrences. + // y (sticky) — makes RegExp.test() stateful via lastIndex, causing false + // negatives when the same instance is reused across fragments. + // Both are intentionally removed; all other flags (i, m, s, d, v, …) are kept. const { pattern, flags, raw } = token; const safeFlags = flags.replace(/[gy]/g, ""); let regexFilter: RegExp | null = null; @@ -234,12 +237,12 @@ function longestLiteralSequence(pattern: string): string { const next = pattern[i + 1] ?? ""; // Only accumulate if the escaped char is a word character or hyphen // AND is not a common regex escape or backreference (\b, \d, \s, \w, - // \p, \u, \x, \1–9, …) or control-character escape (\n, \r, \t, \f, - // \v, \a, \e). Those are non-literal and must break the current sequence - // (e.g. \buseState\b → 'useState', not 'buseStateb'; - // /foo\nbar/ → 'foobar' not 'foonbar'). + // \p, \u, \x, \1–9, …) or control-character escape (\n, \r, \t, \f, \v). + // Note: \a and \e are NOT in this list — in JS without u/v they are + // identity escapes that simply match the literal letter ('a' or 'e'), + // so they should be accumulated, not broken on. const isWordLike = /[a-zA-Z0-9_-]/.test(next); - const isSpecialEscape = /[bBdDsSwWpPuUxX0-9nrtfvae]/.test(next); + const isSpecialEscape = /[bBdDsSwWpPuUxX0-9nrtfv]/.test(next); if (isWordLike && !isSpecialEscape) { current += next; } else { From 0cde9990d88a83bf41013572ddba03d8d7e6bdf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Thu, 12 Mar 2026 21:48:52 +0100 Subject: [PATCH 11/13] Fix apiQuery reconstruction to preserve quoted phrases byte-for-byte MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous split(/\s+/) + join approach would break GitHub quoted phrases (e.g. "feature flag" /from.*axios/) by splitting 'feature flag' into two separate tokens and losing the quote grouping. Fix: use q.replace(raw, term).trim() to replace only the regex token in-place, leaving every other character in the original query unchanged. This preserves quoted phrases, extra whitespace handling, and all qualifiers byte-for-byte. Regression test added: '"feature flag" /from.*axios/' → '"feature flag" axios' --- src/regex.test.ts | 9 +++++++++ src/regex.ts | 13 ++++--------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/regex.test.ts b/src/regex.test.ts index 6be62e5..2c712fd 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -144,6 +144,15 @@ describe("buildApiQuery — qualifier preservation", () => { const r = buildApiQuery("path:src/ /useState/"); expect(r.apiQuery).toBe("path:src/ useState"); }); + + it("preserves quoted phrase alongside regex token", () => { + // Regression: split(/\s+/) would break quoted phrases like \"feature flag\"; + // the reconstruction must replace only the regex token, byte-for-byte. + const r = buildApiQuery('"feature flag" /from.*axios/'); + expect(r.apiQuery).toBe('"feature flag" axios'); + expect(r.regexFilter).not.toBeNull(); + expect(r.warn).toBeUndefined(); + }); }); describe("buildApiQuery — flags", () => { diff --git a/src/regex.ts b/src/regex.ts index 8130366..50b43a1 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -60,15 +60,10 @@ export function buildApiQuery(q: string): { // Derive the API search term from the regex pattern. const { term, warn } = extractApiTerm(pattern); - // Rebuild the API query by replacing the regex token with the derived term, - // preserving all other tokens (qualifiers and free-text terms alike). - const apiQuery = q - .trim() - .split(/\s+/) - .map((t) => (t === raw ? term : t)) - .filter((t) => t.length > 0) - .join(" ") - .trim(); + // Rebuild the API query by replacing the regex token in-place, preserving + // all other characters byte-for-byte — including quoted phrases (e.g. + // '"exact match" /pattern/') whose internal whitespace must not be split. + const apiQuery = q.replace(raw, term).trim(); return { apiQuery, regexFilter, warn }; } From a5b281511608254f82f3588bc7d6e20559ddfa56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Fri, 13 Mar 2026 10:08:50 +0100 Subject: [PATCH 12/13] Fix apiQuery splice: use match index instead of String.replace() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit String.replace(raw, term) replaces the first *substring* occurrence of raw, which can target the wrong position when the same text appears earlier in the query as a non-token prefix (e.g. '/useState/iSomething /useState/i' — the string '/useState/i' is a substring of the first token but the actual matched token is the second one). Fix: - Add `index: number` to RegexToken, storing the exact start position of the token within the original query (computed from m.index + leading-space offset). - Use slice-based splice in buildApiQuery(): q.slice(0, index) + term + q.slice(index + raw.length) This replaces only the boundary-validated token at its exact location, regardless of whether the same raw text appears elsewhere. Regression test added: '/useState/iSomething /useState/i' → '/useState/iSomething useState' --- src/regex.test.ts | 10 ++++++++++ src/regex.ts | 20 ++++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/regex.test.ts b/src/regex.test.ts index 2c712fd..35758f0 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -153,6 +153,16 @@ describe("buildApiQuery — qualifier preservation", () => { expect(r.regexFilter).not.toBeNull(); expect(r.warn).toBeUndefined(); }); + + it("replaces the matched token when the same raw text appears earlier as a prefix substring", () => { + // Regression: '/useState/i' is a substring of '/useState/iSomething' (not a + // valid token — fails boundary check). q.replace(raw, term) would wrongly + // replace the first occurrence inside the non-token prefix. The splice must + // target only the index-validated token. + const r = buildApiQuery("/useState/iSomething /useState/i"); + expect(r.apiQuery).toBe("/useState/iSomething useState"); + expect(r.regexFilter).not.toBeNull(); + }); }); describe("buildApiQuery — flags", () => { diff --git a/src/regex.ts b/src/regex.ts index 50b43a1..e839cd7 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -41,7 +41,7 @@ export function buildApiQuery(q: string): { // y (sticky) — makes RegExp.test() stateful via lastIndex, causing false // negatives when the same instance is reused across fragments. // Both are intentionally removed; all other flags (i, m, s, d, v, …) are kept. - const { pattern, flags, raw } = token; + const { pattern, flags, raw, index } = token; const safeFlags = flags.replace(/[gy]/g, ""); let regexFilter: RegExp | null = null; try { @@ -60,10 +60,12 @@ export function buildApiQuery(q: string): { // Derive the API search term from the regex pattern. const { term, warn } = extractApiTerm(pattern); - // Rebuild the API query by replacing the regex token in-place, preserving - // all other characters byte-for-byte — including quoted phrases (e.g. - // '"exact match" /pattern/') whose internal whitespace must not be split. - const apiQuery = q.replace(raw, term).trim(); + // Rebuild the API query by splicing the derived term at the exact byte + // position of the matched token. Using q.replace(raw, term) would replace the + // first *substring* occurrence of `raw`, which may appear earlier in the query + // as a non-token prefix (e.g. inside a longer word). Using the stored index + // guarantees we replace only the boundary-validated token that was matched. + const apiQuery = (q.slice(0, index) + term + q.slice(index + raw.length)).trim(); return { apiQuery, regexFilter, warn }; } @@ -77,6 +79,8 @@ interface RegexToken { pattern: string; /** Flags string (may be empty) */ flags: string; + /** Start index of `raw` within the original query string. */ + index: number; } /** @@ -97,10 +101,14 @@ function extractRegexToken(q: string): RegexToken | null { const m = q.match(/(?:^|\s)(\/(?:[^/\\\r\n]|\\.)+\/[gimsuydv]*)(?=$|\s)/); if (!m || !m[1]) return null; const raw = m[1].trim(); + // Compute the exact start position of the token within the original string. + // m.index is where the full match starts; the token (group 1) may be preceded + // by one whitespace character captured by (?:^|\s), hence the offset. + const tokenStart = m.index! + m[0].length - m[1].length; const lastSlash = raw.lastIndexOf("/"); const pattern = raw.slice(1, lastSlash); const flags = raw.slice(lastSlash + 1); - return { raw, pattern, flags }; + return { raw, pattern, flags, index: tokenStart }; } /** From 25de5dd2d9a40a27bfdefa0ff5d4efaa8e58a226 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20HOUZ=C3=89?= Date: Fri, 13 Mar 2026 10:59:38 +0100 Subject: [PATCH 13/13] Extend isSpecialEscape with \c and \k to prevent contamination --- src/regex.test.ts | 14 ++++++++++++++ src/regex.ts | 3 ++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/regex.test.ts b/src/regex.test.ts index 35758f0..8add72b 100644 --- a/src/regex.test.ts +++ b/src/regex.test.ts @@ -224,6 +224,20 @@ describe("buildApiQuery — special escape handling in longestLiteralSequence", const r = buildApiQuery("/foobar\\tbaz/"); expect(r.apiQuery).toBe("foobar"); }); + + it("/foobar\\cAbaz/ → foobar (\\cA is a control escape, 'c' must not be accumulated)", () => { + // Regression: \cA matches control-character 0x01, not the letter 'c'. + // The sequence must be broken at \c so 'foobar' is extracted, not 'foobarcAbaz'. + const r = buildApiQuery("/foobar\\cAbaz/"); + expect(r.apiQuery).toBe("foobar"); + }); + + it("/foobar\\kbaz/ → foobar (\\k is a named back-reference, 'k' must not be accumulated)", () => { + // Regression: \k is a named back-reference, not the letter 'k'. + // The sequence must be broken at \k so 'foobar' is extracted, not 'foobarknameabaz'. + const r = buildApiQuery("/foobar\\kbaz/"); + expect(r.apiQuery).toBe("foobar"); + }); }); describe("buildApiQuery — warn cases", () => { diff --git a/src/regex.ts b/src/regex.ts index e839cd7..d209c1e 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -244,8 +244,9 @@ function longestLiteralSequence(pattern: string): string { // Note: \a and \e are NOT in this list — in JS without u/v they are // identity escapes that simply match the literal letter ('a' or 'e'), // so they should be accumulated, not broken on. + // \c = control escape (\cA–\cZ), \k = named back-reference (\k). const isWordLike = /[a-zA-Z0-9_-]/.test(next); - const isSpecialEscape = /[bBdDsSwWpPuUxX0-9nrtfv]/.test(next); + const isSpecialEscape = /[bBdDsSwWpPuUxX0-9nrtfvck]/.test(next); if (isWordLike && !isSpecialEscape) { current += next; } else {