diff --git a/package-lock.json b/package-lock.json index 27f96de80..9bf48a917 100644 --- a/package-lock.json +++ b/package-lock.json @@ -45,6 +45,7 @@ "cross-env": "^10.1.0", "esbuild": "^0.27.3", "markdownlint-cli2": "^0.20.0", + "tsx": "^4.21.0", "typescript": "^5.7.2", "vite": "^6.0.7", "vite-plugin-pwa": "^1.2.0", @@ -8211,6 +8212,19 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/get-tsconfig": { + "version": "4.13.6", + "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.6.tgz", + "integrity": "sha512-shZT/QMiSHc/YBLxxOkMtgSid5HFoauqCE3/exfsEcwg1WkeqjG+V40yBbBrsD+jW2HDXcs28xOfcbm2jI8Ddw==", + "dev": true, + "license": "MIT", + "dependencies": { + "resolve-pkg-maps": "^1.0.0" + }, + "funding": { + "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" + } + }, "node_modules/github-from-package": { "version": "0.0.0", "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", @@ -11268,6 +11282,16 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/resolve-pkg-maps": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", + "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" + } + }, "node_modules/resolve-protobuf-schema": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/resolve-protobuf-schema/-/resolve-protobuf-schema-2.1.0.tgz", @@ -12330,6 +12354,26 @@ "license": "0BSD", "peer": true }, + "node_modules/tsx": { + "version": "4.21.0", + "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.21.0.tgz", + "integrity": "sha512-5C1sg4USs1lfG0GFb2RLXsdpXqBSEhAaA/0kPL01wxzpMqLILNxIxIOKiILz+cdg/pLnOUxFYOR5yhHU666wbw==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "~0.27.0", + "get-tsconfig": "^4.7.5" + }, + "bin": { + "tsx": "dist/cli.mjs" + }, + "engines": { + "node": ">=18.0.0" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + } + }, "node_modules/tunnel-agent": { "version": "0.6.0", "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", diff --git a/package.json b/package.json index 7b51b1a97..7e5287d94 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,7 @@ "test:e2e:finance": "cross-env VITE_VARIANT=finance playwright test", "test:e2e:runtime": "cross-env VITE_VARIANT=full playwright test e2e/runtime-fetch.spec.ts", "test:e2e": "npm run test:e2e:runtime && npm run test:e2e:full && npm run test:e2e:tech && npm run test:e2e:finance", - "test:data": "node --test tests/*.test.mjs", + "test:data": "tsx --test tests/*.test.mjs tests/*.test.mts", "test:feeds": "node scripts/validate-rss-feeds.mjs", "test:sidecar": "node --test src-tauri/sidecar/local-api-server.test.mjs api/_cors.test.mjs api/youtube/embed.test.mjs api/cyber-threats.test.mjs api/usni-fleet.test.mjs scripts/ais-relay-rss.test.cjs api/loaders-xml-wms-regression.test.mjs", "test:e2e:visual:full": "cross-env VITE_VARIANT=full playwright test -g \"matches golden screenshots per layer and zoom\"", @@ -64,6 +64,7 @@ "cross-env": "^10.1.0", "esbuild": "^0.27.3", "markdownlint-cli2": "^0.20.0", + "tsx": "^4.21.0", "typescript": "^5.7.2", "vite": "^6.0.7", "vite-plugin-pwa": "^1.2.0", diff --git a/src/app/data-loader.ts b/src/app/data-loader.ts index d61f273c3..4f32bea21 100644 --- a/src/app/data-loader.ts +++ b/src/app/data-loader.ts @@ -12,6 +12,7 @@ import { LAYER_TO_SOURCE, } from '@/config'; import { INTEL_HOTSPOTS, CONFLICT_ZONES } from '@/config/geo'; +import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match'; import { fetchCategoryFeeds, getFeedFailures, @@ -315,7 +316,7 @@ export class DataLoaderManager implements AppModule { } private findFlashLocation(title: string): { lat: number; lon: number } | null { - const titleLower = title.toLowerCase(); + const tokens = tokenizeForMatch(title); let bestMatch: { lat: number; lon: number; matches: number } | null = null; const countKeywordMatches = (keywords: string[] | undefined): number => { @@ -323,7 +324,7 @@ export class DataLoaderManager implements AppModule { let matches = 0; for (const keyword of keywords) { const cleaned = keyword.trim().toLowerCase(); - if (cleaned.length >= 3 && titleLower.includes(cleaned)) { + if (cleaned.length >= 3 && matchKeyword(tokens, cleaned)) { matches++; } } diff --git a/src/components/DeckGLMap.ts b/src/components/DeckGLMap.ts index 84fc5b727..0f2efd84e 100644 --- a/src/components/DeckGLMap.ts +++ b/src/components/DeckGLMap.ts @@ -44,6 +44,7 @@ import { ArcLayer } from '@deck.gl/layers'; import { HeatmapLayer } from '@deck.gl/aggregation-layers'; import type { WeatherAlert } from '@/services/weather'; import { escapeHtml } from '@/utils/sanitize'; +import { tokenizeForMatch, matchKeyword, matchesAnyKeyword, findMatchingKeywords } from '@/utils/keyword-match'; import { t } from '@/services/i18n'; import { debounce, rafSchedule, getCurrentTheme } from '@/utils/index'; import { @@ -3796,10 +3797,9 @@ export class DeckGLMap { const matchCounts = new Map(); recentNews.forEach(item => { + const tokens = tokenizeForMatch(item.title); this.hotspots.forEach(hotspot => { - if (hotspot.keywords.some(kw => - item.title.toLowerCase().includes(kw.toLowerCase()) - )) { + if (matchesAnyKeyword(tokens, hotspot.keywords)) { breakingKeywords.add(hotspot.id); matchCounts.set(hotspot.id, (matchCounts.get(hotspot.id) || 0) + 1); } @@ -3820,32 +3820,27 @@ export class DeckGLMap { /** Get news items related to a hotspot by keyword matching */ private getRelatedNews(hotspot: Hotspot): NewsItem[] { - // High-priority conflict keywords that indicate the news is really about another topic - const conflictTopics = ['gaza', 'ukraine', 'russia', 'israel', 'iran', 'china', 'taiwan', 'korea', 'syria']; + const conflictTopics = ['gaza', 'ukraine', 'ukrainian', 'russia', 'russian', 'israel', 'israeli', 'iran', 'iranian', 'china', 'chinese', 'taiwan', 'taiwanese', 'korea', 'korean', 'syria', 'syrian']; return this.news .map((item) => { - const titleLower = item.title.toLowerCase(); - const matchedKeywords = hotspot.keywords.filter((kw) => titleLower.includes(kw.toLowerCase())); + const tokens = tokenizeForMatch(item.title); + const matchedKeywords = findMatchingKeywords(tokens, hotspot.keywords); if (matchedKeywords.length === 0) return null; - // Check if this news mentions other hotspot conflict topics const conflictMatches = conflictTopics.filter(t => - titleLower.includes(t) && !hotspot.keywords.some(k => k.toLowerCase().includes(t)) + matchKeyword(tokens, t) && !hotspot.keywords.some(k => k.toLowerCase().includes(t)) ); - // If article mentions a major conflict topic that isn't this hotspot, deprioritize heavily if (conflictMatches.length > 0) { - // Only include if it ALSO has a strong local keyword (city name, agency) const strongLocalMatch = matchedKeywords.some(kw => kw.toLowerCase() === hotspot.name.toLowerCase() || - hotspot.agencies?.some(a => titleLower.includes(a.toLowerCase())) + hotspot.agencies?.some(a => matchKeyword(tokens, a)) ); if (!strongLocalMatch) return null; } - // Score: more keyword matches = more relevant const score = matchedKeywords.length; return { item, score }; }) diff --git a/src/components/Map.ts b/src/components/Map.ts index ca58838dc..032355eab 100644 --- a/src/components/Map.ts +++ b/src/components/Map.ts @@ -42,6 +42,7 @@ import { CENTRAL_BANKS, COMMODITY_HUBS, } from '@/config'; +import { tokenizeForMatch, matchKeyword, findMatchingKeywords } from '@/utils/keyword-match'; import { MapPopup } from './MapPopup'; import { updateHotspotEscalation, @@ -2740,32 +2741,27 @@ export class MapComponent { } private getRelatedNews(hotspot: Hotspot): NewsItem[] { - // High-priority conflict keywords that indicate the news is really about another topic - const conflictTopics = ['gaza', 'ukraine', 'russia', 'israel', 'iran', 'china', 'taiwan', 'korea', 'syria']; + const conflictTopics = ['gaza', 'ukraine', 'ukrainian', 'russia', 'russian', 'israel', 'israeli', 'iran', 'iranian', 'china', 'chinese', 'taiwan', 'taiwanese', 'korea', 'korean', 'syria', 'syrian']; return this.news .map((item) => { - const titleLower = item.title.toLowerCase(); - const matchedKeywords = hotspot.keywords.filter((kw) => titleLower.includes(kw.toLowerCase())); + const tokens = tokenizeForMatch(item.title); + const matchedKeywords = findMatchingKeywords(tokens, hotspot.keywords); if (matchedKeywords.length === 0) return null; - // Check if this news mentions other hotspot conflict topics const conflictMatches = conflictTopics.filter(t => - titleLower.includes(t) && !hotspot.keywords.some(k => k.toLowerCase().includes(t)) + matchKeyword(tokens, t) && !hotspot.keywords.some(k => k.toLowerCase().includes(t)) ); - // If article mentions a major conflict topic that isn't this hotspot, deprioritize heavily if (conflictMatches.length > 0) { - // Only include if it ALSO has a strong local keyword (city name, agency) const strongLocalMatch = matchedKeywords.some(kw => kw.toLowerCase() === hotspot.name.toLowerCase() || - hotspot.agencies?.some(a => titleLower.includes(a.toLowerCase())) + hotspot.agencies?.some(a => matchKeyword(tokens, a)) ); if (!strongLocalMatch) return null; } - // Score: more keyword matches = more relevant const score = matchedKeywords.length; return { item, score }; }) @@ -2784,8 +2780,8 @@ export class MapComponent { let matchedCount = 0; news.forEach((item) => { - const titleLower = item.title.toLowerCase(); - const matches = spot.keywords.filter((kw) => titleLower.includes(kw.toLowerCase())); + const tokens = tokenizeForMatch(item.title); + const matches = spot.keywords.filter((kw) => matchKeyword(tokens, kw)); if (matches.length > 0) { matchedCount++; diff --git a/src/config/geo.ts b/src/config/geo.ts index ed5dec3f1..b22a3f2cf 100644 --- a/src/config/geo.ts +++ b/src/config/geo.ts @@ -81,7 +81,7 @@ export const INTEL_HOTSPOTS: Hotspot[] = [ lat: 38.9, lon: -77.0, location: 'Washington D.C., USA', - keywords: ['pentagon', 'white house', 'congress', 'cia', 'nsa', 'washington', 'biden', 'trump', 'house', 'senate', 'supreme court', 'vance', 'elon', 'us '], + keywords: ['pentagon', 'white house', 'congress', 'cia', 'nsa', 'washington', 'biden', 'trump', 'senate', 'supreme court', 'vance', 'elon'], agencies: ['Pentagon', 'CIA', 'NSA', 'State Dept'], description: 'US government and military headquarters. Intelligence community center.', status: 'Monitoring', diff --git a/src/services/country-instability.ts b/src/services/country-instability.ts index c38e013d3..b8c986079 100644 --- a/src/services/country-instability.ts +++ b/src/services/country-instability.ts @@ -1,4 +1,5 @@ import type { SocialUnrestEvent, MilitaryFlight, MilitaryVessel, ClusteredEvent, InternetOutage } from '@/types'; +import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match'; import { INTEL_HOTSPOTS, CONFLICT_ZONES, STRATEGIC_WATERWAYS } from '@/config/geo'; import { CURATED_COUNTRIES, DEFAULT_BASELINE_RISK, DEFAULT_EVENT_MULTIPLIER, getHotspotCountries } from '@/config/countries'; import { focalPointDetector } from './focal-point-detector'; @@ -136,11 +137,11 @@ export function getPreviousScores(): Map { export type { CountryData }; function normalizeCountryName(name: string): string | null { - const lower = name.toLowerCase(); + const tokens = tokenizeForMatch(name); for (const [code, cfg] of Object.entries(CURATED_COUNTRIES)) { - if (cfg.scoringKeywords.some(kw => lower.includes(kw))) return code; + if (cfg.scoringKeywords.some(kw => matchKeyword(tokens, kw))) return code; } - return nameToCountryCode(lower); + return nameToCountryCode(name.toLowerCase()); } export function ingestProtestsForCII(events: SocialUnrestEvent[]): void { @@ -347,16 +348,16 @@ export function ingestMilitaryForCII(flights: MilitaryFlight[], vessels: Militar export function ingestNewsForCII(events: ClusteredEvent[]): void { for (const e of events) { - const title = e.primaryTitle.toLowerCase(); + const tokens = tokenizeForMatch(e.primaryTitle); const matched = new Set(); for (const [code, cfg] of Object.entries(CURATED_COUNTRIES)) { - if (cfg.scoringKeywords.some(kw => title.includes(kw))) { + if (cfg.scoringKeywords.some(kw => matchKeyword(tokens, kw))) { matched.add(code); } } - for (const code of matchCountryNamesInText(title)) { + for (const code of matchCountryNamesInText(e.primaryTitle.toLowerCase())) { matched.add(code); } diff --git a/src/services/geo-hub-index.ts b/src/services/geo-hub-index.ts index 80c1d75e1..ccfc99511 100644 --- a/src/services/geo-hub-index.ts +++ b/src/services/geo-hub-index.ts @@ -1,3 +1,4 @@ +import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match'; // Geopolitical Hub Index - aggregates news by strategic locations export interface GeoHubLocation { @@ -109,22 +110,13 @@ export interface GeoHubMatch { export function inferGeoHubsFromTitle(title: string): GeoHubMatch[] { const index = buildGeoHubIndex(); const matches: GeoHubMatch[] = []; - const titleLower = title.toLowerCase(); + const tokens = tokenizeForMatch(title); const seenHubs = new Set(); for (const [keyword, hubIds] of index.byKeyword) { if (keyword.length < 2) continue; - // Word boundary check for short keywords to avoid false positives - const regex = keyword.length < 5 - ? new RegExp(`\\b${keyword}\\b`, 'i') - : null; - - const found = regex - ? regex.test(titleLower) - : titleLower.includes(keyword); - - if (found) { + if (matchKeyword(tokens, keyword)) { for (const hubId of hubIds) { if (seenHubs.has(hubId)) continue; seenHubs.add(hubId); diff --git a/src/services/related-assets.ts b/src/services/related-assets.ts index f89e03930..b0da5edc6 100644 --- a/src/services/related-assets.ts +++ b/src/services/related-assets.ts @@ -1,4 +1,5 @@ import type { ClusteredEvent, RelatedAsset, AssetType, RelatedAssetContext } from '@/types'; +import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match'; import { t } from '@/services/i18n'; import { INTEL_HOTSPOTS, @@ -27,24 +28,20 @@ interface AssetOrigin { label: string; } -function toTitleLower(titles: string[]): string[] { - return titles.map(title => title.toLowerCase()); -} - function detectAssetTypes(titles: string[]): AssetType[] { - const normalized = toTitleLower(titles); + const tokenized = titles.map(t => tokenizeForMatch(t)); const types = Object.entries(ASSET_KEYWORDS) .filter(([, keywords]) => - normalized.some(title => keywords.some(keyword => title.includes(keyword))) + tokenized.some(tokens => keywords.some(keyword => matchKeyword(tokens, keyword))) ) .map(([type]) => type as AssetType); return types; } function countKeywordMatches(titles: string[], keywords: string[]): number { - const normalized = toTitleLower(titles); + const tokenized = titles.map(t => tokenizeForMatch(t)); return keywords.reduce((count, keyword) => { - return count + normalized.filter(title => title.includes(keyword)).length; + return count + tokenized.filter(tokens => matchKeyword(tokens, keyword)).length; }, 0); } diff --git a/src/services/story-data.ts b/src/services/story-data.ts index 666f6de14..236b7df30 100644 --- a/src/services/story-data.ts +++ b/src/services/story-data.ts @@ -2,6 +2,7 @@ import { calculateCII, type CountryScore } from './country-instability'; import type { ClusteredEvent } from '@/types'; import type { ThreatLevel } from './threat-classifier'; import { CURATED_COUNTRIES } from '@/config/countries'; +import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match'; export interface StoryData { countryCode: string; @@ -65,8 +66,8 @@ export function collectStoryData( const keywords = CURATED_COUNTRIES[countryCode]?.scoringKeywords || [countryName.toLowerCase()]; const countryNews = allNews.filter(e => { - const lower = e.primaryTitle.toLowerCase(); - return keywords.some(kw => lower.includes(kw)); + const tokens = tokenizeForMatch(e.primaryTitle); + return keywords.some(kw => matchKeyword(tokens, kw)); }); const sortedNews = [...countryNews].sort((a, b) => { @@ -82,8 +83,8 @@ export function collectStoryData( ) || null; const countryMarkets = predictionMarkets.filter(m => { - const lower = m.title.toLowerCase(); - return keywords.some(kw => lower.includes(kw)); + const mTokens = tokenizeForMatch(m.title); + return keywords.some(kw => matchKeyword(mTokens, kw)); }); const threatCounts = { critical: 0, high: 0, medium: 0, categories: new Set() }; diff --git a/src/services/tech-hub-index.ts b/src/services/tech-hub-index.ts index 58a285f50..4f636997b 100644 --- a/src/services/tech-hub-index.ts +++ b/src/services/tech-hub-index.ts @@ -1,6 +1,7 @@ import { STARTUP_ECOSYSTEMS } from '@/config/startup-ecosystems'; import { TECH_COMPANIES } from '@/config/tech-companies'; import { STARTUP_HUBS } from '@/config/tech-geo'; +import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match'; export interface TechHubLocation { id: string; @@ -211,14 +212,13 @@ export interface HubMatch { export function inferHubsFromTitle(title: string): HubMatch[] { const index = buildTechHubIndex(); const matches: HubMatch[] = []; - const titleLower = title.toLowerCase(); + const tokens = tokenizeForMatch(title); const seenHubs = new Set(); - // Check each keyword for (const [keyword, hubIds] of index.byKeyword) { - if (keyword.length < 3) continue; // Skip very short keywords + if (keyword.length < 3) continue; - if (titleLower.includes(keyword)) { + if (matchKeyword(tokens, keyword)) { for (const hubId of hubIds) { if (seenHubs.has(hubId)) continue; seenHubs.add(hubId); diff --git a/src/utils/keyword-match.ts b/src/utils/keyword-match.ts new file mode 100644 index 000000000..8b8e7bba5 --- /dev/null +++ b/src/utils/keyword-match.ts @@ -0,0 +1,80 @@ +export interface TokenizedTitle { + words: Set; + ordered: string[]; +} + +const INFLECTION_SUFFIXES = ['s', 'es', 'ian', 'ians', 'ean', 'eans', 'an', 'ans', 'n', 'ns', 'i', 'is', 'ish', 'ese']; +const MIN_SUFFIX_KEYWORD_LEN = 4; + +export function tokenizeForMatch(title: string): TokenizedTitle { + const lower = title.toLowerCase(); + const words = new Set(); + const ordered: string[] = []; + for (const raw of lower.split(/\s+/)) { + const cleaned = raw.replace(/^[^a-z0-9]+|[^a-z0-9]+$/g, ''); + if (!cleaned) continue; + words.add(cleaned); + ordered.push(cleaned); + for (const part of cleaned.split(/[^a-z0-9]+/)) { + if (part) words.add(part); + } + } + return { words, ordered }; +} + +function hasSuffix(word: string, keyword: string): boolean { + if (word.length <= keyword.length) return false; + if (word.startsWith(keyword)) { + const suffix = word.slice(keyword.length); + if (INFLECTION_SUFFIXES.includes(suffix)) return true; + } + if (keyword.endsWith('e')) { + const stem = keyword.slice(0, -1); + if (word.length > stem.length && word.startsWith(stem)) { + const suffix = word.slice(stem.length); + if (INFLECTION_SUFFIXES.includes(suffix)) return true; + } + } + return false; +} + +function wordMatches(token: string, kwPart: string): boolean { + if (token === kwPart) return true; + if (kwPart.length >= MIN_SUFFIX_KEYWORD_LEN) return hasSuffix(token, kwPart); + return false; +} + +function matchSingleWord(words: Set, keyword: string): boolean { + if (words.has(keyword)) return true; + if (keyword.length < MIN_SUFFIX_KEYWORD_LEN) return false; + for (const word of words) { + if (hasSuffix(word, keyword)) return true; + } + return false; +} + +export function matchKeyword(tokens: TokenizedTitle, keyword: string): boolean { + const parts = keyword.toLowerCase().split(/\s+/).filter((w): w is string => w.length > 0); + if (parts.length === 0) return false; + if (parts.length === 1) return matchSingleWord(tokens.words, parts[0]!); + const { ordered } = tokens; + for (let i = 0; i <= ordered.length - parts.length; i++) { + let match = true; + for (let j = 0; j < parts.length; j++) { + if (!wordMatches(ordered[i + j]!, parts[j]!)) { match = false; break; } + } + if (match) return true; + } + return false; +} + +export function matchesAnyKeyword(tokens: TokenizedTitle, keywords: string[]): boolean { + for (const kw of keywords) { + if (matchKeyword(tokens, kw)) return true; + } + return false; +} + +export function findMatchingKeywords(tokens: TokenizedTitle, keywords: string[]): string[] { + return keywords.filter(kw => matchKeyword(tokens, kw)); +} diff --git a/tests/geo-keyword-matching.test.mts b/tests/geo-keyword-matching.test.mts new file mode 100644 index 000000000..2b064daf7 --- /dev/null +++ b/tests/geo-keyword-matching.test.mts @@ -0,0 +1,393 @@ +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { tokenizeForMatch, matchKeyword, matchesAnyKeyword, findMatchingKeywords } from '../src/utils/keyword-match.ts'; + +// --- Tokenizer tests --- + +describe('tokenizeForMatch', () => { + it('splits on whitespace and lowercases', () => { + const t = tokenizeForMatch('Assad Forces Advance'); + assert.ok(t.words.has('assad')); + assert.ok(t.words.has('forces')); + assert.ok(t.words.has('advance')); + assert.deepStrictEqual(t.ordered, ['assad', 'forces', 'advance']); + }); + + it('strips leading/trailing punctuation', () => { + const t = tokenizeForMatch('"Syria!" (conflict)'); + assert.ok(t.words.has('syria')); + assert.ok(t.words.has('conflict')); + assert.ok(!t.words.has('"syria!"')); + }); + + it('decomposes possessives', () => { + const t = tokenizeForMatch("Assad's forces"); + assert.ok(t.words.has("assad's")); + assert.ok(t.words.has('assad')); + assert.ok(t.words.has('s')); + assert.ok(t.words.has('forces')); + }); + + it('decomposes hyphenated words', () => { + const t = tokenizeForMatch('al-Sham fighters'); + assert.ok(t.words.has('al-sham')); + assert.ok(t.words.has('al')); + assert.ok(t.words.has('sham')); + }); + + it('handles empty input', () => { + const t = tokenizeForMatch(''); + assert.strictEqual(t.words.size, 0); + assert.strictEqual(t.ordered.length, 0); + }); + + it('handles punctuation-only tokens', () => { + const t = tokenizeForMatch('--- *** !!!'); + assert.strictEqual(t.words.size, 0); + assert.strictEqual(t.ordered.length, 0); + }); +}); + +// --- False positive prevention --- + +describe('false positive prevention', () => { + it('"ambassador" does NOT match "assad"', () => { + const t = tokenizeForMatch('French Ambassador outlines new strategy'); + assert.ok(!matchKeyword(t, 'assad')); + }); + + it('"rights" does NOT match "hts"', () => { + const t = tokenizeForMatch('Human rights groups condemn violence'); + assert.ok(!matchKeyword(t, 'hts')); + }); + + it('"Ukrainian" does NOT match "iran"', () => { + const t = tokenizeForMatch('Ukrainian forces push forward'); + assert.ok(!matchKeyword(t, 'iran')); + }); + + it('"focus" does NOT match "us"', () => { + const t = tokenizeForMatch('Leaders focus on economy'); + assert.ok(!matchKeyword(t, 'us')); + }); + + it('"housing" does NOT match "house"', () => { + const t = tokenizeForMatch('Housing prices rise sharply'); + assert.ok(!matchKeyword(t, 'house')); + }); + + it('"warehouse" does NOT match "house"', () => { + const t = tokenizeForMatch('Amazon warehouse workers strike'); + assert.ok(!matchKeyword(t, 'house')); + }); + + it('"discuss" does NOT match "us"', () => { + const t = tokenizeForMatch('Leaders discuss trade policy'); + assert.ok(!matchKeyword(t, 'us')); + }); + + it('"bushfire" does NOT match "us"', () => { + const t = tokenizeForMatch('Bushfire threatens suburbs'); + assert.ok(!matchKeyword(t, 'us')); + }); + + it('"Thailand" does NOT match "ai"', () => { + const t = tokenizeForMatch('Thailand exports surge'); + assert.ok(!matchKeyword(t, 'ai')); + }); +}); + +// --- True positive preservation --- + +describe('true positive preservation', () => { + it('"Assad regime forces" matches "assad"', () => { + const t = tokenizeForMatch('Assad regime forces advance in Idlib'); + assert.ok(matchKeyword(t, 'assad')); + }); + + it('"HTS forces advance" matches "hts"', () => { + const t = tokenizeForMatch('HTS forces advance in northern Syria'); + assert.ok(matchKeyword(t, 'hts')); + }); + + it('"Iran sanctions" matches "iran"', () => { + const t = tokenizeForMatch('Iran sanctions lifted after talks'); + assert.ok(matchKeyword(t, 'iran')); + }); + + it('"US announces" matches "us"', () => { + const t = tokenizeForMatch('US announces new trade deal'); + assert.ok(matchKeyword(t, 'us')); + }); + + it('"The House voted" matches "house"', () => { + const t = tokenizeForMatch('The House voted on the bill'); + assert.ok(matchKeyword(t, 'house')); + }); +}); + +// --- Possessives --- + +describe('possessive matching', () => { + it('"Assad\'s forces" matches "assad"', () => { + const t = tokenizeForMatch("Assad's forces advance"); + assert.ok(matchKeyword(t, 'assad')); + }); + + it('"Iran\'s nuclear program" matches "iran"', () => { + const t = tokenizeForMatch("Iran's nuclear program concerns grow"); + assert.ok(matchKeyword(t, 'iran')); + }); + + it('"Putin\'s war" matches "putin"', () => { + const t = tokenizeForMatch("Putin's war strategy shifts"); + assert.ok(matchKeyword(t, 'putin')); + }); + + it('"China\'s economy" matches "china"', () => { + const t = tokenizeForMatch("China's economy slows further"); + assert.ok(matchKeyword(t, 'china')); + }); +}); + +// --- Inflection / suffix matching (plurals, demonyms) --- + +describe('inflection suffix matching', () => { + it('"houthis" matches keyword "houthi" (plural -s)', () => { + const t = tokenizeForMatch('Houthis attack Red Sea shipping'); + assert.ok(matchKeyword(t, 'houthi')); + }); + + it('"missiles" matches keyword "missile" (plural -s)', () => { + const t = tokenizeForMatch('Missiles launched from Yemen'); + assert.ok(matchKeyword(t, 'missile')); + }); + + it('"drones" matches keyword "drone" (plural -s)', () => { + const t = tokenizeForMatch('Drones spotted over base'); + assert.ok(matchKeyword(t, 'drone')); + }); + + it('"Ukrainian" matches keyword "ukraine" (demonym -ian)', () => { + const t = tokenizeForMatch('Ukrainian forces push forward'); + assert.ok(matchKeyword(t, 'ukraine')); + }); + + it('"Iranian" matches keyword "iran" (demonym -ian)', () => { + const t = tokenizeForMatch('Iranian senate debates sanctions'); + assert.ok(matchKeyword(t, 'iran')); + }); + + it('"Israeli" matches keyword "israel" (demonym -i)', () => { + const t = tokenizeForMatch('Israeli military conducts operation'); + assert.ok(matchKeyword(t, 'israel')); + }); + + it('"Russian" matches keyword "russia" (demonym -n)', () => { + const t = tokenizeForMatch('Russian forces advance'); + assert.ok(matchKeyword(t, 'russia')); + }); + + it('"Taiwanese" matches keyword "taiwan" (demonym -ese)', () => { + const t = tokenizeForMatch('Taiwanese military drills begin'); + assert.ok(matchKeyword(t, 'taiwan')); + }); + + it('suffix matching does NOT cause false positives for unrelated words', () => { + const t = tokenizeForMatch('The situation worsens dramatically'); + assert.ok(!matchKeyword(t, 'situ')); + assert.ok(!matchKeyword(t, 'drama')); + }); + + it('"Iranians" matches keyword "iran" (plural demonym -ians)', () => { + assert.ok(matchKeyword(tokenizeForMatch('Iranians protest in Tehran'), 'iran')); + }); + + it('"Ukrainians" matches keyword "ukraine" (plural demonym -ians with e-drop)', () => { + assert.ok(matchKeyword(tokenizeForMatch('Ukrainians seek aid'), 'ukraine')); + }); + + it('"Russians" matches keyword "russia" (plural demonym -ns)', () => { + assert.ok(matchKeyword(tokenizeForMatch('Russians advance on front'), 'russia')); + }); + + it('"Israelis" matches keyword "israel" (plural demonym -is)', () => { + assert.ok(matchKeyword(tokenizeForMatch('Israelis evacuate border towns'), 'israel')); + }); + + it('short keywords (<4 chars) do NOT suffix-match', () => { + assert.ok(!matchKeyword(tokenizeForMatch('AIS signals disrupted'), 'ai')); + assert.ok(!matchKeyword(tokenizeForMatch('Russia uses drones'), 'us')); + assert.ok(!matchKeyword(tokenizeForMatch('The bus arrived'), 'bu')); + }); + + it('short keywords still exact-match', () => { + assert.ok(matchKeyword(tokenizeForMatch('AI revolution continues'), 'ai')); + assert.ok(matchKeyword(tokenizeForMatch('US announces deal'), 'us')); + assert.ok(matchKeyword(tokenizeForMatch('HTS forces advance'), 'hts')); + }); +}); + +// --- Multi-word phrases --- + +describe('multi-word phrase matching', () => { + it('"White House announces" matches "white house"', () => { + const t = tokenizeForMatch('White House announces new policy'); + assert.ok(matchKeyword(t, 'white house')); + }); + + it('"The house is painted white" does NOT match "white house"', () => { + const t = tokenizeForMatch('The house is painted white'); + assert.ok(!matchKeyword(t, 'white house')); + }); + + it('"supreme court" matches multi-word', () => { + const t = tokenizeForMatch('Supreme Court rules on case'); + assert.ok(matchKeyword(t, 'supreme court')); + }); + + it('"silicon valley" matches multi-word', () => { + const t = tokenizeForMatch('Silicon Valley startups surge'); + assert.ok(matchKeyword(t, 'silicon valley')); + }); + + it('"South Korean" matches "south korea" (multi-word demonym)', () => { + const t = tokenizeForMatch('South Korean military drills continue'); + assert.ok(matchKeyword(t, 'south korea')); + }); + + it('"North Korean" matches "north korea" (multi-word demonym)', () => { + const t = tokenizeForMatch('North Korean missile launch detected'); + assert.ok(matchKeyword(t, 'north korea')); + }); + + it('"South Koreans" matches "south korea" (multi-word plural demonym)', () => { + const t = tokenizeForMatch('South Koreans vote in election'); + assert.ok(matchKeyword(t, 'south korea')); + }); + + it('"tech layoffs" matches multi-word', () => { + const t = tokenizeForMatch('Tech layoffs hit record numbers'); + assert.ok(matchKeyword(t, 'tech layoffs')); + }); +}); + +// --- DC keywords cleanup --- + +describe('DC keywords (cleaned)', () => { + const dcKeywords = ['pentagon', 'white house', 'congress', 'cia', 'nsa', 'washington', 'biden', 'trump', 'senate', 'supreme court', 'vance', 'elon']; + + it('does NOT contain "house" as standalone keyword', () => { + assert.ok(!dcKeywords.includes('house')); + }); + + it('does NOT contain "us " trailing-space hack', () => { + assert.ok(!dcKeywords.includes('us ')); + }); + + it('"Housing market crashes" does NOT match any DC keyword', () => { + const t = tokenizeForMatch('Housing market crashes nationwide'); + assert.ok(!matchesAnyKeyword(t, dcKeywords)); + }); + + it('"White House announces budget" DOES match DC', () => { + const t = tokenizeForMatch('White House announces budget cuts'); + assert.ok(matchesAnyKeyword(t, dcKeywords)); + }); + + it('"Congress passes bill" DOES match DC', () => { + const t = tokenizeForMatch('Congress passes new spending bill'); + assert.ok(matchesAnyKeyword(t, dcKeywords)); + }); +}); + +// --- Integration: hub matching end-to-end --- + +describe('integration: hub keyword matching', () => { + const damascusKeywords = ['syria', 'damascus', 'assad', 'syrian', 'hts']; + + it('matches Damascus for Syrian conflict news', () => { + const t = tokenizeForMatch("Assad's forces clash with HTS near Damascus"); + const matched = findMatchingKeywords(t, damascusKeywords); + assert.ok(matched.length >= 2); + assert.ok(matched.includes('assad')); + assert.ok(matched.includes('hts')); + assert.ok(matched.includes('damascus')); + }); + + it('does NOT match Damascus for "ambassador rights" headline', () => { + const t = tokenizeForMatch('French Ambassador discusses human rights in Geneva'); + const matched = findMatchingKeywords(t, damascusKeywords); + assert.strictEqual(matched.length, 0); + }); + + it('matches Damascus for "Syrian" as standalone word', () => { + const t = tokenizeForMatch('Syrian refugees seek asylum'); + const matched = findMatchingKeywords(t, damascusKeywords); + assert.ok(matched.includes('syrian')); + }); + + it('matches conflict zone keywords with plural forms', () => { + const redSeaKeywords = ['houthi', 'red sea', 'yemen', 'missile', 'drone', 'ship']; + const t = tokenizeForMatch('Houthis launch missiles at ships in Red Sea'); + const matched = findMatchingKeywords(t, redSeaKeywords); + assert.ok(matched.includes('houthi')); + assert.ok(matched.includes('missile')); + assert.ok(matched.includes('ship')); + assert.ok(matched.includes('red sea')); + }); +}); + +// --- matchesAnyKeyword --- + +describe('matchesAnyKeyword', () => { + it('returns true when any keyword matches', () => { + const t = tokenizeForMatch('Pentagon releases new report'); + assert.ok(matchesAnyKeyword(t, ['pentagon', 'white house'])); + }); + + it('returns false when no keyword matches', () => { + const t = tokenizeForMatch('Local farmer wins award'); + assert.ok(!matchesAnyKeyword(t, ['pentagon', 'white house'])); + }); +}); + +// --- findMatchingKeywords --- + +describe('findMatchingKeywords', () => { + it('returns all matching keywords', () => { + const t = tokenizeForMatch('Trump meets with CIA director at Pentagon'); + const matched = findMatchingKeywords(t, ['trump', 'cia', 'pentagon', 'nsa']); + assert.deepStrictEqual(matched.sort(), ['cia', 'pentagon', 'trump']); + }); + + it('returns empty array when nothing matches', () => { + const t = tokenizeForMatch('Weather forecast looks sunny'); + const matched = findMatchingKeywords(t, ['trump', 'cia', 'pentagon']); + assert.strictEqual(matched.length, 0); + }); +}); + +// --- Edge cases --- + +describe('edge cases', () => { + it('empty keyword returns false', () => { + const t = tokenizeForMatch('Some title'); + assert.ok(!matchKeyword(t, '')); + assert.ok(!matchKeyword(t, ' ')); + }); + + it('numbers in tokens work', () => { + const t = tokenizeForMatch('F-35 crashes in test flight'); + assert.ok(t.words.has('f-35')); + assert.ok(t.words.has('35')); + assert.ok(t.words.has('f')); + }); + + it('case insensitive matching', () => { + const t = tokenizeForMatch('IRAN LAUNCHES MISSILE'); + assert.ok(matchKeyword(t, 'iran')); + assert.ok(matchKeyword(t, 'IRAN')); + assert.ok(matchKeyword(t, 'Iran')); + }); +});