From 9ecb10091077e8190aaee3ce00cddbb5ecfa348b Mon Sep 17 00:00:00 2001 From: Will Haynes Date: Fri, 20 Feb 2026 01:08:41 -0600 Subject: [PATCH 1/3] codex(task): phase1 ingest transcript canonicalization v1 --- codex/tasks/latest.json | 68 +++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/codex/tasks/latest.json b/codex/tasks/latest.json index 3a88eb3a..58bdcf55 100644 --- a/codex/tasks/latest.json +++ b/codex/tasks/latest.json @@ -1,45 +1,55 @@ { - "task_id": "phase0-spine-lockdown-2026-02-19", - "title": "Phase 0 Spine Lockdown: freeze contract vocab, kill ambiguous receipt offsets, harden emission + narrative policy", + "task_id": "phase1_ingest_canonicalization_2026_02_20", + "title": "Phase 1: Canonicalize transcript at ingest + stable transcriptHash", + "summary": "Implement ingest-time transcript canonicalization (NFKC + punctuation folding + line-ending normalization + BOM/null stripping) with versioning. Store rawTranscript + canonicalTranscript + transcriptHash + canonicalizationVersion on Entry for all write paths (upload route + GraphQL addEntry/updateEntry + any other transcript writers). Add deterministic tests for the canonicalization corpus. Do not bulk-migrate existing entries; freeze legacy entries at canonicalizationVersion=0/null and only apply v1 on new/updated transcripts going forward.", "base_branch": "develop", - "branch_name": "codex/phase0-spine-lockdown-exec-2026-02-19", - "summary": "Seal the Meaning Spine by freezing contract reason codes, enforcing unique-match offset inference (ambiguity=poison), hardening validateReceipt (strict V1 never falls through), ensuring ENTRY_ANALYZED emits contract+sanitized cards only (no raw reflection text), and locking narrative toggle behind a shared policy utility that callers cannot override. Add/adjust regression tests to prevent drift.", + "branch_name": "codex/phase1-ingest-canonicalization-exec-2026-02-20", "repo_scope": [ - "codex/tasks/latest.json", + "server/models/Entry.js", + "server/routes/upload.js", + "server/graphql/resolvers/index.js", + "server/src/workers/scribe.worker.js", "server/src/workers/reflection.worker.js", - "server/src/utils/truthValidator.js", "server/src/utils/**", - "server/src/workers/__tests__/**", + "server/utils/**", + "server/models/__tests__/**", + "server/src/**/__tests__/**", "server/tests/**", - "docs/testing-doctrine.md" + "scripts/codex_preflight.mjs", + "codex/tasks/latest.json" ], - "agents_involved": ["codex_web"], - "risk_level": "low", + "agents_involved": [ + "codex-web" + ], + "risk_level": "medium", "tests_to_run": [ + "node -e \"JSON.parse(require('fs').readFileSync('codex/tasks/latest.json','utf8')); console.log('latest.json ok')\"", "node scripts/codex_preflight.mjs --ci", "pnpm -C server test" ], "constraints": [ - "CODEX_WEB: Do NOT run git network commands (no git fetch/pull/push/clone). Use the UI “Create PR” button if a PR is needed.", - "CODEX_WEB_HEAD: In Codex Web, the checked-out branch name may be 'work'. Do NOT treat HEAD name mismatch as stale. Locks+canary are the source of truth.", - "ANTI-COP-OUT: No diff => no PR. If no actionable work exists, stop and report evidence.", - "SCOPE: Do not modify files outside repo_scope. If out-of-scope issues are found, produce a Repair Manifest instead of changing them.", - "ALIGNMENT: Print task_id/base_branch/branch_name/canary from latest.json before doing any work.", - "EVIDENCE_BUNDLE: Provide evidence in 4 phases: Alignment, Work-Exists Gate, Change Proof, Tests.", - "PR_BASE: Ensure PR base branch is develop (not another codex/* branch). Do not create draft PRs.", - "NO_PLACEHOLDERS: Do not create empty directories or placeholder files. Only create files with real content and tests.", - "NO_NETWORK: Tests must not touch real external network services." + "Codex Web environment: do NOT run git push; use the Create PR button.", + "Do NOT create placeholder files or empty directories. If no diff is needed, stop and report; do not create a PR.", + "All changes must remain within repo_scope. If a necessary fix is out-of-scope, produce a Repair Manifest instead of changing it.", + "Canonicalization happens at ingest/write time only (identity). Do not re-canonicalize during validation except legacy v0 fallback.", + "Do NOT bulk-migrate existing stored transcripts. Implement freeze+version: legacy entries are v0/null; new writes become v1.", + "Hashing must be based on canonicalTranscript and must NOT use locale-sensitive casefolding (no toLowerCase/toUpperCase on hash inputs).", + "No raw user transcript content may be logged or emitted into events as part of this change." ], "acceptance_checks": [ - "Alignment Evidence: show codex/tasks/latest.json values for task_id, base_branch, branch_name, and canary.", - "Alignment Evidence: print `git rev-parse --abbrev-ref HEAD` and `git rev-parse HEAD` for evidence; do NOT stop on SHA mismatch.", - "Work-Exists Gate: prove target symbols exist via grep or file navigation; if not found, stop and report: findReceiptOffsets (or equivalent), emitEntryAnalyzed callsite/payload, sanitizeBloomCardsWithContract boundary, validateReceipt in server/src/utils/truthValidator.js (or its imported helpers).", - "Freeze contract reason codes: add a shared constants module and replace raw string comparisons/assignments in Meaning Spine paths touched by this task.", - "Unique Match Rule: any transcript-search offset inference must return null on ambiguous multi-occurrence matches (firstIndex !== lastIndex). Ambiguity must drop the receipt/card safely and be reflected in contract/dropped reasons.", - "validateReceipt hardening: strict V1 path must not fall through to weaker matching if offsets fail; invalid shapes return explicit failure reasons and do not throw.", - "Emission hardening: ENTRY_ANALYZED payload must contain sanitized cards AND the Meaning Contract ledger; payload must not include raw reflection text anywhere.", - "Tests: add/adjust regression tests that fail if raw model output leaks into emission serialization; add/adjust tests verifying ambiguous quote matches are dropped.", - "Proof: include git status -sb and git diff --stat after changes; run tests_to_run and report results. (Run `pnpm -w test` locally after PR if desired.)" + "Alignment Evidence: print task_id, base_branch, branch_name, repo_scope, tests_to_run at start of run.", + "Work-Exists Gate: identify all transcript write paths (upload.js, GraphQL addEntry/updateEntry, scribe worker transcript persistence) and show exact files/lines to be changed.", + "Implement a single ingest canonicalization function (v1) using NFKC + punctuation folding + newline normalization + BOM/null stripping + trim; store canonicalizationVersion='1'.", + "Entry stores rawTranscript (untouched) and canonicalTranscript (canonicalized). transcriptHash is sha256(canonicalTranscript).", + "All transcript-writing paths set/update canonical fields consistently when transcript changes.", + "Add/extend deterministic tests covering: smart quotes folding, dash folding, ellipsis folding, CRLF/CR normalization, BOM/null stripping, and idempotency (canon(canon(x))==canon(x)).", + "Run tests_to_run and show outputs. If any test is skipped, explain why and provide a safe alternative.", + "Change Proof: show git status -sb and git diff --stat at end. No diff => no PR." ], - "canary": "CANARY_PHASE0_SPINE_LOCKDOWN_2026_02_19" + "locks": { + "task_id": "phase1_ingest_canonicalization_2026_02_20", + "base_branch": "develop", + "branch_name": "codex/phase1-ingest-canonicalization-exec-2026-02-20", + "canary": "PHASE1_INGEST_CANON_V1_CANARY_2026_02_20" + } } From f5c9c9a8888e084e3e38e8a420b8de6ceaaa903d Mon Sep 17 00:00:00 2001 From: William Leland Haynes <142263841+wileland@users.noreply.github.com> Date: Fri, 20 Feb 2026 06:38:03 -0600 Subject: [PATCH 2/3] feat(server): canonicalize transcripts at ingest v1 --- server/__tests__/upload.demo.test.js | 7 ++ server/graphql/resolvers/index.js | 29 +++++--- server/models/Entry.js | 70 ++++++++++++++++++- server/models/__tests__/Entry.test.js | 33 ++++++++- server/routes/__tests__/upload.test.js | 7 ++ server/routes/upload.js | 6 +- .../__tests__/reflection.worker.test.ts | 4 +- .../src/workers/__tests__/scribe.flow.test.js | 7 ++ .../workers/__tests__/scribe.worker.test.ts | 7 ++ server/src/workers/reflection.worker.js | 28 +++----- server/src/workers/scribe.worker.js | 6 +- server/tests/receipt.v1.test.js | 2 + 12 files changed, 171 insertions(+), 35 deletions(-) diff --git a/server/__tests__/upload.demo.test.js b/server/__tests__/upload.demo.test.js index c6baf0bb..5ca2194f 100644 --- a/server/__tests__/upload.demo.test.js +++ b/server/__tests__/upload.demo.test.js @@ -14,6 +14,13 @@ vi.mock('../models/Entry.js', () => ({ _id: 'demo-entry-id', })), }, + buildCanonicalTranscriptPayload: (value) => ({ + transcript: typeof value === 'string' ? value : '', + rawTranscript: typeof value === 'string' ? value : '', + canonicalTranscript: typeof value === 'string' ? value : '', + transcriptHash: 'mock-hash', + canonicalizationVersion: '1', + }), })); vi.mock('../src/orchestration/agentOrchestration.js', () => ({ diff --git a/server/graphql/resolvers/index.js b/server/graphql/resolvers/index.js index 19ac4a4c..7d216322 100644 --- a/server/graphql/resolvers/index.js +++ b/server/graphql/resolvers/index.js @@ -2,7 +2,7 @@ import jwt from 'jsonwebtoken'; -import Entry from '../../models/Entry.js'; +import Entry, { buildCanonicalTranscriptPayload } from '../../models/Entry.js'; import TagModel from '../../models/Tag.js'; import DuelSession from '../../models/DuelSession.js'; import User from '../../models/User.js'; @@ -473,7 +473,6 @@ const resolvers = { const ModelToUse = ContextEntry || Entry; - const aiTags = await inferTagsForEntry(transcript); const processedTags = (tags || []) .map((t) => { @@ -483,12 +482,6 @@ const resolvers = { }) .filter(Boolean); - for (const t of aiTags) { - if (!processedTags.find((pt) => pt.label === t)) { - processedTags.push({ label: t, source: 'ai' }); - } - } - const rawAudioUrl = typeof audioUrl === 'string' ? audioUrl.trim() : ''; const hasAudio = rawAudioUrl !== ''; const duration = Number.isFinite(audioDurationSeconds) ? audioDurationSeconds : null; @@ -503,10 +496,19 @@ const resolvers = { const shouldPersistUrl = hasAudio && !hasS3Bucket && !isPresignedUrl(rawAudioUrl); + const transcriptPayload = buildCanonicalTranscriptPayload(transcript); + const aiTags = await inferTagsForEntry(transcriptPayload.transcript); + + for (const t of aiTags) { + if (!processedTags.find((pt) => pt.label === t)) { + processedTags.push({ label: t, source: 'ai' }); + } + } + const entry = await ModelToUse.create({ userId, title, - transcript, + ...transcriptPayload, // Persist audioUrl ONLY when it is stable (local path or non-S3 remote without presign). audioUrl: shouldPersistUrl ? rawAudioUrl : undefined, @@ -562,7 +564,14 @@ const resolvers = { entry.version += 1; if (title !== undefined) entry.title = title; - if (transcript !== undefined) entry.transcript = transcript; + if (transcript !== undefined) { + const transcriptPayload = buildCanonicalTranscriptPayload(transcript); + entry.transcript = transcriptPayload.transcript; + entry.rawTranscript = transcriptPayload.rawTranscript; + entry.canonicalTranscript = transcriptPayload.canonicalTranscript; + entry.transcriptHash = transcriptPayload.transcriptHash; + entry.canonicalizationVersion = transcriptPayload.canonicalizationVersion; + } if (tags !== undefined) { entry.tags = tags diff --git a/server/models/Entry.js b/server/models/Entry.js index 1db01c73..bb004d72 100644 --- a/server/models/Entry.js +++ b/server/models/Entry.js @@ -1,6 +1,7 @@ // File: /server/models/Entry.js import mongoose from 'mongoose'; +import { createHash } from 'node:crypto'; import reflectionSchema from './subschemas/Reflection.js'; import tagSchema from './subschemas/Tag.js'; @@ -17,6 +18,43 @@ const EMOTION_VALUES = [ ]; const DUEL_OUTCOME_VALUES = ['TRANSMUTED', 'LIBERATED', 'STABILIZED', 'FALTERED']; +export const TRANSCRIPT_CANONICALIZATION_VERSION_V1 = '1'; + +const DOUBLE_SMART_QUOTES_REGEX = /[\u201C\u201D\u201E\u201F\u2033\u2036]/g; +const SINGLE_SMART_QUOTES_REGEX = /[\u2018\u2019\u201A\u201B\u2032\u2035]/g; +const DASH_VARIANTS_REGEX = /[\u2010\u2011\u2012\u2013\u2014\u2015\u2212]/g; + +export function canonicalizeTranscriptV1(input) { + if (typeof input !== 'string') return ''; + + return input + .replace(/\uFEFF/g, '') + .replace(/\u0000/g, '') + .normalize('NFKC') + .replace(DOUBLE_SMART_QUOTES_REGEX, '"') + .replace(SINGLE_SMART_QUOTES_REGEX, "'") + .replace(DASH_VARIANTS_REGEX, '-') + .replace(/\u2026/g, '...') + .replace(/\r\n?/g, '\n') + .trim(); +} + +export function sha256Hex(input) { + return createHash('sha256').update(String(input || ''), 'utf8').digest('hex'); +} + +export function buildCanonicalTranscriptPayload(transcriptInput) { + const rawTranscript = typeof transcriptInput === 'string' ? transcriptInput : ''; + const canonicalTranscript = canonicalizeTranscriptV1(rawTranscript); + + return { + transcript: canonicalTranscript, + rawTranscript, + canonicalTranscript, + transcriptHash: sha256Hex(canonicalTranscript), + canonicalizationVersion: TRANSCRIPT_CANONICALIZATION_VERSION_V1, + }; +} const duelHistorySchema = new Schema( { @@ -77,6 +115,31 @@ const entrySchema = new Schema( default: '', }, + /** + * Canonicalization v1 stores both source text and a deterministic normalized view. + * Hashing source-of-truth is canonicalTranscript (not rawTranscript). + */ + rawTranscript: { + type: String, + default: '', + }, + + canonicalTranscript: { + type: String, + default: '', + }, + + transcriptHash: { + type: String, + trim: true, + default: '', + }, + + canonicalizationVersion: { + type: String, + default: null, + }, + /** * Transcript lifecycle state (worker-friendly). * Keep enum small and stable. @@ -458,7 +521,12 @@ entrySchema.methods.setTranscriptState = async function setTranscriptState({ if (status) this.transcriptStatus = status; if (typeof transcript === 'string') { - this.transcript = transcript; + const canonicalPayload = buildCanonicalTranscriptPayload(transcript); + this.transcript = canonicalPayload.transcript; + this.rawTranscript = canonicalPayload.rawTranscript; + this.canonicalTranscript = canonicalPayload.canonicalTranscript; + this.transcriptHash = canonicalPayload.transcriptHash; + this.canonicalizationVersion = canonicalPayload.canonicalizationVersion; if (!status) this.transcriptStatus = 'transcript_ready'; } diff --git a/server/models/__tests__/Entry.test.js b/server/models/__tests__/Entry.test.js index 5a769bec..eeb0d474 100644 --- a/server/models/__tests__/Entry.test.js +++ b/server/models/__tests__/Entry.test.js @@ -1,6 +1,11 @@ import { describe, expect, it } from 'vitest'; -import Entry from '../Entry.js'; +import Entry, { + buildCanonicalTranscriptPayload, + canonicalizeTranscriptV1, + sha256Hex, + TRANSCRIPT_CANONICALIZATION_VERSION_V1, +} from '../Entry.js'; describe('Entry emotionalState schema', () => { it('includes appraisal/coreNeed enums with safe defaults', () => { @@ -21,3 +26,29 @@ describe('Entry emotionalState schema', () => { ); }); }); + +describe('transcript canonicalization v1', () => { + it('normalizes corpus variants deterministically and idempotently', () => { + const corpus = '\uFEFF\u0000 “Curly”\r\nline—two\rline…\u2018ok\u2019 \u0000'; + + const canonical = canonicalizeTranscriptV1(corpus); + + expect(canonical).toBe('"Curly"\nline-two\nline...\'ok\''); + expect(canonicalizeTranscriptV1(canonical)).toBe(canonical); + }); + + it('builds stable hash payloads from canonical transcript only', () => { + const payload = buildCanonicalTranscriptPayload('A\r\nB…'); + + expect(payload).toMatchObject({ + transcript: 'A\nB...', + rawTranscript: 'A\r\nB…', + canonicalTranscript: 'A\nB...', + canonicalizationVersion: TRANSCRIPT_CANONICALIZATION_VERSION_V1, + }); + + expect(payload.transcriptHash).toBe(sha256Hex(payload.canonicalTranscript)); + expect(payload.transcriptHash).toBe(sha256Hex('A\nB...')); + expect(payload.transcriptHash).not.toBe(sha256Hex(payload.rawTranscript)); + }); +}); diff --git a/server/routes/__tests__/upload.test.js b/server/routes/__tests__/upload.test.js index d3723b89..71bb54e9 100644 --- a/server/routes/__tests__/upload.test.js +++ b/server/routes/__tests__/upload.test.js @@ -29,6 +29,13 @@ vi.mock('../../models/Entry.js', () => ({ _id: 'entry-mock-id', })), }, + buildCanonicalTranscriptPayload: (value) => ({ + transcript: typeof value === 'string' ? value : '', + rawTranscript: typeof value === 'string' ? value : '', + canonicalTranscript: typeof value === 'string' ? value : '', + transcriptHash: 'mock-hash', + canonicalizationVersion: '1', + }), })); vi.mock('../../src/orchestration/agentOrchestration.js', () => ({ createScribeTask: vi.fn(async () => ({ diff --git a/server/routes/upload.js b/server/routes/upload.js index 7ed429fc..4373a35b 100644 --- a/server/routes/upload.js +++ b/server/routes/upload.js @@ -5,7 +5,7 @@ import { createRequire } from 'module'; import * as mm from 'music-metadata'; import { storeAudio } from '../services/audioStorage.js'; -import Entry from '../models/Entry.js'; +import Entry, { buildCanonicalTranscriptPayload } from '../models/Entry.js'; import { createScribeTask } from '../src/orchestration/agentOrchestration.js'; const require = createRequire(import.meta.url); @@ -133,6 +133,8 @@ router.post('/', async (req, res) => { const isS3 = storage === 's3'; const isLocal = storage === 'local'; + const transcriptPayload = buildCanonicalTranscriptPayload(''); + const entry = await Entry.create({ userId, @@ -145,7 +147,7 @@ router.post('/', async (req, res) => { audioUrl: isLocal ? url : undefined, audioDurationSeconds, - transcript: '', + ...transcriptPayload, }); const task = await createScribeTask(entry._id.toString()); diff --git a/server/src/workers/__tests__/reflection.worker.test.ts b/server/src/workers/__tests__/reflection.worker.test.ts index fbe6909a..d328f1bc 100644 --- a/server/src/workers/__tests__/reflection.worker.test.ts +++ b/server/src/workers/__tests__/reflection.worker.test.ts @@ -59,6 +59,8 @@ vi.mock('../../../models/Entry.js', () => ({ find: mocks.findMock, updateOne: mocks.updateOneMock, }, + canonicalizeTranscriptV1: (value: unknown) => String(value ?? '').trim(), + sha256Hex: (value: unknown) => `hash:${String(value ?? '')}`, })); // ----------------------------- @@ -602,7 +604,7 @@ ${JSON.stringify([{ type: 'reflection', headline: 'Safe headline', confidence: 0 ); expect(historyQuery.select).toHaveBeenCalledWith( - 'transcript createdAt emotionalIntensity vibe transcriptVersion' + 'transcript canonicalTranscript canonicalizationVersion createdAt emotionalIntensity vibe transcriptVersion' ); expect(historyQuery.sort).toHaveBeenCalledWith({ createdAt: -1 }); expect(historyQuery.limit).toHaveBeenCalledWith(100); diff --git a/server/src/workers/__tests__/scribe.flow.test.js b/server/src/workers/__tests__/scribe.flow.test.js index f7c1a192..855a6b35 100644 --- a/server/src/workers/__tests__/scribe.flow.test.js +++ b/server/src/workers/__tests__/scribe.flow.test.js @@ -13,6 +13,13 @@ vi.mock('../../../models/Entry.js', () => ({ updateOne: updateOneMock, findById: findByIdMock, }, + buildCanonicalTranscriptPayload: (value) => ({ + transcript: String(value ?? ''), + rawTranscript: String(value ?? ''), + canonicalTranscript: String(value ?? ''), + transcriptHash: 'mock-hash', + canonicalizationVersion: '1', + }), })); vi.mock('../../models/AgentTask.js', () => ({ diff --git a/server/src/workers/__tests__/scribe.worker.test.ts b/server/src/workers/__tests__/scribe.worker.test.ts index 43004e83..5f5b5173 100644 --- a/server/src/workers/__tests__/scribe.worker.test.ts +++ b/server/src/workers/__tests__/scribe.worker.test.ts @@ -15,6 +15,13 @@ vi.mock('../../../models/Entry.js', () => ({ updateOne: updateOneMock, findById: findByIdMock, }, + buildCanonicalTranscriptPayload: (value) => ({ + transcript: String(value ?? ''), + rawTranscript: String(value ?? ''), + canonicalTranscript: String(value ?? ''), + transcriptHash: 'mock-hash', + canonicalizationVersion: '1', + }), })); vi.mock('../../models/AgentTask.js', () => ({ diff --git a/server/src/workers/reflection.worker.js b/server/src/workers/reflection.worker.js index 23fba598..d4df9f86 100644 --- a/server/src/workers/reflection.worker.js +++ b/server/src/workers/reflection.worker.js @@ -1,9 +1,8 @@ // File: server/src/workers/reflection.worker.js -import { createHash } from 'node:crypto'; import { Worker } from 'bullmq'; -import Entry from '../../models/Entry.js'; +import Entry, { canonicalizeTranscriptV1, sha256Hex } from '../../models/Entry.js'; import AgentTask from '../models/AgentTask.js'; import { connection } from '../queues/index.js'; @@ -33,18 +32,6 @@ export const REFLECTION_MODE = 'reflection'; // Deterministic, non-deceptive terminal placeholder. const NO_RECEIPTED_MEANING_PLACEHOLDER = 'No receipted meaning available.'; -// Single canonicalization routine (boundary-only, not a safety module). -// Keep stable across OS newline differences + Unicode composition. -function canonicalizeText(input) { - if (typeof input !== 'string') return ''; - return input - .normalize('NFC') - .replace(/\r\n/g, '\n') - .replace(/\r/g, '\n') - .trim() - .replace(/[^\S\n]+/g, ' '); // collapse spaces/tabs, preserve newlines -} - function logSafetyEvent({ severity = 'INFO', type, userId, entryId, details = {} }) { const payload = { event: 'SAFETY_EVENT', @@ -101,7 +88,6 @@ const getReceiptAnchor = (receipt) => { return receipt.anchor || receipt.quote || receipt.text || ''; }; -const sha256Hex = (value) => createHash('sha256').update(String(value || ''), 'utf8').digest('hex'); const resolveReceiptOffsets = (receipt) => { if (!receipt || typeof receipt !== 'object') return null; @@ -538,7 +524,10 @@ export async function handleReflectionJob(job) { const entryIdStr = String(entry._id); const userIdStr = String(entry.userId); - const canonicalTranscript = canonicalizeText(entry.transcript || ''); + const canonicalTranscript = + entry?.canonicalizationVersion === '1' && typeof entry?.canonicalTranscript === 'string' + ? entry.canonicalTranscript + : canonicalizeTranscriptV1(entry?.transcript || ''); const transcriptVersion = coerceTranscriptVersionString(entry); if (entry.pipelineStatus?.reflection === 'completed') { @@ -607,7 +596,7 @@ export async function handleReflectionJob(job) { userId: entry.userId, createdAt: { $gte: thirtyDaysAgo }, }) - .select('transcript createdAt emotionalIntensity vibe transcriptVersion') + .select('transcript canonicalTranscript canonicalizationVersion createdAt emotionalIntensity vibe transcriptVersion') .sort({ createdAt: -1 }) .limit(100) .lean(); @@ -620,7 +609,10 @@ export async function handleReflectionJob(job) { const intensity = Number(e.emotionalIntensity ?? e?.vibe?.intensity ?? e?.vibe?.score ?? 0); return { - text: canonicalizeText(e.transcript || ''), + text: + e?.canonicalizationVersion === '1' && typeof e?.canonicalTranscript === 'string' + ? e.canonicalTranscript + : canonicalizeTranscriptV1(e?.transcript || ''), intensity: Number.isFinite(intensity) ? intensity : 0, timestamp, }; diff --git a/server/src/workers/scribe.worker.js b/server/src/workers/scribe.worker.js index 7b45360c..80852e5f 100644 --- a/server/src/workers/scribe.worker.js +++ b/server/src/workers/scribe.worker.js @@ -6,7 +6,7 @@ import os from 'os'; import { Readable } from 'stream'; import { pipeline } from 'stream/promises'; -import Entry from '../../models/Entry.js'; +import Entry, { buildCanonicalTranscriptPayload } from '../../models/Entry.js'; import { langfuse } from '../../utils/langfuse.js'; import { createAgent } from '../../utils/agents/createAgent.js'; @@ -338,11 +338,13 @@ export async function handleScribeJob(job, context = {}) { // 2) Persist transcript ASAP (never risk losing it due to downstream best-effort work) // Dual-Write: Mark both legacy and new flags as complete. + const transcriptPayload = buildCanonicalTranscriptPayload(transcript); + await Entry.updateOne( { _id: entryId }, { $set: { - transcript, + ...transcriptPayload, transcriptStatus: 'transcript_ready', // Legacy UI support (stops spinner) transcriptReadyAt: new Date(), 'pipelineStatus.upload': 'completed', diff --git a/server/tests/receipt.v1.test.js b/server/tests/receipt.v1.test.js index 34bc3193..6c5c2336 100644 --- a/server/tests/receipt.v1.test.js +++ b/server/tests/receipt.v1.test.js @@ -28,6 +28,8 @@ beforeAll(async () => { // IMPORTANT: these specifiers must match what reflection.worker.js imports. vi.mock('../models/Entry.js', () => ({ default: { findById: vi.fn(), updateOne: vi.fn(), find: vi.fn() }, + canonicalizeTranscriptV1: (value) => String(value ?? '').trim(), + sha256Hex: (value) => createHash('sha256').update(String(value || ''), 'utf8').digest('hex'), })); vi.mock('../src/models/AgentTask.js', () => ({ From 63bc1f206034da8b3583e5d673905c92fd29596f Mon Sep 17 00:00:00 2001 From: Will Haynes Date: Fri, 20 Feb 2026 09:05:53 -0600 Subject: [PATCH 3/3] fix(codex): align latest.json locks/scope; fold transcript whitespace safely --- codex/tasks/latest.json | 14 ++++++------ server/models/Entry.js | 32 ++++++++++++++++++++++----- server/models/__tests__/Entry.test.js | 22 +++++++++++++----- 3 files changed, 50 insertions(+), 18 deletions(-) diff --git a/codex/tasks/latest.json b/codex/tasks/latest.json index 58bdcf55..e5f76b58 100644 --- a/codex/tasks/latest.json +++ b/codex/tasks/latest.json @@ -3,7 +3,7 @@ "title": "Phase 1: Canonicalize transcript at ingest + stable transcriptHash", "summary": "Implement ingest-time transcript canonicalization (NFKC + punctuation folding + line-ending normalization + BOM/null stripping) with versioning. Store rawTranscript + canonicalTranscript + transcriptHash + canonicalizationVersion on Entry for all write paths (upload route + GraphQL addEntry/updateEntry + any other transcript writers). Add deterministic tests for the canonicalization corpus. Do not bulk-migrate existing entries; freeze legacy entries at canonicalizationVersion=0/null and only apply v1 on new/updated transcripts going forward.", "base_branch": "develop", - "branch_name": "codex/phase1-ingest-canonicalization-exec-2026-02-20", + "branch_name": "codex/implement-transcript-canonicalization-at-ingest", "repo_scope": [ "server/models/Entry.js", "server/routes/upload.js", @@ -15,12 +15,12 @@ "server/models/__tests__/**", "server/src/**/__tests__/**", "server/tests/**", + "server/__tests__/**", + "server/routes/__tests__/**", "scripts/codex_preflight.mjs", "codex/tasks/latest.json" ], - "agents_involved": [ - "codex-web" - ], + "agents_involved": ["codex-web"], "risk_level": "medium", "tests_to_run": [ "node -e \"JSON.parse(require('fs').readFileSync('codex/tasks/latest.json','utf8')); console.log('latest.json ok')\"", @@ -39,17 +39,17 @@ "acceptance_checks": [ "Alignment Evidence: print task_id, base_branch, branch_name, repo_scope, tests_to_run at start of run.", "Work-Exists Gate: identify all transcript write paths (upload.js, GraphQL addEntry/updateEntry, scribe worker transcript persistence) and show exact files/lines to be changed.", - "Implement a single ingest canonicalization function (v1) using NFKC + punctuation folding + newline normalization + BOM/null stripping + trim; store canonicalizationVersion='1'.", + "Implement a single ingest canonicalization function (v1) using NFKC + punctuation folding + newline normalization + BOM/null stripping + internal whitespace folding (preserve newlines) + trim; store canonicalizationVersion='1'.", "Entry stores rawTranscript (untouched) and canonicalTranscript (canonicalized). transcriptHash is sha256(canonicalTranscript).", "All transcript-writing paths set/update canonical fields consistently when transcript changes.", - "Add/extend deterministic tests covering: smart quotes folding, dash folding, ellipsis folding, CRLF/CR normalization, BOM/null stripping, and idempotency (canon(canon(x))==canon(x)).", + "Add/extend deterministic tests covering: smart quotes folding, dash folding, ellipsis folding, CRLF/CR normalization, BOM/null stripping, internal whitespace folding (tabs/multi-spaces without breaking newlines), and idempotency (canon(canon(x))==canon(x)).", "Run tests_to_run and show outputs. If any test is skipped, explain why and provide a safe alternative.", "Change Proof: show git status -sb and git diff --stat at end. No diff => no PR." ], "locks": { "task_id": "phase1_ingest_canonicalization_2026_02_20", "base_branch": "develop", - "branch_name": "codex/phase1-ingest-canonicalization-exec-2026-02-20", + "branch_name": "codex/implement-transcript-canonicalization-at-ingest", "canary": "PHASE1_INGEST_CANON_V1_CANARY_2026_02_20" } } diff --git a/server/models/Entry.js b/server/models/Entry.js index bb004d72..c9643b3b 100644 --- a/server/models/Entry.js +++ b/server/models/Entry.js @@ -20,22 +20,38 @@ const EMOTION_VALUES = [ const DUEL_OUTCOME_VALUES = ['TRANSMUTED', 'LIBERATED', 'STABILIZED', 'FALTERED']; export const TRANSCRIPT_CANONICALIZATION_VERSION_V1 = '1'; +// NOTE: Canonicalization v1 is intended to stabilize voice transcripts across: +// - OS newline differences +// - Unicode composition / compatibility variants (NFKC) +// - common typography artifacts (smart quotes/dashes/ellipsis) +// - transcription spacing artifacts (tabs / double-spaces) +// It MUST preserve newlines, but should fold intra-line whitespace to single spaces. const DOUBLE_SMART_QUOTES_REGEX = /[\u201C\u201D\u201E\u201F\u2033\u2036]/g; const SINGLE_SMART_QUOTES_REGEX = /[\u2018\u2019\u201A\u201B\u2032\u2035]/g; const DASH_VARIANTS_REGEX = /[\u2010\u2011\u2012\u2013\u2014\u2015\u2212]/g; +const INTERNAL_WHITESPACE_EXCEPT_NEWLINE_REGEX = /[^\S\n]+/g; export function canonicalizeTranscriptV1(input) { if (typeof input !== 'string') return ''; return input + // Remove BOM + NULs (rare, but show up in some pipelines) .replace(/\uFEFF/g, '') .replace(/\u0000/g, '') + // Compatibility normalize to reduce variant churn in transcripts .normalize('NFKC') + // Typography folding .replace(DOUBLE_SMART_QUOTES_REGEX, '"') .replace(SINGLE_SMART_QUOTES_REGEX, "'") .replace(DASH_VARIANTS_REGEX, '-') .replace(/\u2026/g, '...') + // Newline normalization .replace(/\r\n?/g, '\n') + // ✅ Critical: fold internal whitespace runs (tabs/multi-spaces), preserve newlines + .replace(INTERNAL_WHITESPACE_EXCEPT_NEWLINE_REGEX, ' ') + // ✅ remove whitespace at end-of-line + .replace(/[^\S\n]+\n/g, '\n') + // Boundary trim .trim(); } @@ -48,7 +64,10 @@ export function buildCanonicalTranscriptPayload(transcriptInput) { const canonicalTranscript = canonicalizeTranscriptV1(rawTranscript); return { + // Legacy read surface stays `transcript` but now equals canonical transcript: canonicalTranscript, + + // New v1 fields rawTranscript, canonicalTranscript, transcriptHash: sha256Hex(canonicalTranscript), @@ -109,6 +128,10 @@ const entrySchema = new Schema( maxlength: 200, }, + /** + * Legacy transcript surface. + * Canonicalization v1 writes canonical text here to keep old readers stable. + */ transcript: { type: String, trim: true, @@ -389,8 +412,8 @@ entrySchema.index({ title: 'text', transcript: 'text' }); * Default: exclude soft-deleted docs for find* queries. * * Opt-out: - * 1) Query option: Entry.find(...).setOptions({ includeDeleted: true }) - * 2) Query sentinel: Entry.find({ includeDeleted: true }) // legacy-friendly + * 1) Query option: Entry.find(...).setOptions({ includeDeleted: true }) + * 2) Query sentinel: Entry.find({ includeDeleted: true }) // legacy-friendly */ function excludeDeletedQuery(next) { const opts = (typeof this.getOptions === 'function' && this.getOptions()) || {}; @@ -436,8 +459,7 @@ entrySchema.pre('aggregate', function excludeDeletedAggregate(next) { Array.isArray(pipeline) && pipeline.some( (stage) => - stage?.$match && - Object.prototype.hasOwnProperty.call(stage.$match, 'isDeleted') + stage?.$match && Object.prototype.hasOwnProperty.call(stage.$match, 'isDeleted') ); if (alreadyMatched) return next(); @@ -553,4 +575,4 @@ entrySchema.methods.setTranscriptState = async function setTranscriptState({ // Prevent OverwriteModelError in watch/test environments. const Entry = mongoose.models.Entry || mongoose.model('Entry', entrySchema); -export default Entry; +export default Entry; \ No newline at end of file diff --git a/server/models/__tests__/Entry.test.js b/server/models/__tests__/Entry.test.js index eeb0d474..e2987583 100644 --- a/server/models/__tests__/Entry.test.js +++ b/server/models/__tests__/Entry.test.js @@ -1,3 +1,5 @@ +// File: /server/models/__tests__/Entry.test.js + import { describe, expect, it } from 'vitest'; import Entry, { @@ -19,36 +21,44 @@ describe('Entry emotionalState schema', () => { const coreNeedEnum = Entry.schema.path('emotionalState.coreNeed').enumValues; expect(appraisalEnum).toEqual( - expect.arrayContaining(['LOSS', 'THREAT', 'VIOLATION', 'CHALLENGE']), + expect.arrayContaining(['LOSS', 'THREAT', 'VIOLATION', 'CHALLENGE']) ); expect(coreNeedEnum).toEqual( - expect.arrayContaining(['SAFETY', 'CONNECTION', 'AUTONOMY', 'COMPETENCE']), + expect.arrayContaining(['SAFETY', 'CONNECTION', 'AUTONOMY', 'COMPETENCE']) ); }); }); describe('transcript canonicalization v1', () => { it('normalizes corpus variants deterministically and idempotently', () => { - const corpus = '\uFEFF\u0000 “Curly”\r\nline—two\rline…\u2018ok\u2019 \u0000'; + // Includes: + // - BOM + NUL stripping + // - smart quotes/dashes/ellipsis folding + // - CRLF/CR to LF normalization + // - internal whitespace folding (tabs + multi-spaces) while preserving newlines + const corpus = + '\uFEFF\u0000 “Curly”\t\t\r\nline—two \rline…\u2018ok\u2019 \t \u0000'; const canonical = canonicalizeTranscriptV1(corpus); + // Tabs and multi-spaces collapse to single spaces; newlines are preserved. expect(canonical).toBe('"Curly"\nline-two\nline...\'ok\''); expect(canonicalizeTranscriptV1(canonical)).toBe(canonical); }); it('builds stable hash payloads from canonical transcript only', () => { - const payload = buildCanonicalTranscriptPayload('A\r\nB…'); + const payload = buildCanonicalTranscriptPayload('A\t \r\nB…'); expect(payload).toMatchObject({ transcript: 'A\nB...', - rawTranscript: 'A\r\nB…', + rawTranscript: 'A\t \r\nB…', canonicalTranscript: 'A\nB...', canonicalizationVersion: TRANSCRIPT_CANONICALIZATION_VERSION_V1, }); + // Hash must be derived from canonicalTranscript only (never rawTranscript). expect(payload.transcriptHash).toBe(sha256Hex(payload.canonicalTranscript)); expect(payload.transcriptHash).toBe(sha256Hex('A\nB...')); expect(payload.transcriptHash).not.toBe(sha256Hex(payload.rawTranscript)); }); -}); +}); \ No newline at end of file