From 9ecb10091077e8190aaee3ce00cddbb5ecfa348b Mon Sep 17 00:00:00 2001
From: Will Haynes <wileland7@gmail.com>
Date: Fri, 20 Feb 2026 01:08:41 -0600
Subject: [PATCH 1/3] codex(task): phase1 ingest transcript canonicalization v1

---
 codex/tasks/latest.json | 68 +++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/codex/tasks/latest.json b/codex/tasks/latest.json
index 3a88eb3a..58bdcf55 100644
--- a/codex/tasks/latest.json
+++ b/codex/tasks/latest.json
@@ -1,45 +1,55 @@
 {
-  "task_id": "phase0-spine-lockdown-2026-02-19",
-  "title": "Phase 0 Spine Lockdown: freeze contract vocab, kill ambiguous receipt offsets, harden emission + narrative policy",
+  "task_id": "phase1_ingest_canonicalization_2026_02_20",
+  "title": "Phase 1: Canonicalize transcript at ingest + stable transcriptHash",
+  "summary": "Implement ingest-time transcript canonicalization (NFKC + punctuation folding + line-ending normalization + BOM/null stripping) with versioning. Store rawTranscript + canonicalTranscript + transcriptHash + canonicalizationVersion on Entry for all write paths (upload route + GraphQL addEntry/updateEntry + any other transcript writers). Add deterministic tests for the canonicalization corpus. Do not bulk-migrate existing entries; freeze legacy entries at canonicalizationVersion=0/null and only apply v1 on new/updated transcripts going forward.",
   "base_branch": "develop",
-  "branch_name": "codex/phase0-spine-lockdown-exec-2026-02-19",
-  "summary": "Seal the Meaning Spine by freezing contract reason codes, enforcing unique-match offset inference (ambiguity=poison), hardening validateReceipt (strict V1 never falls through), ensuring ENTRY_ANALYZED emits contract+sanitized cards only (no raw reflection text), and locking narrative toggle behind a shared policy utility that callers cannot override. Add/adjust regression tests to prevent drift.",
+  "branch_name": "codex/phase1-ingest-canonicalization-exec-2026-02-20",
   "repo_scope": [
-    "codex/tasks/latest.json",
+    "server/models/Entry.js",
+    "server/routes/upload.js",
+    "server/graphql/resolvers/index.js",
+    "server/src/workers/scribe.worker.js",
     "server/src/workers/reflection.worker.js",
-    "server/src/utils/truthValidator.js",
     "server/src/utils/**",
-    "server/src/workers/__tests__/**",
+    "server/utils/**",
+    "server/models/__tests__/**",
+    "server/src/**/__tests__/**",
     "server/tests/**",
-    "docs/testing-doctrine.md"
+    "scripts/codex_preflight.mjs",
+    "codex/tasks/latest.json"
   ],
-  "agents_involved": ["codex_web"],
-  "risk_level": "low",
+  "agents_involved": [
+    "codex-web"
+  ],
+  "risk_level": "medium",
   "tests_to_run": [
+    "node -e \"JSON.parse(require('fs').readFileSync('codex/tasks/latest.json','utf8')); console.log('latest.json ok')\"",
     "node scripts/codex_preflight.mjs --ci",
     "pnpm -C server test"
   ],
   "constraints": [
-    "CODEX_WEB: Do NOT run git network commands (no git fetch/pull/push/clone). Use the UI “Create PR” button if a PR is needed.",
-    "CODEX_WEB_HEAD: In Codex Web, the checked-out branch name may be 'work'. Do NOT treat HEAD name mismatch as stale. Locks+canary are the source of truth.",
-    "ANTI-COP-OUT: No diff => no PR. If no actionable work exists, stop and report evidence.",
-    "SCOPE: Do not modify files outside repo_scope. If out-of-scope issues are found, produce a Repair Manifest instead of changing them.",
-    "ALIGNMENT: Print task_id/base_branch/branch_name/canary from latest.json before doing any work.",
-    "EVIDENCE_BUNDLE: Provide evidence in 4 phases: Alignment, Work-Exists Gate, Change Proof, Tests.",
-    "PR_BASE: Ensure PR base branch is develop (not another codex/* branch). Do not create draft PRs.",
-    "NO_PLACEHOLDERS: Do not create empty directories or placeholder files. Only create files with real content and tests.",
-    "NO_NETWORK: Tests must not touch real external network services."
+    "Codex Web environment: do NOT run git push; use the Create PR button.",
+    "Do NOT create placeholder files or empty directories. If no diff is needed, stop and report; do not create a PR.",
+    "All changes must remain within repo_scope. If a necessary fix is out-of-scope, produce a Repair Manifest instead of changing it.",
+    "Canonicalization happens at ingest/write time only (identity). Do not re-canonicalize during validation except legacy v0 fallback.",
+    "Do NOT bulk-migrate existing stored transcripts. Implement freeze+version: legacy entries are v0/null; new writes become v1.",
+    "Hashing must be based on canonicalTranscript and must NOT use locale-sensitive casefolding (no toLowerCase/toUpperCase on hash inputs).",
+    "No raw user transcript content may be logged or emitted into events as part of this change."
   ],
   "acceptance_checks": [
-    "Alignment Evidence: show codex/tasks/latest.json values for task_id, base_branch, branch_name, and canary.",
-    "Alignment Evidence: print `git rev-parse --abbrev-ref HEAD` and `git rev-parse HEAD` for evidence; do NOT stop on SHA mismatch.",
-    "Work-Exists Gate: prove target symbols exist via grep or file navigation; if not found, stop and report: findReceiptOffsets (or equivalent), emitEntryAnalyzed callsite/payload, sanitizeBloomCardsWithContract boundary, validateReceipt in server/src/utils/truthValidator.js (or its imported helpers).",
-    "Freeze contract reason codes: add a shared constants module and replace raw string comparisons/assignments in Meaning Spine paths touched by this task.",
-    "Unique Match Rule: any transcript-search offset inference must return null on ambiguous multi-occurrence matches (firstIndex !== lastIndex). Ambiguity must drop the receipt/card safely and be reflected in contract/dropped reasons.",
-    "validateReceipt hardening: strict V1 path must not fall through to weaker matching if offsets fail; invalid shapes return explicit failure reasons and do not throw.",
-    "Emission hardening: ENTRY_ANALYZED payload must contain sanitized cards AND the Meaning Contract ledger; payload must not include raw reflection text anywhere.",
-    "Tests: add/adjust regression tests that fail if raw model output leaks into emission serialization; add/adjust tests verifying ambiguous quote matches are dropped.",
-    "Proof: include git status -sb and git diff --stat after changes; run tests_to_run and report results. (Run `pnpm -w test` locally after PR if desired.)"
+    "Alignment Evidence: print task_id, base_branch, branch_name, repo_scope, tests_to_run at start of run.",
+    "Work-Exists Gate: identify all transcript write paths (upload.js, GraphQL addEntry/updateEntry, scribe worker transcript persistence) and show exact files/lines to be changed.",
+    "Implement a single ingest canonicalization function (v1) using NFKC + punctuation folding + newline normalization + BOM/null stripping + trim; store canonicalizationVersion='1'.",
+    "Entry stores rawTranscript (untouched) and canonicalTranscript (canonicalized). transcriptHash is sha256(canonicalTranscript).",
+    "All transcript-writing paths set/update canonical fields consistently when transcript changes.",
+    "Add/extend deterministic tests covering: smart quotes folding, dash folding, ellipsis folding, CRLF/CR normalization, BOM/null stripping, and idempotency (canon(canon(x))==canon(x)).",
+    "Run tests_to_run and show outputs. If any test is skipped, explain why and provide a safe alternative.",
+    "Change Proof: show git status -sb and git diff --stat at end. No diff => no PR."
   ],
-  "canary": "CANARY_PHASE0_SPINE_LOCKDOWN_2026_02_19"
+  "locks": {
+    "task_id": "phase1_ingest_canonicalization_2026_02_20",
+    "base_branch": "develop",
+    "branch_name": "codex/phase1-ingest-canonicalization-exec-2026-02-20",
+    "canary": "PHASE1_INGEST_CANON_V1_CANARY_2026_02_20"
+  }
 }

From f5c9c9a8888e084e3e38e8a420b8de6ceaaa903d Mon Sep 17 00:00:00 2001
From: William Leland Haynes <142263841+wileland@users.noreply.github.com>
Date: Fri, 20 Feb 2026 06:38:03 -0600
Subject: [PATCH 2/3] feat(server): canonicalize transcripts at ingest v1

---
 server/__tests__/upload.demo.test.js          |  7 ++
 server/graphql/resolvers/index.js             | 29 +++++---
 server/models/Entry.js                        | 70 ++++++++++++++++++-
 server/models/__tests__/Entry.test.js         | 33 ++++++++-
 server/routes/__tests__/upload.test.js        |  7 ++
 server/routes/upload.js                       |  6 +-
 .../__tests__/reflection.worker.test.ts       |  4 +-
 .../src/workers/__tests__/scribe.flow.test.js |  7 ++
 .../workers/__tests__/scribe.worker.test.ts   |  7 ++
 server/src/workers/reflection.worker.js       | 28 +++-----
 server/src/workers/scribe.worker.js           |  6 +-
 server/tests/receipt.v1.test.js               |  2 +
 12 files changed, 171 insertions(+), 35 deletions(-)

diff --git a/server/__tests__/upload.demo.test.js b/server/__tests__/upload.demo.test.js
index c6baf0bb..5ca2194f 100644
--- a/server/__tests__/upload.demo.test.js
+++ b/server/__tests__/upload.demo.test.js
@@ -14,6 +14,13 @@ vi.mock('../models/Entry.js', () => ({
       _id: 'demo-entry-id',
     })),
   },
+  buildCanonicalTranscriptPayload: (value) => ({
+    transcript: typeof value === 'string' ? value : '',
+    rawTranscript: typeof value === 'string' ? value : '',
+    canonicalTranscript: typeof value === 'string' ? value : '',
+    transcriptHash: 'mock-hash',
+    canonicalizationVersion: '1',
+  }),
 }));
 
 vi.mock('../src/orchestration/agentOrchestration.js', () => ({
diff --git a/server/graphql/resolvers/index.js b/server/graphql/resolvers/index.js
index 19ac4a4c..7d216322 100644
--- a/server/graphql/resolvers/index.js
+++ b/server/graphql/resolvers/index.js
@@ -2,7 +2,7 @@
 
 import jwt from 'jsonwebtoken';
 
-import Entry from '../../models/Entry.js';
+import Entry, { buildCanonicalTranscriptPayload } from '../../models/Entry.js';
 import TagModel from '../../models/Tag.js';
 import DuelSession from '../../models/DuelSession.js';
 import User from '../../models/User.js';
@@ -473,7 +473,6 @@ const resolvers = {
 
       const ModelToUse = ContextEntry || Entry;
 
-      const aiTags = await inferTagsForEntry(transcript);
 
       const processedTags = (tags || [])
         .map((t) => {
@@ -483,12 +482,6 @@ const resolvers = {
         })
         .filter(Boolean);
 
-      for (const t of aiTags) {
-        if (!processedTags.find((pt) => pt.label === t)) {
-          processedTags.push({ label: t, source: 'ai' });
-        }
-      }
-
       const rawAudioUrl = typeof audioUrl === 'string' ? audioUrl.trim() : '';
       const hasAudio = rawAudioUrl !== '';
       const duration = Number.isFinite(audioDurationSeconds) ? audioDurationSeconds : null;
@@ -503,10 +496,19 @@ const resolvers = {
 
       const shouldPersistUrl = hasAudio && !hasS3Bucket && !isPresignedUrl(rawAudioUrl);
 
+      const transcriptPayload = buildCanonicalTranscriptPayload(transcript);
+      const aiTags = await inferTagsForEntry(transcriptPayload.transcript);
+
+      for (const t of aiTags) {
+        if (!processedTags.find((pt) => pt.label === t)) {
+          processedTags.push({ label: t, source: 'ai' });
+        }
+      }
+
       const entry = await ModelToUse.create({
         userId,
         title,
-        transcript,
+        ...transcriptPayload,
 
         // Persist audioUrl ONLY when it is stable (local path or non-S3 remote without presign).
         audioUrl: shouldPersistUrl ? rawAudioUrl : undefined,
@@ -562,7 +564,14 @@ const resolvers = {
       entry.version += 1;
 
       if (title !== undefined) entry.title = title;
-      if (transcript !== undefined) entry.transcript = transcript;
+      if (transcript !== undefined) {
+        const transcriptPayload = buildCanonicalTranscriptPayload(transcript);
+        entry.transcript = transcriptPayload.transcript;
+        entry.rawTranscript = transcriptPayload.rawTranscript;
+        entry.canonicalTranscript = transcriptPayload.canonicalTranscript;
+        entry.transcriptHash = transcriptPayload.transcriptHash;
+        entry.canonicalizationVersion = transcriptPayload.canonicalizationVersion;
+      }
 
       if (tags !== undefined) {
         entry.tags = tags
diff --git a/server/models/Entry.js b/server/models/Entry.js
index 1db01c73..bb004d72 100644
--- a/server/models/Entry.js
+++ b/server/models/Entry.js
@@ -1,6 +1,7 @@
 // File: /server/models/Entry.js
 
 import mongoose from 'mongoose';
+import { createHash } from 'node:crypto';
 import reflectionSchema from './subschemas/Reflection.js';
 import tagSchema from './subschemas/Tag.js';
 
@@ -17,6 +18,43 @@ const EMOTION_VALUES = [
 ];
 
 const DUEL_OUTCOME_VALUES = ['TRANSMUTED', 'LIBERATED', 'STABILIZED', 'FALTERED'];
+export const TRANSCRIPT_CANONICALIZATION_VERSION_V1 = '1';
+
+const DOUBLE_SMART_QUOTES_REGEX = /[\u201C\u201D\u201E\u201F\u2033\u2036]/g;
+const SINGLE_SMART_QUOTES_REGEX = /[\u2018\u2019\u201A\u201B\u2032\u2035]/g;
+const DASH_VARIANTS_REGEX = /[\u2010\u2011\u2012\u2013\u2014\u2015\u2212]/g;
+
+export function canonicalizeTranscriptV1(input) {
+  if (typeof input !== 'string') return '';
+
+  return input
+    .replace(/\uFEFF/g, '')
+    .replace(/\u0000/g, '')
+    .normalize('NFKC')
+    .replace(DOUBLE_SMART_QUOTES_REGEX, '"')
+    .replace(SINGLE_SMART_QUOTES_REGEX, "'")
+    .replace(DASH_VARIANTS_REGEX, '-')
+    .replace(/\u2026/g, '...')
+    .replace(/\r\n?/g, '\n')
+    .trim();
+}
+
+export function sha256Hex(input) {
+  return createHash('sha256').update(String(input || ''), 'utf8').digest('hex');
+}
+
+export function buildCanonicalTranscriptPayload(transcriptInput) {
+  const rawTranscript = typeof transcriptInput === 'string' ? transcriptInput : '';
+  const canonicalTranscript = canonicalizeTranscriptV1(rawTranscript);
+
+  return {
+    transcript: canonicalTranscript,
+    rawTranscript,
+    canonicalTranscript,
+    transcriptHash: sha256Hex(canonicalTranscript),
+    canonicalizationVersion: TRANSCRIPT_CANONICALIZATION_VERSION_V1,
+  };
+}
 
 const duelHistorySchema = new Schema(
   {
@@ -77,6 +115,31 @@ const entrySchema = new Schema(
       default: '',
     },
 
+    /**
+     * Canonicalization v1 stores both source text and a deterministic normalized view.
+     * Hashing source-of-truth is canonicalTranscript (not rawTranscript).
+     */
+    rawTranscript: {
+      type: String,
+      default: '',
+    },
+
+    canonicalTranscript: {
+      type: String,
+      default: '',
+    },
+
+    transcriptHash: {
+      type: String,
+      trim: true,
+      default: '',
+    },
+
+    canonicalizationVersion: {
+      type: String,
+      default: null,
+    },
+
     /**
      * Transcript lifecycle state (worker-friendly).
      * Keep enum small and stable.
@@ -458,7 +521,12 @@ entrySchema.methods.setTranscriptState = async function setTranscriptState({
   if (status) this.transcriptStatus = status;
 
   if (typeof transcript === 'string') {
-    this.transcript = transcript;
+    const canonicalPayload = buildCanonicalTranscriptPayload(transcript);
+    this.transcript = canonicalPayload.transcript;
+    this.rawTranscript = canonicalPayload.rawTranscript;
+    this.canonicalTranscript = canonicalPayload.canonicalTranscript;
+    this.transcriptHash = canonicalPayload.transcriptHash;
+    this.canonicalizationVersion = canonicalPayload.canonicalizationVersion;
     if (!status) this.transcriptStatus = 'transcript_ready';
   }
 
diff --git a/server/models/__tests__/Entry.test.js b/server/models/__tests__/Entry.test.js
index 5a769bec..eeb0d474 100644
--- a/server/models/__tests__/Entry.test.js
+++ b/server/models/__tests__/Entry.test.js
@@ -1,6 +1,11 @@
 import { describe, expect, it } from 'vitest';
 
-import Entry from '../Entry.js';
+import Entry, {
+  buildCanonicalTranscriptPayload,
+  canonicalizeTranscriptV1,
+  sha256Hex,
+  TRANSCRIPT_CANONICALIZATION_VERSION_V1,
+} from '../Entry.js';
 
 describe('Entry emotionalState schema', () => {
   it('includes appraisal/coreNeed enums with safe defaults', () => {
@@ -21,3 +26,29 @@ describe('Entry emotionalState schema', () => {
     );
   });
 });
+
+describe('transcript canonicalization v1', () => {
+  it('normalizes corpus variants deterministically and idempotently', () => {
+    const corpus = '\uFEFF\u0000 “Curly”\r\nline—two\rline…\u2018ok\u2019 \u0000';
+
+    const canonical = canonicalizeTranscriptV1(corpus);
+
+    expect(canonical).toBe('"Curly"\nline-two\nline...\'ok\'');
+    expect(canonicalizeTranscriptV1(canonical)).toBe(canonical);
+  });
+
+  it('builds stable hash payloads from canonical transcript only', () => {
+    const payload = buildCanonicalTranscriptPayload('A\r\nB…');
+
+    expect(payload).toMatchObject({
+      transcript: 'A\nB...',
+      rawTranscript: 'A\r\nB…',
+      canonicalTranscript: 'A\nB...',
+      canonicalizationVersion: TRANSCRIPT_CANONICALIZATION_VERSION_V1,
+    });
+
+    expect(payload.transcriptHash).toBe(sha256Hex(payload.canonicalTranscript));
+    expect(payload.transcriptHash).toBe(sha256Hex('A\nB...'));
+    expect(payload.transcriptHash).not.toBe(sha256Hex(payload.rawTranscript));
+  });
+});
diff --git a/server/routes/__tests__/upload.test.js b/server/routes/__tests__/upload.test.js
index d3723b89..71bb54e9 100644
--- a/server/routes/__tests__/upload.test.js
+++ b/server/routes/__tests__/upload.test.js
@@ -29,6 +29,13 @@ vi.mock('../../models/Entry.js', () => ({
       _id: 'entry-mock-id',
     })),
   },
+  buildCanonicalTranscriptPayload: (value) => ({
+    transcript: typeof value === 'string' ? value : '',
+    rawTranscript: typeof value === 'string' ? value : '',
+    canonicalTranscript: typeof value === 'string' ? value : '',
+    transcriptHash: 'mock-hash',
+    canonicalizationVersion: '1',
+  }),
 }));
 vi.mock('../../src/orchestration/agentOrchestration.js', () => ({
   createScribeTask: vi.fn(async () => ({
diff --git a/server/routes/upload.js b/server/routes/upload.js
index 7ed429fc..4373a35b 100644
--- a/server/routes/upload.js
+++ b/server/routes/upload.js
@@ -5,7 +5,7 @@ import { createRequire } from 'module';
 import * as mm from 'music-metadata';
 
 import { storeAudio } from '../services/audioStorage.js';
-import Entry from '../models/Entry.js';
+import Entry, { buildCanonicalTranscriptPayload } from '../models/Entry.js';
 import { createScribeTask } from '../src/orchestration/agentOrchestration.js';
 
 const require = createRequire(import.meta.url);
@@ -133,6 +133,8 @@ router.post('/', async (req, res) => {
       const isS3 = storage === 's3';
       const isLocal = storage === 'local';
 
+      const transcriptPayload = buildCanonicalTranscriptPayload('');
+
       const entry = await Entry.create({
         userId,
 
@@ -145,7 +147,7 @@ router.post('/', async (req, res) => {
         audioUrl: isLocal ? url : undefined,
 
         audioDurationSeconds,
-        transcript: '',
+        ...transcriptPayload,
       });
 
       const task = await createScribeTask(entry._id.toString());
diff --git a/server/src/workers/__tests__/reflection.worker.test.ts b/server/src/workers/__tests__/reflection.worker.test.ts
index fbe6909a..d328f1bc 100644
--- a/server/src/workers/__tests__/reflection.worker.test.ts
+++ b/server/src/workers/__tests__/reflection.worker.test.ts
@@ -59,6 +59,8 @@ vi.mock('../../../models/Entry.js', () => ({
     find: mocks.findMock,
     updateOne: mocks.updateOneMock,
   },
+  canonicalizeTranscriptV1: (value: unknown) => String(value ?? '').trim(),
+  sha256Hex: (value: unknown) => `hash:${String(value ?? '')}`,
 }));
 
 // -----------------------------
@@ -602,7 +604,7 @@ ${JSON.stringify([{ type: 'reflection', headline: 'Safe headline', confidence: 0
     );
 
     expect(historyQuery.select).toHaveBeenCalledWith(
-      'transcript createdAt emotionalIntensity vibe transcriptVersion'
+      'transcript canonicalTranscript canonicalizationVersion createdAt emotionalIntensity vibe transcriptVersion'
     );
     expect(historyQuery.sort).toHaveBeenCalledWith({ createdAt: -1 });
     expect(historyQuery.limit).toHaveBeenCalledWith(100);
diff --git a/server/src/workers/__tests__/scribe.flow.test.js b/server/src/workers/__tests__/scribe.flow.test.js
index f7c1a192..855a6b35 100644
--- a/server/src/workers/__tests__/scribe.flow.test.js
+++ b/server/src/workers/__tests__/scribe.flow.test.js
@@ -13,6 +13,13 @@ vi.mock('../../../models/Entry.js', () => ({
     updateOne: updateOneMock,
     findById: findByIdMock,
   },
+  buildCanonicalTranscriptPayload: (value) => ({
+    transcript: String(value ?? ''),
+    rawTranscript: String(value ?? ''),
+    canonicalTranscript: String(value ?? ''),
+    transcriptHash: 'mock-hash',
+    canonicalizationVersion: '1',
+  }),
 }));
 
 vi.mock('../../models/AgentTask.js', () => ({
diff --git a/server/src/workers/__tests__/scribe.worker.test.ts b/server/src/workers/__tests__/scribe.worker.test.ts
index 43004e83..5f5b5173 100644
--- a/server/src/workers/__tests__/scribe.worker.test.ts
+++ b/server/src/workers/__tests__/scribe.worker.test.ts
@@ -15,6 +15,13 @@ vi.mock('../../../models/Entry.js', () => ({
     updateOne: updateOneMock,
     findById: findByIdMock,
   },
+  buildCanonicalTranscriptPayload: (value) => ({
+    transcript: String(value ?? ''),
+    rawTranscript: String(value ?? ''),
+    canonicalTranscript: String(value ?? ''),
+    transcriptHash: 'mock-hash',
+    canonicalizationVersion: '1',
+  }),
 }));
 
 vi.mock('../../models/AgentTask.js', () => ({
diff --git a/server/src/workers/reflection.worker.js b/server/src/workers/reflection.worker.js
index 23fba598..d4df9f86 100644
--- a/server/src/workers/reflection.worker.js
+++ b/server/src/workers/reflection.worker.js
@@ -1,9 +1,8 @@
 // File: server/src/workers/reflection.worker.js
 
-import { createHash } from 'node:crypto';
 import { Worker } from 'bullmq';
 
-import Entry from '../../models/Entry.js';
+import Entry, { canonicalizeTranscriptV1, sha256Hex } from '../../models/Entry.js';
 import AgentTask from '../models/AgentTask.js';
 import { connection } from '../queues/index.js';
 
@@ -33,18 +32,6 @@ export const REFLECTION_MODE = 'reflection';
 // Deterministic, non-deceptive terminal placeholder.
 const NO_RECEIPTED_MEANING_PLACEHOLDER = 'No receipted meaning available.';
 
-// Single canonicalization routine (boundary-only, not a safety module).
-// Keep stable across OS newline differences + Unicode composition.
-function canonicalizeText(input) {
-  if (typeof input !== 'string') return '';
-  return input
-    .normalize('NFC')
-    .replace(/\r\n/g, '\n')
-    .replace(/\r/g, '\n')
-    .trim()
-    .replace(/[^\S\n]+/g, ' '); // collapse spaces/tabs, preserve newlines
-}
-
 function logSafetyEvent({ severity = 'INFO', type, userId, entryId, details = {} }) {
   const payload = {
     event: 'SAFETY_EVENT',
@@ -101,7 +88,6 @@ const getReceiptAnchor = (receipt) => {
   return receipt.anchor || receipt.quote || receipt.text || '';
 };
 
-const sha256Hex = (value) => createHash('sha256').update(String(value || ''), 'utf8').digest('hex');
 
 const resolveReceiptOffsets = (receipt) => {
   if (!receipt || typeof receipt !== 'object') return null;
@@ -538,7 +524,10 @@ export async function handleReflectionJob(job) {
     const entryIdStr = String(entry._id);
     const userIdStr = String(entry.userId);
 
-    const canonicalTranscript = canonicalizeText(entry.transcript || '');
+    const canonicalTranscript =
+      entry?.canonicalizationVersion === '1' && typeof entry?.canonicalTranscript === 'string'
+        ? entry.canonicalTranscript
+        : canonicalizeTranscriptV1(entry?.transcript || '');
     const transcriptVersion = coerceTranscriptVersionString(entry);
 
     if (entry.pipelineStatus?.reflection === 'completed') {
@@ -607,7 +596,7 @@ export async function handleReflectionJob(job) {
       userId: entry.userId,
       createdAt: { $gte: thirtyDaysAgo },
     })
-      .select('transcript createdAt emotionalIntensity vibe transcriptVersion')
+      .select('transcript canonicalTranscript canonicalizationVersion createdAt emotionalIntensity vibe transcriptVersion')
       .sort({ createdAt: -1 })
       .limit(100)
       .lean();
@@ -620,7 +609,10 @@ export async function handleReflectionJob(job) {
         const intensity = Number(e.emotionalIntensity ?? e?.vibe?.intensity ?? e?.vibe?.score ?? 0);
 
         return {
-          text: canonicalizeText(e.transcript || ''),
+          text:
+            e?.canonicalizationVersion === '1' && typeof e?.canonicalTranscript === 'string'
+              ? e.canonicalTranscript
+              : canonicalizeTranscriptV1(e?.transcript || ''),
           intensity: Number.isFinite(intensity) ? intensity : 0,
           timestamp,
         };
diff --git a/server/src/workers/scribe.worker.js b/server/src/workers/scribe.worker.js
index 7b45360c..80852e5f 100644
--- a/server/src/workers/scribe.worker.js
+++ b/server/src/workers/scribe.worker.js
@@ -6,7 +6,7 @@ import os from 'os';
 import { Readable } from 'stream';
 import { pipeline } from 'stream/promises';
 
-import Entry from '../../models/Entry.js';
+import Entry, { buildCanonicalTranscriptPayload } from '../../models/Entry.js';
 import { langfuse } from '../../utils/langfuse.js';
 import { createAgent } from '../../utils/agents/createAgent.js';
 
@@ -338,11 +338,13 @@ export async function handleScribeJob(job, context = {}) {
 
     // 2) Persist transcript ASAP (never risk losing it due to downstream best-effort work)
     // Dual-Write: Mark both legacy and new flags as complete.
+    const transcriptPayload = buildCanonicalTranscriptPayload(transcript);
+
     await Entry.updateOne(
       { _id: entryId },
       {
         $set: {
-          transcript,
+          ...transcriptPayload,
           transcriptStatus: 'transcript_ready', // Legacy UI support (stops spinner)
           transcriptReadyAt: new Date(),
           'pipelineStatus.upload': 'completed',
diff --git a/server/tests/receipt.v1.test.js b/server/tests/receipt.v1.test.js
index 34bc3193..6c5c2336 100644
--- a/server/tests/receipt.v1.test.js
+++ b/server/tests/receipt.v1.test.js
@@ -28,6 +28,8 @@ beforeAll(async () => {
   // IMPORTANT: these specifiers must match what reflection.worker.js imports.
   vi.mock('../models/Entry.js', () => ({
     default: { findById: vi.fn(), updateOne: vi.fn(), find: vi.fn() },
+    canonicalizeTranscriptV1: (value) => String(value ?? '').trim(),
+    sha256Hex: (value) => createHash('sha256').update(String(value || ''), 'utf8').digest('hex'),
   }));
 
   vi.mock('../src/models/AgentTask.js', () => ({

From 63bc1f206034da8b3583e5d673905c92fd29596f Mon Sep 17 00:00:00 2001
From: Will Haynes <wileland7@gmail.com>
Date: Fri, 20 Feb 2026 09:05:53 -0600
Subject: [PATCH 3/3] fix(codex): align latest.json locks/scope; fold
 transcript whitespace safely

---
 codex/tasks/latest.json               | 14 ++++++------
 server/models/Entry.js                | 32 ++++++++++++++++++++++-----
 server/models/__tests__/Entry.test.js | 22 +++++++++++++-----
 3 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/codex/tasks/latest.json b/codex/tasks/latest.json
index 58bdcf55..e5f76b58 100644
--- a/codex/tasks/latest.json
+++ b/codex/tasks/latest.json
@@ -3,7 +3,7 @@
   "title": "Phase 1: Canonicalize transcript at ingest + stable transcriptHash",
   "summary": "Implement ingest-time transcript canonicalization (NFKC + punctuation folding + line-ending normalization + BOM/null stripping) with versioning. Store rawTranscript + canonicalTranscript + transcriptHash + canonicalizationVersion on Entry for all write paths (upload route + GraphQL addEntry/updateEntry + any other transcript writers). Add deterministic tests for the canonicalization corpus. Do not bulk-migrate existing entries; freeze legacy entries at canonicalizationVersion=0/null and only apply v1 on new/updated transcripts going forward.",
   "base_branch": "develop",
-  "branch_name": "codex/phase1-ingest-canonicalization-exec-2026-02-20",
+  "branch_name": "codex/implement-transcript-canonicalization-at-ingest",
   "repo_scope": [
     "server/models/Entry.js",
     "server/routes/upload.js",
@@ -15,12 +15,12 @@
     "server/models/__tests__/**",
     "server/src/**/__tests__/**",
     "server/tests/**",
+    "server/__tests__/**",
+    "server/routes/__tests__/**",
     "scripts/codex_preflight.mjs",
     "codex/tasks/latest.json"
   ],
-  "agents_involved": [
-    "codex-web"
-  ],
+  "agents_involved": ["codex-web"],
   "risk_level": "medium",
   "tests_to_run": [
     "node -e \"JSON.parse(require('fs').readFileSync('codex/tasks/latest.json','utf8')); console.log('latest.json ok')\"",
@@ -39,17 +39,17 @@
   "acceptance_checks": [
     "Alignment Evidence: print task_id, base_branch, branch_name, repo_scope, tests_to_run at start of run.",
     "Work-Exists Gate: identify all transcript write paths (upload.js, GraphQL addEntry/updateEntry, scribe worker transcript persistence) and show exact files/lines to be changed.",
-    "Implement a single ingest canonicalization function (v1) using NFKC + punctuation folding + newline normalization + BOM/null stripping + trim; store canonicalizationVersion='1'.",
+    "Implement a single ingest canonicalization function (v1) using NFKC + punctuation folding + newline normalization + BOM/null stripping + internal whitespace folding (preserve newlines) + trim; store canonicalizationVersion='1'.",
     "Entry stores rawTranscript (untouched) and canonicalTranscript (canonicalized). transcriptHash is sha256(canonicalTranscript).",
     "All transcript-writing paths set/update canonical fields consistently when transcript changes.",
-    "Add/extend deterministic tests covering: smart quotes folding, dash folding, ellipsis folding, CRLF/CR normalization, BOM/null stripping, and idempotency (canon(canon(x))==canon(x)).",
+    "Add/extend deterministic tests covering: smart quotes folding, dash folding, ellipsis folding, CRLF/CR normalization, BOM/null stripping, internal whitespace folding (tabs/multi-spaces without breaking newlines), and idempotency (canon(canon(x))==canon(x)).",
     "Run tests_to_run and show outputs. If any test is skipped, explain why and provide a safe alternative.",
     "Change Proof: show git status -sb and git diff --stat at end. No diff => no PR."
   ],
   "locks": {
     "task_id": "phase1_ingest_canonicalization_2026_02_20",
     "base_branch": "develop",
-    "branch_name": "codex/phase1-ingest-canonicalization-exec-2026-02-20",
+    "branch_name": "codex/implement-transcript-canonicalization-at-ingest",
     "canary": "PHASE1_INGEST_CANON_V1_CANARY_2026_02_20"
   }
 }
diff --git a/server/models/Entry.js b/server/models/Entry.js
index bb004d72..c9643b3b 100644
--- a/server/models/Entry.js
+++ b/server/models/Entry.js
@@ -20,22 +20,38 @@ const EMOTION_VALUES = [
 const DUEL_OUTCOME_VALUES = ['TRANSMUTED', 'LIBERATED', 'STABILIZED', 'FALTERED'];
 export const TRANSCRIPT_CANONICALIZATION_VERSION_V1 = '1';
 
+// NOTE: Canonicalization v1 is intended to stabilize voice transcripts across:
+// - OS newline differences
+// - Unicode composition / compatibility variants (NFKC)
+// - common typography artifacts (smart quotes/dashes/ellipsis)
+// - transcription spacing artifacts (tabs / double-spaces)
+// It MUST preserve newlines, but should fold intra-line whitespace to single spaces.
 const DOUBLE_SMART_QUOTES_REGEX = /[\u201C\u201D\u201E\u201F\u2033\u2036]/g;
 const SINGLE_SMART_QUOTES_REGEX = /[\u2018\u2019\u201A\u201B\u2032\u2035]/g;
 const DASH_VARIANTS_REGEX = /[\u2010\u2011\u2012\u2013\u2014\u2015\u2212]/g;
+const INTERNAL_WHITESPACE_EXCEPT_NEWLINE_REGEX = /[^\S\n]+/g;
 
 export function canonicalizeTranscriptV1(input) {
   if (typeof input !== 'string') return '';
 
   return input
+    // Remove BOM + NULs (rare, but show up in some pipelines)
     .replace(/\uFEFF/g, '')
     .replace(/\u0000/g, '')
+    // Fold double primes before NFKC, which would split U+2033/U+2036 into two single primes
+    .replace(/[\u2033\u2036]/g, '"')
     .normalize('NFKC')
+    // Typography folding (kept after NFKC so compatibility variants it emits are folded too)
     .replace(DOUBLE_SMART_QUOTES_REGEX, '"')
     .replace(SINGLE_SMART_QUOTES_REGEX, "'")
     .replace(DASH_VARIANTS_REGEX, '-')
     .replace(/\u2026/g, '...')
+    // Newline normalization
     .replace(/\r\n?/g, '\n')
+    // Fold intra-line whitespace runs (tabs/multi-spaces) to one space; newlines are preserved
+    .replace(INTERNAL_WHITESPACE_EXCEPT_NEWLINE_REGEX, ' ')
+    // Drop whitespace left at end-of-line; the final trim handles the string boundaries
+    .replace(/[^\S\n]+\n/g, '\n')
     .trim();
 }
 
@@ -48,7 +64,10 @@ export function buildCanonicalTranscriptPayload(transcriptInput) {
   const canonicalTranscript = canonicalizeTranscriptV1(rawTranscript);
 
   return {
+    // Legacy read surface stays `transcript` but now equals canonical
     transcript: canonicalTranscript,
+
+    // New v1 fields
     rawTranscript,
     canonicalTranscript,
     transcriptHash: sha256Hex(canonicalTranscript),
@@ -109,6 +128,10 @@ const entrySchema = new Schema(
       maxlength: 200,
     },
 
+    /**
+     * Legacy transcript surface.
+     * Canonicalization v1 writes canonical text here to keep old readers stable.
+     */
     transcript: {
       type: String,
       trim: true,
@@ -389,8 +412,8 @@ entrySchema.index({ title: 'text', transcript: 'text' });
  * Default: exclude soft-deleted docs for find* queries.
  *
  * Opt-out:
- *  1) Query option: Entry.find(...).setOptions({ includeDeleted: true })
- *  2) Query sentinel: Entry.find({ includeDeleted: true }) // legacy-friendly
+ * 1) Query option: Entry.find(...).setOptions({ includeDeleted: true })
+ * 2) Query sentinel: Entry.find({ includeDeleted: true }) // legacy-friendly
  */
 function excludeDeletedQuery(next) {
   const opts = (typeof this.getOptions === 'function' && this.getOptions()) || {};
@@ -436,8 +459,7 @@ entrySchema.pre('aggregate', function excludeDeletedAggregate(next) {
     Array.isArray(pipeline) &&
     pipeline.some(
       (stage) =>
-        stage?.$match &&
-        Object.prototype.hasOwnProperty.call(stage.$match, 'isDeleted')
+        stage?.$match && Object.prototype.hasOwnProperty.call(stage.$match, 'isDeleted')
     );
 
   if (alreadyMatched) return next();
@@ -553,4 +575,4 @@ entrySchema.methods.setTranscriptState = async function setTranscriptState({
 // Prevent OverwriteModelError in watch/test environments.
 const Entry = mongoose.models.Entry || mongoose.model('Entry', entrySchema);
 
-export default Entry;
+export default Entry;
\ No newline at end of file
diff --git a/server/models/__tests__/Entry.test.js b/server/models/__tests__/Entry.test.js
index eeb0d474..e2987583 100644
--- a/server/models/__tests__/Entry.test.js
+++ b/server/models/__tests__/Entry.test.js
@@ -1,3 +1,5 @@
+// File: /server/models/__tests__/Entry.test.js
+
 import { describe, expect, it } from 'vitest';
 
 import Entry, {
@@ -19,36 +21,44 @@ describe('Entry emotionalState schema', () => {
     const coreNeedEnum = Entry.schema.path('emotionalState.coreNeed').enumValues;
 
     expect(appraisalEnum).toEqual(
-      expect.arrayContaining(['LOSS', 'THREAT', 'VIOLATION', 'CHALLENGE']),
+      expect.arrayContaining(['LOSS', 'THREAT', 'VIOLATION', 'CHALLENGE'])
     );
     expect(coreNeedEnum).toEqual(
-      expect.arrayContaining(['SAFETY', 'CONNECTION', 'AUTONOMY', 'COMPETENCE']),
+      expect.arrayContaining(['SAFETY', 'CONNECTION', 'AUTONOMY', 'COMPETENCE'])
     );
   });
 });
 
 describe('transcript canonicalization v1', () => {
   it('normalizes corpus variants deterministically and idempotently', () => {
-    const corpus = '\uFEFF\u0000 “Curly”\r\nline—two\rline…\u2018ok\u2019 \u0000';
+    // Includes:
+    // - BOM + NUL stripping
+    // - smart quotes/dashes/ellipsis folding
+    // - CRLF/CR to LF normalization
+    // - end-of-line and boundary whitespace removal (tabs + multi-spaces) while preserving newlines
+    const corpus =
+      '\uFEFF\u0000 “Curly”\t\t\r\nline—two  \rline…\u2018ok\u2019 \t \u0000';
 
     const canonical = canonicalizeTranscriptV1(corpus);
 
+    // End-of-line and boundary tabs/spaces are removed entirely; newlines are preserved.
     expect(canonical).toBe('"Curly"\nline-two\nline...\'ok\'');
     expect(canonicalizeTranscriptV1(canonical)).toBe(canonical);
   });
 
   it('builds stable hash payloads from canonical transcript only', () => {
-    const payload = buildCanonicalTranscriptPayload('A\r\nB…');
+    const payload = buildCanonicalTranscriptPayload('A\t \r\nB…');
 
     expect(payload).toMatchObject({
       transcript: 'A\nB...',
-      rawTranscript: 'A\r\nB…',
+      rawTranscript: 'A\t \r\nB…',
       canonicalTranscript: 'A\nB...',
       canonicalizationVersion: TRANSCRIPT_CANONICALIZATION_VERSION_V1,
     });
 
+    // Hash must be derived from canonicalTranscript only (never rawTranscript).
     expect(payload.transcriptHash).toBe(sha256Hex(payload.canonicalTranscript));
     expect(payload.transcriptHash).toBe(sha256Hex('A\nB...'));
     expect(payload.transcriptHash).not.toBe(sha256Hex(payload.rawTranscript));
   });
-});
+});
\ No newline at end of file