Merged
68 changes: 39 additions & 29 deletions codex/tasks/latest.json
Original file line number Diff line number Diff line change
@@ -1,45 +1,55 @@
{
"task_id": "phase0-spine-lockdown-2026-02-19",
"title": "Phase 0 Spine Lockdown: freeze contract vocab, kill ambiguous receipt offsets, harden emission + narrative policy",
"task_id": "phase1_ingest_canonicalization_2026_02_20",
"title": "Phase 1: Canonicalize transcript at ingest + stable transcriptHash",
"summary": "Implement ingest-time transcript canonicalization (NFKC + punctuation folding + line-ending normalization + BOM/null stripping) with versioning. Store rawTranscript + canonicalTranscript + transcriptHash + canonicalizationVersion on Entry for all write paths (upload route + GraphQL addEntry/updateEntry + any other transcript writers). Add deterministic tests for the canonicalization corpus. Do not bulk-migrate existing entries; freeze legacy entries at canonicalizationVersion=0/null and only apply v1 on new/updated transcripts going forward.",
"base_branch": "develop",
"branch_name": "codex/phase0-spine-lockdown-exec-2026-02-19",
"summary": "Seal the Meaning Spine by freezing contract reason codes, enforcing unique-match offset inference (ambiguity=poison), hardening validateReceipt (strict V1 never falls through), ensuring ENTRY_ANALYZED emits contract+sanitized cards only (no raw reflection text), and locking narrative toggle behind a shared policy utility that callers cannot override. Add/adjust regression tests to prevent drift.",
"branch_name": "codex/implement-transcript-canonicalization-at-ingest",
"repo_scope": [
"codex/tasks/latest.json",
"server/models/Entry.js",
"server/routes/upload.js",
"server/graphql/resolvers/index.js",
"server/src/workers/scribe.worker.js",
"server/src/workers/reflection.worker.js",
"server/src/utils/truthValidator.js",
"server/src/utils/**",
"server/src/workers/__tests__/**",
"server/utils/**",
"server/models/__tests__/**",
"server/src/**/__tests__/**",
"server/tests/**",
"docs/testing-doctrine.md"
"server/__tests__/**",
"server/routes/__tests__/**",
"scripts/codex_preflight.mjs",
"codex/tasks/latest.json"
],
"agents_involved": ["codex_web"],
"risk_level": "low",
"agents_involved": ["codex-web"],
"risk_level": "medium",
"tests_to_run": [
"node -e \"JSON.parse(require('fs').readFileSync('codex/tasks/latest.json','utf8')); console.log('latest.json ok')\"",
"node scripts/codex_preflight.mjs --ci",
"pnpm -C server test"
],
"constraints": [
"CODEX_WEB: Do NOT run git network commands (no git fetch/pull/push/clone). Use the UI “Create PR” button if a PR is needed.",
"CODEX_WEB_HEAD: In Codex Web, the checked-out branch name may be 'work'. Do NOT treat HEAD name mismatch as stale. Locks+canary are the source of truth.",
"ANTI-COP-OUT: No diff => no PR. If no actionable work exists, stop and report evidence.",
"SCOPE: Do not modify files outside repo_scope. If out-of-scope issues are found, produce a Repair Manifest instead of changing them.",
"ALIGNMENT: Print task_id/base_branch/branch_name/canary from latest.json before doing any work.",
"EVIDENCE_BUNDLE: Provide evidence in 4 phases: Alignment, Work-Exists Gate, Change Proof, Tests.",
"PR_BASE: Ensure PR base branch is develop (not another codex/* branch). Do not create draft PRs.",
"NO_PLACEHOLDERS: Do not create empty directories or placeholder files. Only create files with real content and tests.",
"NO_NETWORK: Tests must not touch real external network services."
"Codex Web environment: do NOT run git push; use the Create PR button.",
"Do NOT create placeholder files or empty directories. If no diff is needed, stop and report; do not create a PR.",
"All changes must remain within repo_scope. If a necessary fix is out-of-scope, produce a Repair Manifest instead of changing it.",
"Canonicalization happens at ingest/write time only (identity). Do not re-canonicalize during validation except legacy v0 fallback.",
"Do NOT bulk-migrate existing stored transcripts. Implement freeze+version: legacy entries are v0/null; new writes become v1.",
"Hashing must be based on canonicalTranscript and must NOT use locale-sensitive casefolding (no toLowerCase/toUpperCase on hash inputs).",
"No raw user transcript content may be logged or emitted into events as part of this change."
],
"acceptance_checks": [
"Alignment Evidence: show codex/tasks/latest.json values for task_id, base_branch, branch_name, and canary.",
"Alignment Evidence: print `git rev-parse --abbrev-ref HEAD` and `git rev-parse HEAD` for evidence; do NOT stop on SHA mismatch.",
"Work-Exists Gate: prove target symbols exist via grep or file navigation; if not found, stop and report: findReceiptOffsets (or equivalent), emitEntryAnalyzed callsite/payload, sanitizeBloomCardsWithContract boundary, validateReceipt in server/src/utils/truthValidator.js (or its imported helpers).",
"Freeze contract reason codes: add a shared constants module and replace raw string comparisons/assignments in Meaning Spine paths touched by this task.",
"Unique Match Rule: any transcript-search offset inference must return null on ambiguous multi-occurrence matches (firstIndex !== lastIndex). Ambiguity must drop the receipt/card safely and be reflected in contract/dropped reasons.",
"validateReceipt hardening: strict V1 path must not fall through to weaker matching if offsets fail; invalid shapes return explicit failure reasons and do not throw.",
"Emission hardening: ENTRY_ANALYZED payload must contain sanitized cards AND the Meaning Contract ledger; payload must not include raw reflection text anywhere.",
"Tests: add/adjust regression tests that fail if raw model output leaks into emission serialization; add/adjust tests verifying ambiguous quote matches are dropped.",
"Proof: include git status -sb and git diff --stat after changes; run tests_to_run and report results. (Run `pnpm -w test` locally after PR if desired.)"
"Alignment Evidence: print task_id, base_branch, branch_name, repo_scope, tests_to_run at start of run.",
"Work-Exists Gate: identify all transcript write paths (upload.js, GraphQL addEntry/updateEntry, scribe worker transcript persistence) and show exact files/lines to be changed.",
"Implement a single ingest canonicalization function (v1) using NFKC + punctuation folding + newline normalization + BOM/null stripping + internal whitespace folding (preserve newlines) + trim; store canonicalizationVersion='1'.",
"Entry stores rawTranscript (untouched) and canonicalTranscript (canonicalized). transcriptHash is sha256(canonicalTranscript).",
"All transcript-writing paths set/update canonical fields consistently when transcript changes.",
"Add/extend deterministic tests covering: smart quotes folding, dash folding, ellipsis folding, CRLF/CR normalization, BOM/null stripping, internal whitespace folding (tabs/multi-spaces without breaking newlines), and idempotency (canon(canon(x))==canon(x)).",
"Run tests_to_run and show outputs. If any test is skipped, explain why and provide a safe alternative.",
"Change Proof: show git status -sb and git diff --stat at end. No diff => no PR."
],
"canary": "CANARY_PHASE0_SPINE_LOCKDOWN_2026_02_19"
"locks": {
"task_id": "phase1_ingest_canonicalization_2026_02_20",
"base_branch": "develop",
"branch_name": "codex/implement-transcript-canonicalization-at-ingest",
"canary": "PHASE1_INGEST_CANON_V1_CANARY_2026_02_20"
}
}
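The v1 pipeline the task summary enumerates (BOM/NUL stripping → NFKC → typography folding → newline normalization → intra-line whitespace folding → trim) can be sketched as a standalone Node function. This is an illustrative sketch mirroring the regexes in the `Entry.js` diff, not the merged implementation:

```javascript
import { createHash } from 'node:crypto';

// Sketch of canonicalization v1. Step order matters: NFKC runs before
// typography folding so compatibility variants collapse first, and newline
// normalization runs before the whitespace fold so CR/CRLF never survive
// into the "preserve newlines" step.
function canonicalizeV1(input) {
  if (typeof input !== 'string') return '';
  return input
    .replace(/\uFEFF/g, '')                                        // strip BOM
    .replace(/\u0000/g, '')                                        // strip NULs
    .normalize('NFKC')                                             // compatibility normalize
    .replace(/[\u201C\u201D\u201E\u201F\u2033\u2036]/g, '"')       // double smart quotes -> "
    .replace(/[\u2018\u2019\u201A\u201B\u2032\u2035]/g, "'")       // single smart quotes -> '
    .replace(/[\u2010\u2011\u2012\u2013\u2014\u2015\u2212]/g, '-') // dash variants -> -
    .replace(/\u2026/g, '...')                                     // ellipsis (NFKC may already expand it)
    .replace(/\r\n?/g, '\n')                                       // CRLF / lone CR -> LF
    .replace(/[^\S\n]+/g, ' ')                                     // fold intra-line whitespace, keep newlines
    .replace(/[^\S\n]+\n/g, '\n')                                  // drop trailing spaces before newlines
    .trim();
}

// Per the task constraints, the hash is derived from the canonical form
// only, never the raw input, and uses no locale-sensitive casefolding.
function transcriptHashOf(canonical) {
  return createHash('sha256').update(canonical, 'utf8').digest('hex');
}
```

Idempotency — `canonicalizeV1(canonicalizeV1(x)) === canonicalizeV1(x)` — is the property the test corpus in this PR checks; any rule added to the chain must preserve it.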
7 changes: 7 additions & 0 deletions server/__tests__/upload.demo.test.js
@@ -14,6 +14,13 @@ vi.mock('../models/Entry.js', () => ({
_id: 'demo-entry-id',
})),
},
buildCanonicalTranscriptPayload: (value) => ({
transcript: typeof value === 'string' ? value : '',
rawTranscript: typeof value === 'string' ? value : '',
canonicalTranscript: typeof value === 'string' ? value : '',
transcriptHash: 'mock-hash',
canonicalizationVersion: '1',
}),
}));

vi.mock('../src/orchestration/agentOrchestration.js', () => ({
29 changes: 19 additions & 10 deletions server/graphql/resolvers/index.js
Expand Up @@ -2,7 +2,7 @@

import jwt from 'jsonwebtoken';

import Entry from '../../models/Entry.js';
import Entry, { buildCanonicalTranscriptPayload } from '../../models/Entry.js';
import TagModel from '../../models/Tag.js';
import DuelSession from '../../models/DuelSession.js';
import User from '../../models/User.js';
@@ -473,7 +473,6 @@ const resolvers = {

const ModelToUse = ContextEntry || Entry;

const aiTags = await inferTagsForEntry(transcript);

const processedTags = (tags || [])
.map((t) => {
@@ -483,12 +482,6 @@
})
.filter(Boolean);

for (const t of aiTags) {
if (!processedTags.find((pt) => pt.label === t)) {
processedTags.push({ label: t, source: 'ai' });
}
}

const rawAudioUrl = typeof audioUrl === 'string' ? audioUrl.trim() : '';
const hasAudio = rawAudioUrl !== '';
const duration = Number.isFinite(audioDurationSeconds) ? audioDurationSeconds : null;
@@ -503,10 +496,19 @@

const shouldPersistUrl = hasAudio && !hasS3Bucket && !isPresignedUrl(rawAudioUrl);

const transcriptPayload = buildCanonicalTranscriptPayload(transcript);
const aiTags = await inferTagsForEntry(transcriptPayload.transcript);

for (const t of aiTags) {
if (!processedTags.find((pt) => pt.label === t)) {
processedTags.push({ label: t, source: 'ai' });
}
}

const entry = await ModelToUse.create({
userId,
title,
transcript,
...transcriptPayload,

// Persist audioUrl ONLY when it is stable (local path or non-S3 remote without presign).
audioUrl: shouldPersistUrl ? rawAudioUrl : undefined,
@@ -562,7 +564,14 @@ const resolvers = {
entry.version += 1;

if (title !== undefined) entry.title = title;
if (transcript !== undefined) entry.transcript = transcript;
if (transcript !== undefined) {
const transcriptPayload = buildCanonicalTranscriptPayload(transcript);
entry.transcript = transcriptPayload.transcript;
entry.rawTranscript = transcriptPayload.rawTranscript;
entry.canonicalTranscript = transcriptPayload.canonicalTranscript;
entry.transcriptHash = transcriptPayload.transcriptHash;
entry.canonicalizationVersion = transcriptPayload.canonicalizationVersion;
}

if (tags !== undefined) {
entry.tags = tags
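Since the payload builder returns exactly the five fields this `updateEntry` branch copies, the field-by-field assignment could be collapsed into one `Object.assign`. A sketch against a plain object — `buildCanonicalTranscriptPayload` is stubbed with simplified folding, and `entry` is a hypothetical plain document, not a Mongoose instance:

```javascript
// Simplified stand-in for the real builder (the real code does NFKC,
// typography folding, and sha256 hashing; this stub only normalizes
// newlines and trims).
function buildCanonicalTranscriptPayload(raw) {
  const input = typeof raw === 'string' ? raw : '';
  const canonical = input.replace(/\r\n?/g, '\n').trim();
  return {
    transcript: canonical,
    rawTranscript: input,
    canonicalTranscript: canonical,
    transcriptHash: `stub-hash:${canonical.length}`, // placeholder, not sha256
    canonicalizationVersion: '1',
  };
}

function applyTranscriptUpdate(entry, transcript) {
  if (transcript === undefined) return entry;
  // One assignment keeps the five canonical fields from drifting apart
  // if a sixth field is ever added to the payload.
  return Object.assign(entry, buildCanonicalTranscriptPayload(transcript));
}
```

The explicit per-field version in the diff is arguably safer — it cannot accidentally spread unexpected keys onto the document — at the cost of wiring any new payload field by hand.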
102 changes: 96 additions & 6 deletions server/models/Entry.js
@@ -1,6 +1,7 @@
// File: /server/models/Entry.js

import mongoose from 'mongoose';
import { createHash } from 'node:crypto';
import reflectionSchema from './subschemas/Reflection.js';
import tagSchema from './subschemas/Tag.js';

@@ -17,6 +18,62 @@ const EMOTION_VALUES = [
];

const DUEL_OUTCOME_VALUES = ['TRANSMUTED', 'LIBERATED', 'STABILIZED', 'FALTERED'];
export const TRANSCRIPT_CANONICALIZATION_VERSION_V1 = '1';

// NOTE: Canonicalization v1 is intended to stabilize voice transcripts across:
// - OS newline differences
// - Unicode composition / compatibility variants (NFKC)
// - common typography artifacts (smart quotes/dashes/ellipsis)
// - transcription spacing artifacts (tabs / double-spaces)
// It MUST preserve newlines, but should fold intra-line whitespace to single spaces.
const DOUBLE_SMART_QUOTES_REGEX = /[\u201C\u201D\u201E\u201F\u2033\u2036]/g;
const SINGLE_SMART_QUOTES_REGEX = /[\u2018\u2019\u201A\u201B\u2032\u2035]/g;
const DASH_VARIANTS_REGEX = /[\u2010\u2011\u2012\u2013\u2014\u2015\u2212]/g;
const INTERNAL_WHITESPACE_EXCEPT_NEWLINE_REGEX = /[^\S\n]+/g;

export function canonicalizeTranscriptV1(input) {
if (typeof input !== 'string') return '';

return input
// Remove BOM + NULs (rare, but show up in some pipelines)
.replace(/\uFEFF/g, '')
.replace(/\u0000/g, '')
// Compatibility normalize to reduce variant churn in transcripts
.normalize('NFKC')
// Typography folding
.replace(DOUBLE_SMART_QUOTES_REGEX, '"')
.replace(SINGLE_SMART_QUOTES_REGEX, "'")
.replace(DASH_VARIANTS_REGEX, '-')
.replace(/\u2026/g, '...')
// Newline normalization
.replace(/\r\n?/g, '\n')
// ✅ Critical: fold internal whitespace runs (tabs/multi-spaces), preserve newlines
.replace(INTERNAL_WHITESPACE_EXCEPT_NEWLINE_REGEX, ' ')
// ✅ remove whitespace at end-of-line
.replace(/[^\S\n]+\n/g, '\n')
// Boundary trim
.trim();
}

export function sha256Hex(input) {
return createHash('sha256').update(String(input || ''), 'utf8').digest('hex');
}

export function buildCanonicalTranscriptPayload(transcriptInput) {
const rawTranscript = typeof transcriptInput === 'string' ? transcriptInput : '';
const canonicalTranscript = canonicalizeTranscriptV1(rawTranscript);

return {
// Legacy read surface stays `transcript` but now equals canonical
transcript: canonicalTranscript,

// New v1 fields
rawTranscript,
canonicalTranscript,
transcriptHash: sha256Hex(canonicalTranscript),
canonicalizationVersion: TRANSCRIPT_CANONICALIZATION_VERSION_V1,
};
}

const duelHistorySchema = new Schema(
{
@@ -71,12 +128,41 @@ const entrySchema = new Schema(
maxlength: 200,
},

/**
* Legacy transcript surface.
* Canonicalization v1 writes canonical text here to keep old readers stable.
*/
transcript: {
type: String,
trim: true,
default: '',
},

/**
* Canonicalization v1 stores both source text and a deterministic normalized view.
* Hashing source-of-truth is canonicalTranscript (not rawTranscript).
*/
rawTranscript: {
type: String,
default: '',
},

canonicalTranscript: {
type: String,
default: '',
},

transcriptHash: {
type: String,
trim: true,
default: '',
},

canonicalizationVersion: {
type: String,
default: null,
},

/**
* Transcript lifecycle state (worker-friendly).
* Keep enum small and stable.
@@ -326,8 +412,8 @@ entrySchema.index({ title: 'text', transcript: 'text' });
* Default: exclude soft-deleted docs for find* queries.
*
* Opt-out:
* 1) Query option: Entry.find(...).setOptions({ includeDeleted: true })
* 2) Query sentinel: Entry.find({ includeDeleted: true }) // legacy-friendly
* 1) Query option: Entry.find(...).setOptions({ includeDeleted: true })
* 2) Query sentinel: Entry.find({ includeDeleted: true }) // legacy-friendly
*/
function excludeDeletedQuery(next) {
const opts = (typeof this.getOptions === 'function' && this.getOptions()) || {};
@@ -373,8 +459,7 @@ entrySchema.pre('aggregate', function excludeDeletedAggregate(next) {
Array.isArray(pipeline) &&
pipeline.some(
(stage) =>
stage?.$match &&
Object.prototype.hasOwnProperty.call(stage.$match, 'isDeleted')
stage?.$match && Object.prototype.hasOwnProperty.call(stage.$match, 'isDeleted')
);

if (alreadyMatched) return next();
@@ -458,7 +543,12 @@ entrySchema.methods.setTranscriptState = async function setTranscriptState({
if (status) this.transcriptStatus = status;

if (typeof transcript === 'string') {
this.transcript = transcript;
const canonicalPayload = buildCanonicalTranscriptPayload(transcript);
this.transcript = canonicalPayload.transcript;
this.rawTranscript = canonicalPayload.rawTranscript;
this.canonicalTranscript = canonicalPayload.canonicalTranscript;
this.transcriptHash = canonicalPayload.transcriptHash;
this.canonicalizationVersion = canonicalPayload.canonicalizationVersion;
if (!status) this.transcriptStatus = 'transcript_ready';
}

@@ -485,4 +575,4 @@
// Prevent OverwriteModelError in watch/test environments.
const Entry = mongoose.models.Entry || mongoose.model('Entry', entrySchema);

export default Entry;
export default Entry;
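The property these new fields exist to provide — different raw captures of the same utterance collapsing to one `transcriptHash` — can be demonstrated with a compact restatement of `canonicalizeTranscriptV1` and `sha256Hex` (an inline sketch mirroring the diff above, not an import of the real module):

```javascript
import { createHash } from 'node:crypto';

// Abbreviated canonicalizer: same step order as the model, smaller regex sets.
const canon = (s) => s
  .replace(/[\uFEFF\u0000]/g, '')
  .normalize('NFKC')
  .replace(/[\u201C\u201D]/g, '"')
  .replace(/[\u2018\u2019]/g, "'")
  .replace(/[\u2013\u2014]/g, '-')
  .replace(/\r\n?/g, '\n')
  .replace(/[^\S\n]+/g, ' ')  // fold tabs/thin-space runs, preserve '\n'
  .replace(/[^\S\n]+\n/g, '\n')
  .trim();

const sha256Hex = (s) => createHash('sha256').update(s, 'utf8').digest('hex');

// Two captures of the same speech, differing only in typography, spacing,
// and line endings:
const fromDictation = '\u201CHello there\u201D \u2014 okay\r';
const fromWebForm = '"Hello there"\u2009-\u2009okay\n'; // U+2009 thin spaces, ASCII quotes

// Both collapse to the same canonical text, so hashes built on the canonical
// form agree even though hashes of the raw strings would not.
```

In the model itself the same guarantee flows from `transcriptHash = sha256Hex(canonicalTranscript)`; hashing `rawTranscript` instead would silently break it.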
47 changes: 44 additions & 3 deletions server/models/__tests__/Entry.test.js
@@ -1,6 +1,13 @@
// File: /server/models/__tests__/Entry.test.js

import { describe, expect, it } from 'vitest';

import Entry from '../Entry.js';
import Entry, {
buildCanonicalTranscriptPayload,
canonicalizeTranscriptV1,
sha256Hex,
TRANSCRIPT_CANONICALIZATION_VERSION_V1,
} from '../Entry.js';

describe('Entry emotionalState schema', () => {
it('includes appraisal/coreNeed enums with safe defaults', () => {
@@ -14,10 +21,44 @@ describe('Entry emotionalState schema', () => {
const coreNeedEnum = Entry.schema.path('emotionalState.coreNeed').enumValues;

expect(appraisalEnum).toEqual(
expect.arrayContaining(['LOSS', 'THREAT', 'VIOLATION', 'CHALLENGE']),
expect.arrayContaining(['LOSS', 'THREAT', 'VIOLATION', 'CHALLENGE'])
);
expect(coreNeedEnum).toEqual(
expect.arrayContaining(['SAFETY', 'CONNECTION', 'AUTONOMY', 'COMPETENCE']),
expect.arrayContaining(['SAFETY', 'CONNECTION', 'AUTONOMY', 'COMPETENCE'])
);
});
});

describe('transcript canonicalization v1', () => {
it('normalizes corpus variants deterministically and idempotently', () => {
// Includes:
// - BOM + NUL stripping
// - smart quotes/dashes/ellipsis folding
// - CRLF/CR to LF normalization
// - internal whitespace folding (tabs + multi-spaces) while preserving newlines
const corpus =
'\uFEFF\u0000 “Curly”\t\t\r\nline—two \rline…\u2018ok\u2019 \t \u0000';

const canonical = canonicalizeTranscriptV1(corpus);

// Tabs and multi-spaces collapse to single spaces; newlines are preserved.
expect(canonical).toBe('"Curly"\nline-two\nline...\'ok\'');
expect(canonicalizeTranscriptV1(canonical)).toBe(canonical);
});

it('builds stable hash payloads from canonical transcript only', () => {
const payload = buildCanonicalTranscriptPayload('A\t \r\nB…');

expect(payload).toMatchObject({
transcript: 'A\nB...',
rawTranscript: 'A\t \r\nB…',
canonicalTranscript: 'A\nB...',
canonicalizationVersion: TRANSCRIPT_CANONICALIZATION_VERSION_V1,
});

// Hash must be derived from canonicalTranscript only (never rawTranscript).
expect(payload.transcriptHash).toBe(sha256Hex(payload.canonicalTranscript));
expect(payload.transcriptHash).toBe(sha256Hex('A\nB...'));
expect(payload.transcriptHash).not.toBe(sha256Hex(payload.rawTranscript));
});
});
7 changes: 7 additions & 0 deletions server/routes/__tests__/upload.test.js
@@ -29,6 +29,13 @@ vi.mock('../../models/Entry.js', () => ({
_id: 'entry-mock-id',
})),
},
buildCanonicalTranscriptPayload: (value) => ({
transcript: typeof value === 'string' ? value : '',
rawTranscript: typeof value === 'string' ? value : '',
canonicalTranscript: typeof value === 'string' ? value : '',
transcriptHash: 'mock-hash',
canonicalizationVersion: '1',
}),
}));
vi.mock('../../src/orchestration/agentOrchestration.js', () => ({
createScribeTask: vi.fn(async () => ({