From 611b94b5efa20343f79a29ea3641f7618fef049a Mon Sep 17 00:00:00 2001
From: Nick <58236636+EngineerNV@users.noreply.github.com>
Date: Thu, 30 Oct 2025 00:50:40 -0700
Subject: [PATCH 1/2] Switch ingestion to token-based chunks and add sample
 corpus

---
 .gitignore                      |  1 +
 Agent.MD                        |  5 +-
 README.md                       |  3 +-
 "data/corpus/Pok\303\251mon.MD" | 28 +++++++++++
 scripts/00_ingest.py            | 86 +++++++++++++++++++++++++++------
 5 files changed, 105 insertions(+), 18 deletions(-)
 create mode 100644 "data/corpus/Pok\303\251mon.MD"

diff --git a/.gitignore b/.gitignore
index 6447af5..623791f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,5 +41,6 @@ data/chunks/
 data/corpus/*
 !data/corpus/README.md
 !data/corpus/.gitkeep
+!data/corpus/Pokémon.MD
 data/chroma/*
 !data/chroma/.gitkeep
diff --git a/Agent.MD b/Agent.MD
index 0174781..c2903a9 100644
--- a/Agent.MD
+++ b/Agent.MD
@@ -8,8 +8,9 @@ to the whole repo (there are no nested `AGENTS.md` files).
 
 The numbered scripts under `scripts/` are fully implemented and runnable:
 
-1. `00_ingest.py` – loads markdown from `data/corpus/`, splits by headers, and
-   prints a preview of the resulting LangChain `Document` chunks.
+1. `00_ingest.py` – loads markdown files from `data/corpus/`, splits them into
+   token-aware chunks with overlap, and prints a preview of the resulting
+   LangChain `Document` objects.
 2. `01_build_index.py` – embeds the chunks with MiniLM via
    `HuggingFaceEmbeddings`, replaces `data/chroma/`, and prints a build summary
    (doc count, collection name, metadata keys).
diff --git a/README.md b/README.md
index b837e53..1930805 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@
    ```
 2. **Configure secrets:** copy `.env.example` to `.env` and populate API keys (`OPENAI_API_KEY` for `02_query.py`, `RAG_LLM_API_KEY` for `04_llm_api.py`).
 3. **Add source material:** drop markdown files into `data/corpus/` (the `corpus/` and `chroma/` directories are created for you).
+   - A sample knowledge base, `Pokémon.MD`, is included so you can immediately test ingestion and retrieval behaviour.
 4. **Run the pipeline:**
    ```bash
    python scripts/00_ingest.py            # inspect chunking
@@ -31,7 +32,7 @@ The query script offers three modes:
 
 | Path | Purpose |
 |------|---------|
-| `scripts/00_ingest.py` | Loads markdown files from `data/corpus/`, splits them by headers, and previews the resulting `Document` chunks. |
+| `scripts/00_ingest.py` | Loads markdown files from `data/corpus/`, splits them into token-sized chunks with overlap, and previews the resulting `Document` objects. |
 | `scripts/01_build_index.py` | Embeds the ingested chunks with `HuggingFaceEmbeddings`, rebuilds `data/chroma/`, and prints a build summary. |
 | `scripts/02_query.py` | Connects to the persisted Chroma store and exposes the retrieval CLI described above. |
 | `scripts/03_eval.py` | Scores saved question/answer/context rows with lexical heuristics and prints aggregate metrics. |
diff --git "a/data/corpus/Pok\303\251mon.MD" "b/data/corpus/Pok\303\251mon.MD"
new file mode 100644
index 0000000..8703a6a
--- /dev/null
+++ "b/data/corpus/Pok\303\251mon.MD"
@@ -0,0 +1,28 @@
+# Paldea Battle Field Guide
+
+## Starter Pokémon Highlights
+- **Sprigatito** is a Grass-type feline Pokémon whose signature move, **Flower Trick**, always lands a critical hit and never misses.
+- **Fuecoco** is a Fire-type crocodile that evolves into a special-attacking powerhouse; it loves to snack between battles.
+- **Quaxly** is a Water-type duck with excellent physical attack growth and access to the priority move **Aqua Jet**.
+
+## Titan Strategies
+1. **Klawf, the Stony Cliff Titan** is weak to Water, Grass, Fighting, Ground, and Steel moves; Terastallized Sprigatito can finish the fight quickly.
+2. **Bombirdier, the Open Sky Titan** takes double damage from Electric attacks because its Flying typing is weak to them (its Dark typing is neutral).
+3. **Orthworm, the Lurking Steel Titan** resists Dragon moves but is extremely vulnerable to Fire- and Fighting-type attacks.
+
+## Gym Puzzle Reminders
+- **Cortondo Gym (Bug)**: Roll a giant olive through the maze, then use Rock-type attacks against Katy's team.
+- **Levincia Gym (Electric)**: Complete Iono's livestream hide-and-seek challenge; bring Ground types to ignore her Pokémon's STAB moves.
+- **Cascarrafa Gym (Water)**: Beat Kofu's bidding contest before challenging him; Grass attacks are safest because his team covers Electric threats.
+
+## Legendary Research Notes
+- The quartet **Wo-Chien, Chien-Pao, Ting-Lu,** and **Chi-Yu** are sealed behind ominous stakes scattered across Paldea. Pulling every stake of a given color unlocks the corresponding shrine.
+- **Koraidon** favors physical attacks and unlocks more traversal abilities as the story progresses, while **Miraidon** specializes in special attacks and synergizes with Electric Terrain.
+- Terastal energy crystals respond best to moves matching a Pokémon's Tera Type, granting a significant power boost for that move type.
+
+## Elite Four Quick Facts
+- **Rika** leads with Ground types; Water or Grass Pokémon can withstand her Earth Power spam.
+- **Poppy** relies on Steel types; Fire-type Terastalization cuts through their defenses.
+- **Larry** returns with Flying types—Electric and Ice moves are your safest bets.
+- **Hassel** uses Dragon types, so bring Fairy, Ice, or Dragon coverage to wrap the battle up.
+
diff --git a/scripts/00_ingest.py b/scripts/00_ingest.py
index 99cde99..42f3c79 100644
--- a/scripts/00_ingest.py
+++ b/scripts/00_ingest.py
@@ -8,9 +8,9 @@
 What the script already provides
 --------------------------------
 * Reads markdown-like files from ``data/corpus/`` (configurable via ``CORPUS_DIR``).
-* Splits documents into sections keyed by headers (#, ##, ###) using
-  ``MarkdownHeaderTextSplitter``.
-* Returns ``List[Document]`` objects with header metadata intact—ready for embedding.
+* Splits documents into token-budgeted sections with sensible overlap so chunks align
+  with embedding context windows.
+* Returns ``List[Document]`` objects annotated with source metadata, ready for embedding.
 
 Learner guidance
 ----------------
@@ -21,15 +21,18 @@
 """
 
 import os  # Handle filesystem navigation for the corpus directory
-from typing import List  # Describe the list of LangChain Document objects returned
+from typing import Iterable, List  # Describe the list of LangChain Document objects returned
 
-from langchain_text_splitters import (  # Split markdown files into header-aware chunks
-    MarkdownHeaderTextSplitter,
-)
+from langchain_core.documents import Document  # Represent individual text chunks with metadata
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 
 # ---------------------------- Config ----------------------------
 CORPUS_DIR = os.environ.get("CORPUS_DIR", os.path.join("data", "corpus"))
-HEADER_LEVELS = ["#", "##", "###"]  # order matters
+
+# Configure chunking in token units so the ingestion step aligns with embedding models.
+DEFAULT_TIKTOKEN_MODEL = os.environ.get("INGEST_TIKTOKEN_MODEL", "text-embedding-3-small")
+CHUNK_SIZE_TOKENS = int(os.environ.get("INGEST_CHUNK_SIZE", "400"))
+CHUNK_OVERLAP_TOKENS = int(os.environ.get("INGEST_CHUNK_OVERLAP", "80"))
 
 
 # ---------------------------- Helpers ----------------------------
@@ -43,17 +46,66 @@ def read_text(path: str) -> str:
         return f.read()
 
 
-def split_markdown(markdown_text: str):
-    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[(h, h) for h in HEADER_LEVELS])
-    return splitter.split_text(markdown_text)
+def _approximate_token_length(text: str) -> int:
+    """Fallback token estimate when a real tokenizer is unavailable."""
+
+    return max(1, len(text.split()))
+
+
+def _build_splitter() -> RecursiveCharacterTextSplitter:
+    """Return a token-aware text splitter with a sensible amount of overlap."""
+
+    # Guard against accidental misconfiguration that sets overlap >= chunk size.
+    overlap = max(0, min(CHUNK_OVERLAP_TOKENS, max(CHUNK_SIZE_TOKENS - 1, 0)))
+    chunk_size = max(1, CHUNK_SIZE_TOKENS)
+
+    try:
+        return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+            model_name=DEFAULT_TIKTOKEN_MODEL,
+            chunk_size=chunk_size,
+            chunk_overlap=overlap,
+        )
+    except Exception:  # pragma: no cover - network restricted environments fall back here
+        return RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=overlap,
+            length_function=_approximate_token_length,
+        )
+
+
+_SPLITTER = _build_splitter()
+
+
+def split_markdown(markdown_text: str, *, source_path: str) -> List[Document]:
+    """Split text into token-bounded chunks while retaining document metadata."""
+
+    base_metadata = {
+        "source": os.path.basename(source_path),
+        "source_path": source_path,
+    }
+    documents = _SPLITTER.create_documents(
+        [markdown_text],
+        metadatas=[base_metadata],
+    )
+
+    total_chunks = len(documents)
+    for idx, doc in enumerate(documents):
+        metadata = getattr(doc, "metadata", {}) or {}
+        metadata.update({
+            "chunk_index": idx,
+            "chunk_count": total_chunks,
+        })
+        doc.metadata = metadata
+
+    return documents
 
 
 # (Persistence helpers removed – not needed now.)
 
 
 # ---------------------------- Core processing ----------------------------
-def process_file(path: str):
-    return split_markdown(read_text(path))
+def process_file(path: str) -> Iterable[Document]:
+    return split_markdown(read_text(path), source_path=path)
 
 
 def ingest() -> List:
@@ -72,8 +124,12 @@ def ingest() -> List:
 
 def preview(docs, n: int = 3):
     for i, d in enumerate(docs[:n]):
-        headers = [d.metadata.get(h) for h in HEADER_LEVELS if d.metadata.get(h)]
-        print(f"[{i}] {' > '.join(headers) if headers else '<no header>'}")
+        metadata = getattr(d, "metadata", {}) or {}
+        source = metadata.get("source", "<unknown source>")
+        idx = metadata.get("chunk_index")
+        count = metadata.get("chunk_count")
+        chunk_label = f"chunk {idx + 1}/{count}" if isinstance(idx, int) and isinstance(count, int) else "chunk"
+        print(f"[{i}] {source} • {chunk_label}")
         snippet = d.page_content.strip().replace('\n', ' ')
         print(snippet[:200] + ('...' if len(snippet) > 200 else ''))
 

From efb51ee3f14a4e67f35226e8b0358064f068797b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 07:54:32 +0000
Subject: [PATCH 2/2] Initial plan