From 611b94b5efa20343f79a29ea3641f7618fef049a Mon Sep 17 00:00:00 2001 From: Nick <58236636+EngineerNV@users.noreply.github.com> Date: Thu, 30 Oct 2025 00:50:40 -0700 Subject: [PATCH 1/2] Switch ingestion to token-based chunks and add sample corpus --- .gitignore | 1 + Agent.MD | 5 +- README.md | 3 +- "data/corpus/Pok\303\251mon.MD" | 28 +++++++++++ scripts/00_ingest.py | 86 +++++++++++++++++++++++++++------ 5 files changed, 105 insertions(+), 18 deletions(-) create mode 100644 "data/corpus/Pok\303\251mon.MD" diff --git a/.gitignore b/.gitignore index 6447af5..623791f 100644 --- a/.gitignore +++ b/.gitignore @@ -41,5 +41,6 @@ data/chunks/ data/corpus/* !data/corpus/README.md !data/corpus/.gitkeep +!data/corpus/Pokémon.MD data/chroma/* !data/chroma/.gitkeep diff --git a/Agent.MD b/Agent.MD index 0174781..c2903a9 100644 --- a/Agent.MD +++ b/Agent.MD @@ -8,8 +8,9 @@ to the whole repo (there are no nested `AGENTS.md` files). The numbered scripts under `scripts/` are fully implemented and runnable: -1. `00_ingest.py` – loads markdown from `data/corpus/`, splits by headers, and - prints a preview of the resulting LangChain `Document` chunks. +1. `00_ingest.py` – loads markdown from `data/corpus/`, splits them into + token-aware chunks with overlap, and prints a preview of the resulting + LangChain `Document` objects. 2. `01_build_index.py` – embeds the chunks with MiniLM via `HuggingFaceEmbeddings`, replaces `data/chroma/`, and prints a build summary (doc count, collection name, metadata keys). diff --git a/README.md b/README.md index b837e53..1930805 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ ``` 2. **Configure secrets:** copy `.env.example` to `.env` and populate API keys (`OPENAI_API_KEY` for `02_query.py`, `RAG_LLM_API_KEY` for `04_llm_api.py`). 3. **Add source material:** drop markdown files into `data/corpus/` (the `corpus/` and `chroma/` directories are created for you). + - A sample knowledge base, `Pokémon.MD`, is included so you can immediately test ingestion and retrieval behaviour. 4. **Run the pipeline:** ```bash python scripts/00_ingest.py # inspect chunking @@ -31,7 +32,7 @@ The query script offers three modes: | Path | Purpose | |------|---------| -| `scripts/00_ingest.py` | Loads markdown files from `data/corpus/`, splits them by headers, and previews the resulting `Document` chunks. | +| `scripts/00_ingest.py` | Loads markdown files from `data/corpus/`, splits them into token-sized chunks with overlap, and previews the resulting `Document` objects. | | `scripts/01_build_index.py` | Embeds the ingested chunks with `HuggingFaceEmbeddings`, rebuilds `data/chroma/`, and prints a build summary. | | `scripts/02_query.py` | Connects to the persisted Chroma store and exposes the retrieval CLI described above. | | `scripts/03_eval.py` | Scores saved question/answer/context rows with lexical heuristics and prints aggregate metrics. | diff --git "a/data/corpus/Pok\303\251mon.MD" "b/data/corpus/Pok\303\251mon.MD" new file mode 100644 index 0000000..8703a6a --- /dev/null +++ "b/data/corpus/Pok\303\251mon.MD" @@ -0,0 +1,28 @@ +# Paldea Battle Field Guide + +## Starter Pokémon Highlights +- **Sprigatito** is a Grass-type feline Pokémon whose signature move, **Flower Trick**, always lands a critical hit and never misses. +- **Fuecoco** is a Fire-type crocodile that evolves into a special-attacking powerhouse; it loves to snack between battles. +- **Quaxly** is a Water-type duck with excellent physical attack growth and access to the priority move **Aqua Jet**. + +## Titan Strategies +1. **Klawf, the Stony Cliff Titan** is weak to Water, Grass, Fighting, Ground, and Steel moves; Terastallized Sprigatito can finish the fight quickly. +2. **Bombirdier, the Open Sky Titan** takes quadruple damage from Electric attacks because of its Flying/Dark typing. +3. **Orthworm, the Lurking Steel Titan** resists Dragon moves but is extremely vulnerable to Fire- and Fighting-type attacks. + +## Gym Puzzle Reminders +- **Cortondo Gym (Bug)**: Roll a giant olive through the maze, then use Rock-type attacks against Katy's team. +- **Levincia Gym (Electric)**: Complete Iono's livestream hide-and-seek challenge; bring Ground types to ignore her Pokémon's STAB moves. +- **Cascarrafa Gym (Water)**: Beat Kofu's bidding contest before challenging him; Grass attacks are safest because his team covers Electric threats. + +## Legendary Research Notes +- The quartet **Wo-Chien, Chien-Pao, Ting-Lu,** and **Chi-Yu** are sealed behind ominous stakes scattered across Paldea. Pulling every stake of a color unlocks a respective shrine. +- **Koraidon** favors physical attacks and unlocks more traversal abilities as the story progresses, while **Miraidon** specializes in special attacks and synergizes with Electric Terrain. +- Terastal energy crystals respond best to moves matching a Pokémon's Tera Type, granting a significant power boost for that move type. + +## Elite Four Quick Facts +- **Rika** leads with Ground types; Water or Grass Pokémon can withstand her Earth Power spam. +- **Poppy** relies on Steel types; Fire-type Terastalization cuts through their defenses. +- **Larry** returns with Flying types—Electric and Ice moves are your safest bets. +- **Hassel** uses Dragon types, so bring Fairy, Ice, or Dragon coverage to wrap the battle up. + diff --git a/scripts/00_ingest.py b/scripts/00_ingest.py index 99cde99..42f3c79 100644 --- a/scripts/00_ingest.py +++ b/scripts/00_ingest.py @@ -8,9 +8,9 @@ What the script already provides -------------------------------- * Reads markdown-like files from ``data/corpus/`` (configurable via ``CORPUS_DIR``). -* Splits documents into sections keyed by headers (#, ##, ###) using - ``MarkdownHeaderTextSplitter``. -* Returns ``List[Document]`` objects with header metadata intact—ready for embedding. +* Splits documents into token-budgeted sections with sensible overlap so chunks align + with embedding context windows. +* Returns ``List[Document]`` objects annotated with source metadata ready for embedding. Learner guidance ---------------- @@ -21,15 +21,18 @@ """ import os # Handle filesystem navigation for the corpus directory -from typing import List # Describe the list of LangChain Document objects returned +from typing import Iterable, List # Describe the list of LangChain Document objects returned -from langchain_text_splitters import ( # Split markdown files into header-aware chunks - MarkdownHeaderTextSplitter, -) +from langchain_core.documents import Document # Represent individual text chunks with metadata +from langchain_text_splitters import RecursiveCharacterTextSplitter # ---------------------------- Config ---------------------------- CORPUS_DIR = os.environ.get("CORPUS_DIR", os.path.join("data", "corpus")) -HEADER_LEVELS = ["#", "##", "###"] # order matters + +# Configure chunking in token units so the ingestion step aligns with embedding models. +DEFAULT_TIKTOKEN_MODEL = os.environ.get("INGEST_TIKTOKEN_MODEL", "text-embedding-3-small") +CHUNK_SIZE_TOKENS = int(os.environ.get("INGEST_CHUNK_SIZE", "400")) +CHUNK_OVERLAP_TOKENS = int(os.environ.get("INGEST_CHUNK_OVERLAP", "80")) # ---------------------------- Helpers ---------------------------- @@ -43,17 +46,66 @@ def read_text(path: str) -> str: return f.read() -def split_markdown(markdown_text: str): - splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[(h, h) for h in HEADER_LEVELS]) - return splitter.split_text(markdown_text) +def _approximate_token_length(text: str) -> int: + """Fallback token estimate when a real tokenizer is unavailable.""" + + return max(1, len(text.split())) + + +def _build_splitter() -> RecursiveCharacterTextSplitter: + """Return a token-aware text splitter with a sensible amount of overlap.""" + + # Guard against accidental misconfiguration that sets overlap >= chunk size. + overlap = max(0, min(CHUNK_OVERLAP_TOKENS, max(CHUNK_SIZE_TOKENS - 1, 0))) + chunk_size = max(1, CHUNK_SIZE_TOKENS) + + try: + return RecursiveCharacterTextSplitter.from_tiktoken_encoder( + model_name=DEFAULT_TIKTOKEN_MODEL, + chunk_size=chunk_size, + chunk_overlap=overlap, + ) + except Exception: # pragma: no cover - network restricted environments fall back here + return RecursiveCharacterTextSplitter( + chunk_size=chunk_size, + chunk_overlap=overlap, + length_function=_approximate_token_length, + ) + + +_SPLITTER = _build_splitter() + + +def split_markdown(markdown_text: str, *, source_path: str) -> List[Document]: + """Split text into token-bounded chunks while retaining document metadata.""" + + base_metadata = { + "source": os.path.basename(source_path), + "source_path": source_path, + } + documents = _SPLITTER.create_documents( + [markdown_text], + metadatas=[base_metadata], + ) + + total_chunks = len(documents) + for idx, doc in enumerate(documents): + metadata = getattr(doc, "metadata", {}) or {} + metadata.update({ + "chunk_index": idx, + "chunk_count": total_chunks, + }) + doc.metadata = metadata + + return documents # (Persistence helpers removed – not needed now.) # ---------------------------- Core processing ---------------------------- -def process_file(path: str): - return split_markdown(read_text(path)) +def process_file(path: str) -> Iterable[Document]: + return split_markdown(read_text(path), source_path=path) def ingest() -> List: @@ -72,8 +124,12 @@ def ingest() -> List: def preview(docs, n: int = 3): for i, d in enumerate(docs[:n]): - headers = [d.metadata.get(h) for h in HEADER_LEVELS if d.metadata.get(h)] - print(f"[{i}] {' > '.join(headers) if headers else ''}") + metadata = getattr(d, "metadata", {}) or {} + source = metadata.get("source", "") + idx = metadata.get("chunk_index") + count = metadata.get("chunk_count") + chunk_label = f"chunk {idx + 1}/{count}" if isinstance(idx, int) and isinstance(count, int) else "chunk" + print(f"[{i}] {source} • {chunk_label}") snippet = d.page_content.strip().replace('\n', ' ') print(snippet[:200] + ('...' if len(snippet) > 200 else '')) From efb51ee3f14a4e67f35226e8b0358064f068797b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 30 Oct 2025 07:54:32 +0000 Subject: [PATCH 2/2] Initial plan