From 750406b0570832e1c495abf204b1a77242c1d07d Mon Sep 17 00:00:00 2001 From: Zack Fitch Date: Thu, 19 Feb 2026 00:29:46 -0800 Subject: [PATCH] Unify MCP extension whitelist with handlers and add data formats The MCP tools.rs had a hardcoded 9-extension whitelist while handlers/mod.rs had a comprehensive 27-extension ALLOWED_EXTENSIONS constant. This caused the MCP index tool to silently drop most file types (Python, Go, Shell, logs, etc.). Now both code paths use the same shared constant. Adds the Swift source extension (.swift) and log/data extensions (.log, .jsonl, .csv, .xml, .env, .ini, .cfg, .conf), and fixes MCP walk_directory to skip hidden files and directories as well as noise dirs (node_modules, target, dist, build). Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 252 ++++++++++++++++++++++++++++++ ingestor-core/src/handlers/mod.rs | 4 + ingestor-core/src/lib.rs | 1 + ingestor-core/src/mcp/tools.rs | 14 +- ingestor-core/src/util.rs | 7 +- 5 files changed, 273 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3dbccb8..508a4bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -70,12 +70,56 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + [[package]] name = "anstyle" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + 
"utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + [[package]] name = "anyhow" version = "1.0.100" @@ -106,6 +150,21 @@ dependencies = [ "libloading", ] +[[package]] +name = "assert_cmd" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c5bcfa8749ac45dd12cb11055aeeb6b27a3895560d60d71e3c23bf979e60514" +dependencies = [ + "anstyle", + "bstr", + "libc", + "predicates", + "predicates-core", + "predicates-tree", + "wait-timeout", +] + [[package]] name = "async-channel" version = "2.5.0" @@ -259,6 +318,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.19.1" @@ -786,6 +856,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] @@ -794,8 +865,22 @@ version = "4.5.54" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" dependencies = [ + "anstream", "anstyle", "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.49" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.114", ] [[package]] @@ -815,6 +900,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + [[package]] name = "colored" version = "3.1.1" @@ -1685,6 +1776,12 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "digest" version = "0.10.7" @@ -2005,6 +2102,15 @@ dependencies = [ "zlib-rs", ] +[[package]] +name = "float-cmp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" +dependencies = [ + "num-traits", +] + [[package]] name = "float-ord" version = "0.3.2" @@ -2494,6 +2600,19 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "globset" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + [[package]] name = "glow" version = "0.16.0" @@ -2935,6 +3054,22 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "ignore" +version = "0.4.25" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "d3d782a365a015e0f5c04902246139249abf769125006fbe7649e2ee88169b4a" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "indexmap" version = "2.13.0" @@ -2963,11 +3098,16 @@ name = "ingestor-core" version = "0.1.0" dependencies = [ "anyhow", + "assert_cmd", + "clap", "criterion", "dirs 5.0.1", "hex", + "ignore", + "lru", "ndarray 0.16.1", "ort", + "predicates", "pretty_assertions", "regex", "reqwest", @@ -2978,6 +3118,7 @@ dependencies = [ "serde_json", "sha2", "tempfile", + "test-case", "tokenizers", "tokio", "tracing", @@ -3047,6 +3188,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.10.5" @@ -3246,6 +3393,15 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown 0.15.5", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -3549,6 +3705,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "normalize-line-endings" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" + [[package]] name = "ntapi" version = "0.4.2" @@ -3729,6 +3891,12 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = 
"1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "onig" version = "6.5.1" @@ -4034,6 +4202,36 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "predicates" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ada8f2932f28a27ee7b70dd6c1c39ea0675c55a36879ab92f3a715eaa1e63cfe" +dependencies = [ + "anstyle", + "difflib", + "float-cmp", + "normalize-line-endings", + "predicates-core", + "regex", +] + +[[package]] +name = "predicates-core" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cad38746f3166b4031b1a0d39ad9f954dd291e7854fcc0eed52ee41a0b50d144" + +[[package]] +name = "predicates-tree" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0de1b847b39c8131db0467e9df1ff60e6d0562ab8e9a16e568ad0fdb372e2f2" +dependencies = [ + "predicates-core", + "termtree", +] + [[package]] name = "presser" version = "0.3.1" @@ -5231,6 +5429,45 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "termtree" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" + +[[package]] +name = "test-case" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb2550dd13afcd286853192af8601920d959b14c401fcece38071d53bf0768a8" +dependencies = [ + "test-case-macros", +] + +[[package]] +name = "test-case-core" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adcb7fd841cd518e279be3d5a3eb0636409487998a4aff22f3de87b81e88384f" +dependencies = [ + "cfg-if", + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "test-case-macros" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5c89e72a01ed4c579669add59014b9a524d609c0c88c6a585ce37485879f6ffb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", + "test-case-core", +] + [[package]] name = "text_placeholder" version = "0.5.1" @@ -5895,6 +6132,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.19.0" @@ -5936,6 +6179,15 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + [[package]] name = "walkdir" version = "2.5.0" diff --git a/ingestor-core/src/handlers/mod.rs b/ingestor-core/src/handlers/mod.rs index 533631c..bf0b9c6 100644 --- a/ingestor-core/src/handlers/mod.rs +++ b/ingestor-core/src/handlers/mod.rs @@ -41,10 +41,14 @@ pub const ALLOWED_EXTENSIONS: &[&str] = &[ "rb", // PHP "php", + // Swift + "swift", // Shell "sh", "bash", "zsh", // SQL "sql", + // Data/Logs + "log", "jsonl", "csv", "xml", "env", "ini", "cfg", "conf", ]; /// Handler for `llmx_index` tool: Create or update codebase indexes. 
diff --git a/ingestor-core/src/lib.rs b/ingestor-core/src/lib.rs index f8b74a3..cfe240c 100644 --- a/ingestor-core/src/lib.rs +++ b/ingestor-core/src/lib.rs @@ -1,5 +1,6 @@ mod chunk; mod export; +pub mod handlers; mod index; mod model; mod util; diff --git a/ingestor-core/src/mcp/tools.rs b/ingestor-core/src/mcp/tools.rs index 0844d59..414275b 100644 --- a/ingestor-core/src/mcp/tools.rs +++ b/ingestor-core/src/mcp/tools.rs @@ -162,7 +162,7 @@ pub struct ManageOutput { /// # Behavior /// /// 1. Recursively walks directories and reads files -/// 2. Filters by extension whitelist (.rs, .js, .ts, .tsx, .md, .json, .html, .css, .txt) +/// 2. Filters by extension whitelist (see `handlers::ALLOWED_EXTENSIONS`) /// 3. Checks for existing index by root path /// 4. Creates new index or updates existing one /// 5. Saves to disk and returns metadata @@ -371,6 +371,13 @@ fn walk_directory(path: &Path, files: &mut Vec) -> Result<()> { let entry = entry?; let path = entry.path(); + // Skip hidden entries and entries named like common non-code directories (checked before the file/dir split, so files are skipped too) + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + if name.starts_with('.') || name == "node_modules" || name == "target" || name == "dist" || name == "build" { + continue; + } + } + if path.is_dir() { walk_directory(&path, files)?; } else if path.is_file() { @@ -381,10 +388,9 @@ fn walk_directory(path: &Path, files: &mut Vec) -> Result<()> { } fn read_file(path: &Path, files: &mut Vec) -> Result<()> { - // Check extension whitelist + // Check extension whitelist (shared with handlers module) if let Some(ext) = path.extension().and_then(|e| e.to_str()) { - let allowed = ["rs", "js", "ts", "tsx", "md", "json", "html", "css", "txt"]; - if !allowed.contains(&ext) { + if !crate::handlers::ALLOWED_EXTENSIONS.contains(&ext) { return Ok(()); } } else { diff --git a/ingestor-core/src/util.rs b/ingestor-core/src/util.rs index 0939361..cdbb768 100644 --- a/ingestor-core/src/util.rs +++ b/ingestor-core/src/util.rs @@ -25,7 +25,12 @@ pub 
fn detect_kind(path: &str) -> ChunkKind { ChunkKind::JavaScript } else if lower.ends_with(".html") || lower.ends_with(".htm") { ChunkKind::Html - } else if lower.ends_with(".txt") || lower.ends_with(".log") { + } else if lower.ends_with(".xml") { + ChunkKind::Html + } else if lower.ends_with(".txt") || lower.ends_with(".log") || lower.ends_with(".jsonl") + || lower.ends_with(".csv") || lower.ends_with(".env") || lower.ends_with(".ini") + || lower.ends_with(".cfg") || lower.ends_with(".conf") + { ChunkKind::Text } else if lower.ends_with(".png") || lower.ends_with(".jpg")