blacklanternsecurity · Vasco0x4 · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026 · Apr 7, 2026
diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py
@@ -18,7 +18,14 @@ def is_text_file(filepath):
     best = result.best()
     # Only consider it a text file if we have high confidence
     # and the encoding is detected (not binary)
-    return best is not None and best.encoding is not None
+    if best is None or best.encoding is None:
+        return False
+    # Reject if decoded content has too many replacement characters —
+    # this means charset-normalizer forced a binary file through as text
+    text = str(best)
+    if text and text.count("\ufffd") / len(text) > 0.01:
+        return False
+    return True
 
 
 def extract_text_file(filepath):
@@ -136,13 +143,11 @@ def grep(self, content, pattern):
                         Interpret PATTERN as an extended regular expression
                     -i, --ignore-case
                         Ignore case distinctions
-                    -a, --text
-                        Process a binary file as if it were text
                     -m NUM, --max-count=NUM
-                        Stop  reading  a file after NUM matching lines
+                        Stop reading a file after NUM matching lines
                 """
                 grep_process = sp.Popen(
-                    ["grep", "-Eiam", "5", "--color=always", pattern], stdin=sp.PIPE, stdout=sp.PIPE
+                    ["grep", "-Eim", "5", "--color=always", pattern], stdin=sp.PIPE, stdout=sp.PIPE
                 )
                 grep_output = grep_process.communicate(content)[0]
                 for line in grep_output.splitlines():
@@ -204,6 +209,15 @@ def extract_text(self, file, pretty_filename):
         if text_content is None:
             return matches
 
+        # Guard against binary garbage: if more than 1% of characters are
+        # Unicode replacement chars (U+FFFD), the file was decoded incorrectly.
+        # Fall back to raw ASCII string extraction to avoid dumping huge binary chunks.
+        if text_content and text_content.count("\ufffd") / len(text_content) > 0.01:
+            log.debug(f"High replacement char ratio in {pretty_filename}, falling back to string extraction")
+            text_content = extract_strings_from_binary(str(file))
+            if text_content is None:
+                return matches
+
         # try to convert to UTF-8 for grep-friendliness
         try:
             binary_content = text_content.encode("utf-8", errors="ignore")

diff --git a/man_spider/lib/spiderling.py b/man_spider/lib/spiderling.py
@@ -16,6 +16,34 @@
 log = logging.getLogger("manspider.spiderling")
 
 
+# Directories to skip in --noise-filter moderate mode
+NOISE_DIRS_MODERATE = [
+    "policydefinitions",  # Group Policy ADMX/ADML templates (all language variants)
+    "winsxs",             # Windows component store (huge, system-only)
+    "servicing",          # Windows Update staging area
+]
+
+# Directories skipped in --noise-filter aggressive mode (the moderate list plus extra Windows system paths)
+NOISE_DIRS_AGGRESSIVE = NOISE_DIRS_MODERATE + [
+    "\\windows\\system32",
+    "\\windows\\syswow64",
+    "\\windows\\assembly",
+    "\\windows\\fonts",
+    "\\windows\\spool",
+    "windows defender",
+]
+
+# File extensions suppressed by --noise-filter (both modes)
+NOISE_EXTENSIONS = [
+    ".adml",      # Group Policy Administrative Template Language files
+    ".admx",      # Group Policy Administrative Template XML files
+    ".mui",       # Multilingual User Interface resource files
+    ".mof",       # Managed Object Format (WMI definitions)
+    ".cat",       # Windows security catalog files
+    ".manifest",  # Windows assembly manifest files
+]
+
+
 class SpiderlingMessage:
     """
     Message which gets sent back to the parent through parent_queue

diff --git a/man_spider/manspider.py b/man_spider/manspider.py
@@ -190,6 +190,17 @@ def main():
         metavar="SIZE",
     )
     parser.add_argument("-v", "--verbose", action="store_true", help="show debugging messages")
+    parser.add_argument(
+        "--noise-filter",
+        choices=["moderate", "aggressive"],
+        default=None,
+        metavar="MODE",
+        help=(
+            "filter out common Windows system noise to reduce clutter. "
+            "moderate: skips PolicyDefinitions, WinSxS, Servicing + .adml/.admx/.mui/.mof/.cat/.manifest files. "
+            "aggressive: also skips System32, SysWOW64, Assembly, Fonts, Spool, Windows Defender."
+        ),
+    )
     parser.add_argument(
         "--modified-after",
         type=str,
@@ -257,6 +268,15 @@ def main():
         options.dirnames = [s.lower() for s in options.dirnames]
         options.exclude_dirnames = [s.lower() for s in options.exclude_dirnames]
 
+        # apply built-in noise filter presets
+        if options.noise_filter:
+            from man_spider.lib.spiderling import NOISE_DIRS_MODERATE, NOISE_DIRS_AGGRESSIVE, NOISE_EXTENSIONS
+            noise_dirs = NOISE_DIRS_MODERATE if options.noise_filter == "moderate" else NOISE_DIRS_AGGRESSIVE
+            options.exclude_dirnames = list(set(options.exclude_dirnames + [d.lower() for d in noise_dirs]))
+            # ensure extension format is correct (dot prefix)
+            noise_exts = [e if e.startswith(".") else f".{e}" for e in NOISE_EXTENSIONS]
+            options.exclude_extensions = list(set(options.exclude_extensions + noise_exts))
+
         # deduplicate targets
         targets = set()
         [[targets.add(t) for t in g] for g in options.targets]