Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions man_spider/lib/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@ def is_text_file(filepath):
best = result.best()
# Only consider it a text file if we have high confidence
# and the encoding is detected (not binary)
return best is not None and best.encoding is not None
if best is None or best.encoding is None:
return False
# Reject if decoded content has too many replacement characters —
# this means charset-normalizer forced a binary file through as text
text = str(best)
if text and text.count("\ufffd") / len(text) > 0.01:
return False
return True


def extract_text_file(filepath):
Expand Down Expand Up @@ -136,13 +143,11 @@ def grep(self, content, pattern):
Interpret PATTERN as an extended regular expression
-i, --ignore-case
Ignore case distinctions
-a, --text
Process a binary file as if it were text
-m NUM, --max-count=NUM
Stop reading a file after NUM matching lines
Stop reading a file after NUM matching lines
"""
grep_process = sp.Popen(
["grep", "-Eiam", "5", "--color=always", pattern], stdin=sp.PIPE, stdout=sp.PIPE
["grep", "-Eim", "5", "--color=always", pattern], stdin=sp.PIPE, stdout=sp.PIPE
)
grep_output = grep_process.communicate(content)[0]
for line in grep_output.splitlines():
Expand Down Expand Up @@ -204,6 +209,15 @@ def extract_text(self, file, pretty_filename):
if text_content is None:
return matches

# Guard against binary garbage: if more than 1% of characters are
# Unicode replacement chars (U+FFFD), the file was decoded incorrectly.
# Fall back to raw ASCII string extraction to avoid dumping huge binary chunks.
if text_content and text_content.count("\ufffd") / len(text_content) > 0.01:
log.debug(f"High replacement char ratio in {pretty_filename}, falling back to string extraction")
text_content = extract_strings_from_binary(str(file))
if text_content is None:
return matches

# try to convert to UTF-8 for grep-friendliness
try:
binary_content = text_content.encode("utf-8", errors="ignore")
Expand Down
28 changes: 28 additions & 0 deletions man_spider/lib/spiderling.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,34 @@
log = logging.getLogger("manspider.spiderling")


# Directories skipped when --noise-filter is set to "moderate"
NOISE_DIRS_MODERATE = [
    # Group Policy ADMX/ADML templates (every language variant)
    "policydefinitions",
    # Windows component store: huge and system-only
    "winsxs",
    # staging area for Windows Update
    "servicing",
]

# "aggressive" mode skips the moderate directories plus the ones listed here
NOISE_DIRS_AGGRESSIVE = [
    *NOISE_DIRS_MODERATE,
    "\\windows\\system32",
    "\\windows\\syswow64",
    "\\windows\\assembly",
    "\\windows\\fonts",
    "\\windows\\spool",
    "windows defender",
]

# File extensions suppressed by --noise-filter, regardless of which mode is chosen
NOISE_EXTENSIONS = [
    # Group Policy Administrative Template Language files
    ".adml",
    # Group Policy Administrative Template XML files
    ".admx",
    # Multilingual User Interface resource files
    ".mui",
    # Managed Object Format (WMI definitions)
    ".mof",
    # Windows security catalog files
    ".cat",
    # Windows assembly manifest files
    ".manifest",
]


class SpiderlingMessage:
"""
Message which gets sent back to the parent through parent_queue
Expand Down
20 changes: 20 additions & 0 deletions man_spider/manspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,17 @@ def main():
metavar="SIZE",
)
parser.add_argument("-v", "--verbose", action="store_true", help="show debugging messages")
parser.add_argument(
"--noise-filter",
choices=["moderate", "aggressive"],
default=None,
metavar="MODE",
help=(
"filter out common Windows system noise to reduce clutter. "
"moderate: skips PolicyDefinitions, WinSxS, Servicing + .adml/.admx/.mui/.mof/.cat/.manifest files. "
"aggressive: also skips System32, SysWOW64, Assembly, Fonts, Spool, Windows Defender."
),
)
parser.add_argument(
"--modified-after",
type=str,
Expand Down Expand Up @@ -257,6 +268,15 @@ def main():
options.dirnames = [s.lower() for s in options.dirnames]
options.exclude_dirnames = [s.lower() for s in options.exclude_dirnames]

# apply built-in noise filter presets
if options.noise_filter:
from man_spider.lib.spiderling import NOISE_DIRS_MODERATE, NOISE_DIRS_AGGRESSIVE, NOISE_EXTENSIONS
noise_dirs = NOISE_DIRS_MODERATE if options.noise_filter == "moderate" else NOISE_DIRS_AGGRESSIVE
options.exclude_dirnames = list(set(options.exclude_dirnames + [d.lower() for d in noise_dirs]))
# ensure extension format is correct (dot prefix)
noise_exts = [e if e.startswith(".") else f".{e}" for e in NOISE_EXTENSIONS]
options.exclude_extensions = list(set(options.exclude_extensions + noise_exts))

# deduplicate targets
targets = set()
[[targets.add(t) for t in g] for g in options.targets]
Expand Down
Loading