Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

matching and pipelining #3

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 130 additions & 36 deletions md5deep.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,87 @@
#!/usr/bin/env python
# MIT License, (c) Joshua Wright jwright@willhackforsushi.com
# https://github.com/joswr1ght/md5deep
import os, sys, hashlib
import os, sys, hashlib, re, multiprocessing
from Queue import Queue
from threading import Thread
import time, datetime, errno

# To stop the queue from consuming all the RAM available
MaxQueue = 1000

# Reproduce this output with slashes consistent for Windows systems
#ba2812a436909554688154be461d976c A\SEC575-Clown-Chat\nvram

# file regex
md5FileRegex = re.compile(r'^(?P<hash>[a-f0-9]{32}) (?P<path>(/)?([^/\0]+(/)?)+)\n$')

file_queue = Queue(MaxQueue)

# Optimized for low-memory systems, read whole file with blocksize=0
def md5sum(filename, blocksize=65536):
hash = hashlib.md5()
with open(filename, "rb") as f:
for block in iter(lambda: f.read(blocksize), ""):
hash.update(block)
return hash.hexdigest()

try:
with open(filename, "rb") as f:
for block in iter(lambda: f.read(blocksize), ""):
hash.update(block)
return hash.hexdigest().strip()
except IOError as e:
if e.errno == errno.EACCES:
sys.stderr.write("Permission denied: %s\n"%(filename))
return "00000000000000000000000000000000"
pass

def mod_datetime(filename):
t = os.path.getmtime(filename)
return datetime.datetime.fromtimestamp(t)


def usage():
print "Usage: md5deep.py [OPTIONS] [FILES]"
print "-r - recursive mode, all subdirectories are traversed."
print "-n - During any of the matching modes (-m,-M,-x,or -X), displays only the filenames of any known hashes that were not matched by any of the input files."
print "-M <file> - enables matching mode."
print "-m <file> - as above."
print "-X <file> - enables negative matching mode."
print "-x <file> - as above."
print "-n - used with -MmXx so only file name outputed."
print "-f - speed up hash calculations, using more memory."
print "-0 - Uses a NULL character (/0) to terminate each line instead of a newline. Useful for processing filenames with strange characters."
print "-jnn - Controls multi-threading. By default the program will create one producer thread to scan the file system and one hashing thread per CPU core. Multi-threading causes output filenames to be in non-deterministic order, as files that take longer to hash will be delayed while they are hashed. If a deterministic order is required, specify -j0 to disable multi-threading."
print "-s - includes the targets of symlinks in the cargo, by default they are ignored."
print "-t yyyymmddThhmmss - include only files modified after the timestamp provided."

def formatOutput(hash, path):
hash = hash.replace(" ","")
path = path.replace("\r","")
path = path.replace("\n","")

def validate_hashes(hashfile, hashlist):
if opt_nameonly:
sys.stdout.write("%s%s"%(path, opt_endofline))
else:
sys.stdout.write("%s %s%s"%(hash, path, opt_endofline))

def validate_hashes(hashfile, hashlist, mode):
# Open file and build a new hashlist
hashlistrec = []
with open(hashfile, "r") as f:
for line in f:
filehash,filename = line.rstrip().split(" ")
# Convert to platform covention directory separators
filename = normfname(filename)
# Add entry to hashlistrec
hashlistrec.append((filename, filehash))
for diff in list(set(hashlistrec) - set(hashlist)):
# Replicate "-n" md5deep functionality; print only the filename
# if the file is missing in the filename list; print the hash
# of the current file if it is different from the negative match
# file.
if (not os.path.isfile(diff[0])):
# File from negative match list is missing, just print filename
print winfname(diff[0])
else:
print diff[0] + " " + winfname(diff[1])

# Produce a Windows-style filename
def winfname(filename):
return filename.replace("/","\\")
hashpair = md5FileRegex.match(line)
if hashpair:
filehash = hashpair.group('hash')
filename = hashpair.group('path')
# Convert to platform covention directory separators
filename = normfname(hashpair.group('path'))
# Add entry to hashlistrec
hashlistrec.append((filename, filehash))

if mode == "neg":
for diff in list(set(hashlist) - set(hashlistrec)):
formatOutput(diff[1], normfname(diff[0]))
elif mode == "pos":
for inter in list(set(hashlistrec) & set(hashlist)):
formatOutput(inter[1], normfname(inter[0]))

# Normalize filename based on platform
def normfname(filename):
Expand All @@ -53,14 +90,26 @@ def normfname(filename):
else:
return filename.replace("\\","/")

# Worker thread function
def calcMD5(i, q):
while True:
path = q.get()
formatOutput(md5sum(path, md5blocklen), path)
q.task_done()

if __name__ == '__main__':

opt_recursive = None
opt_negmatch = None
opt_match = None
opt_nameonly = None
opt_hashtable = None
opt_fast = None
opt_null = None
opt_endofline = "\n"
opt_files = []
opt_threads = multiprocessing.cpu_count()
opt_symlink = False
opt_timestamp =""

if len(sys.argv) == 1:
usage()
Expand All @@ -73,16 +122,37 @@ def normfname(filename):
opt_recursive = True
continue
elif i == '-0':
opt_null = True
opt_endofline = "\0"
continue
elif i == '-f':
opt_fast = True
elif i == '-X':
elif i == '-X' or i == '-x':
opt_negmatch = next(it)
if not os.path.isfile(opt_negmatch):
sys.stdout.write("Cannot open negative match file %s\n"%opt_negmatch)
sys.exit(-1)
continue
elif i == '-M' or i == '-m':
opt_match = next(it)
if not os.path.isfile(opt_match):
sys.stdout.write("Cannot open match file %s\n"%opt_match)
sys.exit(-1)
continue
elif i == '-n' and (opt_negmatch or opt_match):
opt_nameonly = True
continue
elif i.startswith('-j'):
opt_threads = int(i[2:])
continue
elif i == '-s':
opt_symlink = True
continue
elif i == '-t':
opt_timestamp = datetime.datetime.strptime( next(it), "%Y%m%dT%H%M%S" )
if not opt_timestamp:
sys.stdout.write("Is not valid ISO timestamp %s\n"%opt_timestampe)
sys.exit(-1)
continue
else:
opt_files.append(i)

Expand All @@ -92,6 +162,19 @@ def normfname(filename):
# Default to optimize for low-memory systems
md5blocklen=65536

# If we are not doing matching then we by-pass the hashtable
# this saves RAM and allows us to process much larger filesystems
if opt_negmatch or opt_match:
opt_hashtable = True
else:
opt_hashtable = False

if opt_threads:
for i in range(opt_threads):
worker = Thread(target=calcMD5, args=(i, file_queue))
worker.setDaemon(True)
worker.start()

# Build a list of (hash,filename) for each file, regardless of specified
# options
hashlist = []
Expand All @@ -106,16 +189,27 @@ def normfname(filename):
for (directory, _, files) in os.walk(start):
for f in files:
path = os.path.join(directory, f)
hashlist.append((path, md5sum(path, md5blocklen)))
if (not(opt_symlink and os.path.islink(path))):
if opt_hashtable:
hashlist.append((path, md5sum(path, md5blocklen)))
elif not opt_timestamp or (mod_datetime(path) > opt_timestamp and opt_timestamp):
if opt_threads:
# Add it to the queue
file_queue.put(path)
else:
# Threading disabled
formatOutput(md5sum(path, md5blocklen), path)


# With the hashlist built, compare to the negative match list, or print
# With the hashlist built, compare to the negative/posative match list, or print
# the results.
if opt_negmatch:
validate_hashes(opt_negmatch, hashlist)
if opt_threads:
file_queue.join()
elif opt_negmatch:
validate_hashes(opt_negmatch, hashlist, "neg")
elif opt_match:
validate_hashes(opt_match, hashlist, "pos")
else:
# Just print out the list with Windows-syle filenames
for hash in hashlist:
if opt_null:
print "%s %s\0"%(hash[1],winfname(hash[0]))
else:
print "%s %s"%(hash[1],winfname(hash[0]))
formatOutput(hash[1],normfname(hash[0]))