From 32ccee486fe406911ad5792c59b1a3e65a330cfd Mon Sep 17 00:00:00 2001 From: elcrp96 Date: Thu, 30 Jul 2015 18:49:23 +0100 Subject: [PATCH 1/8] matching and pipelining Add positive matching (-M & -m); fixed negative matching (-X & -x); made -n an option rather than always on. When no matching is required, bypass the hash table so very large file sets can be processed without running out of RAM. --- md5deep.py | 93 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 27 deletions(-) diff --git a/md5deep.py b/md5deep.py index 75051fb..45a43f1 100755 --- a/md5deep.py +++ b/md5deep.py @@ -1,11 +1,14 @@ #!/usr/bin/env python # MIT License, (c) Joshua Wright jwright@willhackforsushi.com # https://github.com/joswr1ght/md5deep -import os, sys, hashlib +import os, sys, hashlib, re # Reproduce this output with slashes consistent for Windows systems #ba2812a436909554688154be461d976c A\SEC575-Clown-Chat\nvram +# file regex +md5Regex = re.compile(r'^(?P[a-f0-9]{32}) (?P(/)?([^/\0]+(/)?)+)\n$') + # Optimized for low-memory systems, read whole file with blocksize=0 def md5sum(filename, blocksize=65536): hash = hashlib.md5() @@ -17,30 +20,43 @@ def md5sum(filename, blocksize=65536): def usage(): print "Usage: md5deep.py [OPTIONS] [FILES]" print "-r - recursive mode, all subdirectories are traversed." + print "-n - During any of the matching modes (-m,-M,-x,or -X), displays only the filenames of any known hashes that were not matched by any of the input files." + print "-M - enables matching mode." + print "-m - as above." print "-X - enables negative matching mode." + print "-x - as above." + print "-n - used with -MmXx so only file name outputed." + print "-f - speed up hash calculations, using more memory." print "-f - speed up hash calculations, using more memory." print "-0 - Uses a NULL character (/0) to terminate each line instead of a newline. Useful for processing filenames with strange characters." 
-def validate_hashes(hashfile, hashlist): +def formatOutput(hash, path): + if opt_nameonly: + print "%s%s"%(path, opt_endofline) + else: + print "%s %s%s"%(hash, path, opt_endofline) + + +def validate_hashes(hashfile, hashlist, mode): # Open file and build a new hashlist hashlistrec = [] with open(hashfile, "r") as f: for line in f: - filehash,filename = line.rstrip().split(" ") - # Convert to platform covention directory separators - filename = normfname(filename) - # Add entry to hashlistrec - hashlistrec.append((filename, filehash)) - for diff in list(set(hashlistrec) - set(hashlist)): - # Replicate "-n" md5deep functionality; print only the filename - # if the file is missing in the filename list; print the hash - # of the current file if it is different from the negative match - # file. - if (not os.path.isfile(diff[0])): - # File from negative match list is missing, just print filename - print winfname(diff[0]) - else: - print diff[0] + " " + winfname(diff[1]) + hashpair = md5Regex.match(line) + if hashpair: + filehash = hashpair.group('hash') + filename = hashpair.group('path') + # Convert to platform covention directory separators + filename = normfname(hashpair.group('path')) + # Add entry to hashlistrec + hashlistrec.append((filename, filehash)) + + if mode == "neg": + for diff in list(set(hashlist) - set(hashlistrec)): + formatOutput(diff[1], normfname(diff[0])) + elif mode == "pos": + for inter in list(set(hashlistrec) & set(hashlist)): + formatOutput(inter[1], normfname(inter[0])) # Produce a Windows-style filename def winfname(filename): @@ -58,8 +74,11 @@ def normfname(filename): opt_recursive = None opt_negmatch = None + opt_match = None + opt_nameonly = None + opt_hashtable = None opt_fast = None - opt_null = None + opt_endofline = "" opt_files = [] if len(sys.argv) == 1: @@ -73,16 +92,25 @@ def normfname(filename): opt_recursive = True continue elif i == '-0': - opt_null = True + opt_endofline = "\0" continue elif i == '-f': opt_fast = True - elif 
i == '-X': + elif i == '-X' or i == '-x': opt_negmatch = next(it) if not os.path.isfile(opt_negmatch): sys.stdout.write("Cannot open negative match file %s\n"%opt_negmatch) sys.exit(-1) continue + elif i == '-M' or i == '-m': + opt_match = next(it) + if not os.path.isfile(opt_match): + sys.stdout.write("Cannot open match file %s\n"%opt_match) + sys.exit(-1) + continue + elif i == '-n' and (opt_negmatch or opt_match): + opt_nameonly = True + continue else: opt_files.append(i) @@ -92,6 +120,14 @@ def normfname(filename): # Default to optimize for low-memory systems md5blocklen=65536 + # If we are not doing matching then we by-pass the hashtable + # this saves RAM and allows us to process much larger filesystems + if opt_negmatch or opt_match: + opt_hashtable = True + else: + opt_hashtable = False + + # Build a list of (hash,filename) for each file, regardless of specified # options hashlist = [] @@ -106,16 +142,19 @@ def normfname(filename): for (directory, _, files) in os.walk(start): for f in files: path = os.path.join(directory, f) - hashlist.append((path, md5sum(path, md5blocklen))) + if opt_hashtable: + hashlist.append((path, md5sum(path, md5blocklen))) + else: + formatOutput(md5sum(path, md5blocklen), path) + - # With the hashlist built, compare to the negative match list, or print + # With the hashlist built, compare to the negative/posative match list, or print # the results. 
if opt_negmatch: - validate_hashes(opt_negmatch, hashlist) + validate_hashes(opt_negmatch, hashlist, "neg") + elif opt_match: + validate_hashes(opt_match, hashlist, "pos") else: # Just print out the list with Windows-syle filenames for hash in hashlist: - if opt_null: - print "%s %s\0"%(hash[1],winfname(hash[0])) - else: - print "%s %s"%(hash[1],winfname(hash[0])) + formatOutput(hash[1],normfname(hash[0])) From 100a1cc3d9c0e15ce68897ad75222c764113f146 Mon Sep 17 00:00:00 2001 From: elcrp96 Date: Fri, 31 Jul 2015 17:26:37 +0100 Subject: [PATCH 2/8] Added -jnn support Controls multi-threading. By default the program will create one producer thread to scan the file system and one hashing thread per CPU core. Multi-threading causes output filenames to be in non-deterministic order, as files that take longer to hash will be delayed while they are hashed. If a deterministic order is required, specify -j0 to disable multi-threading --- md5deep.py | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/md5deep.py b/md5deep.py index 45a43f1..a74320c 100755 --- a/md5deep.py +++ b/md5deep.py @@ -1,7 +1,9 @@ #!/usr/bin/env python # MIT License, (c) Joshua Wright jwright@willhackforsushi.com # https://github.com/joswr1ght/md5deep -import os, sys, hashlib, re +import os, sys, hashlib, re, multiprocessing +from Queue import Queue +from threading import Thread # Reproduce this output with slashes consistent for Windows systems #ba2812a436909554688154be461d976c A\SEC575-Clown-Chat\nvram @@ -9,6 +11,8 @@ # file regex md5Regex = re.compile(r'^(?P[a-f0-9]{32}) (?P(/)?([^/\0]+(/)?)+)\n$') +file_queue = Queue() + # Optimized for low-memory systems, read whole file with blocksize=0 def md5sum(filename, blocksize=65536): hash = hashlib.md5() @@ -29,6 +33,7 @@ def usage(): print "-f - speed up hash calculations, using more memory." print "-f - speed up hash calculations, using more memory." 
print "-0 - Uses a NULL character (/0) to terminate each line instead of a newline. Useful for processing filenames with strange characters." + print "-jnn - Controls multi-threading. By default the program will create one producer thread to scan the file system and one hashing thread per CPU core. Multi-threading causes output filenames to be in non-deterministic order, as files that take longer to hash will be delayed while they are hashed. If a deterministic order is required, specify -j0 to disable multi-threading." def formatOutput(hash, path): if opt_nameonly: @@ -58,10 +63,6 @@ def validate_hashes(hashfile, hashlist, mode): for inter in list(set(hashlistrec) & set(hashlist)): formatOutput(inter[1], normfname(inter[0])) -# Produce a Windows-style filename -def winfname(filename): - return filename.replace("/","\\") - # Normalize filename based on platform def normfname(filename): if os.name == 'nt': # Windows @@ -69,6 +70,12 @@ def normfname(filename): else: return filename.replace("\\","/") +# Worker thread function +def calcMD5(i, q): + while True: + path = q.get() + formatOutput(md5sum(path), path) + q.task_done() if __name__ == '__main__': @@ -80,6 +87,7 @@ def normfname(filename): opt_fast = None opt_endofline = "" opt_files = [] + opt_threads = multiprocessing.cpu_count() if len(sys.argv) == 1: usage() @@ -111,6 +119,9 @@ def normfname(filename): elif i == '-n' and (opt_negmatch or opt_match): opt_nameonly = True continue + elif i.startswith('-j'): + opt_threads = int(i[2:]) + continue else: opt_files.append(i) @@ -127,7 +138,12 @@ def normfname(filename): else: opt_hashtable = False - + if opt_threads: + for i in range(opt_threads): + worker = Thread(target=calcMD5, args=(i, file_queue)) + worker.setDaemon(True) + worker.start() + # Build a list of (hash,filename) for each file, regardless of specified # options hashlist = [] @@ -144,13 +160,19 @@ def normfname(filename): path = os.path.join(directory, f) if opt_hashtable: hashlist.append((path, 
md5sum(path, md5blocklen))) + elif opt_threads: + # Add it to the queue + file_queue.put(path) else: + # Threading disabled formatOutput(md5sum(path, md5blocklen), path) # With the hashlist built, compare to the negative/posative match list, or print # the results. - if opt_negmatch: + if opt_threads: + file_queue.join() + elif opt_negmatch: validate_hashes(opt_negmatch, hashlist, "neg") elif opt_match: validate_hashes(opt_match, hashlist, "pos") From 96801a3f483c4c693cbb99f594c6324588d6668a Mon Sep 17 00:00:00 2001 From: elcrp96 Date: Fri, 31 Jul 2015 17:55:23 +0100 Subject: [PATCH 3/8] bugfix md5sum was called without blocklen being specified, so it ran slowly. --- md5deep.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/md5deep.py b/md5deep.py index a74320c..10c5553 100755 --- a/md5deep.py +++ b/md5deep.py @@ -41,7 +41,6 @@ def formatOutput(hash, path): else: print "%s %s%s"%(hash, path, opt_endofline) - def validate_hashes(hashfile, hashlist, mode): # Open file and build a new hashlist hashlistrec = [] @@ -74,7 +73,7 @@ def normfname(filename): def calcMD5(i, q): while True: path = q.get() - formatOutput(md5sum(path), path) + formatOutput(md5sum(path, md5blocklen), path) q.task_done() if __name__ == '__main__': From 49195c3d2589ab1f802bf99a84b61377ec0f296e Mon Sep 17 00:00:00 2001 From: elcrp96 Date: Fri, 7 Aug 2015 17:09:45 +0100 Subject: [PATCH 4/8] Memory Management Add an explicit limit on the number of entries in the Queue, as on some very large filesystems an unbounded queue caused the process to run out of RAM. 
--- md5deep.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/md5deep.py b/md5deep.py index 10c5553..42a3362 100755 --- a/md5deep.py +++ b/md5deep.py @@ -5,13 +5,16 @@ from Queue import Queue from threading import Thread +# To stop the queue from consuming all the RAM available +MaxQueue = 1000 + # Reproduce this output with slashes consistent for Windows systems #ba2812a436909554688154be461d976c A\SEC575-Clown-Chat\nvram # file regex md5Regex = re.compile(r'^(?P[a-f0-9]{32}) (?P(/)?([^/\0]+(/)?)+)\n$') -file_queue = Queue() +file_queue = Queue(MaxQueue) # Optimized for low-memory systems, read whole file with blocksize=0 def md5sum(filename, blocksize=65536): From 4e221a0e18deffa09fd65e3c861e6085f0518a28 Mon Sep 17 00:00:00 2001 From: elcrp96 Date: Mon, 10 Aug 2015 16:31:11 +0100 Subject: [PATCH 5/8] added -t yyyymmddThhmmss Added support to only process files changed since a given datetime. --- md5deep.py | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/md5deep.py b/md5deep.py index 42a3362..6c02160 100755 --- a/md5deep.py +++ b/md5deep.py @@ -4,6 +4,7 @@ import os, sys, hashlib, re, multiprocessing from Queue import Queue from threading import Thread +import time, datetime # To stop the queue from consuming all the RAM available MaxQueue = 1000 @@ -12,7 +13,7 @@ #ba2812a436909554688154be461d976c A\SEC575-Clown-Chat\nvram # file regex -md5Regex = re.compile(r'^(?P[a-f0-9]{32}) (?P(/)?([^/\0]+(/)?)+)\n$') +md5FileRegex = re.compile(r'^(?P[a-f0-9]{32}) (?P(/)?([^/\0]+(/)?)+)\n$') file_queue = Queue(MaxQueue) @@ -22,7 +23,12 @@ def md5sum(filename, blocksize=65536): with open(filename, "rb") as f: for block in iter(lambda: f.read(blocksize), ""): hash.update(block) - return hash.hexdigest() + return hash.hexdigest().strip() + +def mod_datetime(filename): + t = os.path.getmtime(filename) + return datetime.datetime.fromtimestamp(t) + def usage(): print "Usage: md5deep.py 
[OPTIONS] [FILES]" @@ -34,22 +40,26 @@ def usage(): print "-x - as above." print "-n - used with -MmXx so only file name outputed." print "-f - speed up hash calculations, using more memory." - print "-f - speed up hash calculations, using more memory." print "-0 - Uses a NULL character (/0) to terminate each line instead of a newline. Useful for processing filenames with strange characters." print "-jnn - Controls multi-threading. By default the program will create one producer thread to scan the file system and one hashing thread per CPU core. Multi-threading causes output filenames to be in non-deterministic order, as files that take longer to hash will be delayed while they are hashed. If a deterministic order is required, specify -j0 to disable multi-threading." + print "-t yyyymmddThhmmss - include only files modified after the timestamp provided." def formatOutput(hash, path): + hash = hash.replace(" ","") + path = path.replace("\r","") + path = path.replace("\n","") + if opt_nameonly: - print "%s%s"%(path, opt_endofline) + sys.stdout.write("%s%s"%(path, opt_endofline)) else: - print "%s %s%s"%(hash, path, opt_endofline) + sys.stdout.write("%s %s%s"%(hash, path, opt_endofline)) def validate_hashes(hashfile, hashlist, mode): # Open file and build a new hashlist hashlistrec = [] with open(hashfile, "r") as f: for line in f: - hashpair = md5Regex.match(line) + hashpair = md5FileRegex.match(line) if hashpair: filehash = hashpair.group('hash') filename = hashpair.group('path') @@ -87,9 +97,10 @@ def calcMD5(i, q): opt_nameonly = None opt_hashtable = None opt_fast = None - opt_endofline = "" + opt_endofline = "\n" opt_files = [] opt_threads = multiprocessing.cpu_count() + opt_timestamp ="" if len(sys.argv) == 1: usage() @@ -102,7 +113,7 @@ def calcMD5(i, q): opt_recursive = True continue elif i == '-0': - opt_endofline = "\0" + opt_endofline = "\0\n" continue elif i == '-f': opt_fast = True @@ -124,6 +135,12 @@ def calcMD5(i, q): elif i.startswith('-j'): 
opt_threads = int(i[2:]) continue + elif i == '-t': + opt_timestamp = datetime.datetime.strptime( next(it), "%Y%m%dT%H%M%S" ) + if not opt_timestamp: + sys.stdout.write("Is not valid ISO timestamp %s\n"%opt_timestampe) + sys.exit(-1) + continue else: opt_files.append(i) @@ -162,12 +179,13 @@ def calcMD5(i, q): path = os.path.join(directory, f) if opt_hashtable: hashlist.append((path, md5sum(path, md5blocklen))) - elif opt_threads: - # Add it to the queue - file_queue.put(path) - else: - # Threading disabled - formatOutput(md5sum(path, md5blocklen), path) + elif not opt_timestamp or (mod_datetime(path) > opt_timestamp and opt_timestamp): + if opt_threads: + # Add it to the queue + file_queue.put(path) + else: + # Threading disabled + formatOutput(md5sum(path, md5blocklen), path) # With the hashlist built, compare to the negative/posative match list, or print From 7e097e69d414bead80587d7cd8236a57f3bf645e Mon Sep 17 00:00:00 2001 From: elcrp96 Date: Wed, 19 Aug 2015 12:23:43 +0100 Subject: [PATCH 6/8] -0 option bug fix Removed trailing newline when -0 specified. 
--- md5deep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/md5deep.py b/md5deep.py index 6c02160..a2b7b06 100755 --- a/md5deep.py +++ b/md5deep.py @@ -113,7 +113,7 @@ def calcMD5(i, q): opt_recursive = True continue elif i == '-0': - opt_endofline = "\0\n" + opt_endofline = "\0" continue elif i == '-f': opt_fast = True From 45f50c033417973de47d64014d3889241e79b495 Mon Sep 17 00:00:00 2001 From: elcrp96 Date: Mon, 14 Sep 2015 12:58:57 +0100 Subject: [PATCH 7/8] Error Handling Added additional handling for IOErrors such as existence and permissions --- md5deep.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/md5deep.py b/md5deep.py index a2b7b06..05e3b4f 100755 --- a/md5deep.py +++ b/md5deep.py @@ -4,7 +4,7 @@ import os, sys, hashlib, re, multiprocessing from Queue import Queue from threading import Thread -import time, datetime +import time, datetime, errno # To stop the queue from consuming all the RAM available MaxQueue = 1000 @@ -20,10 +20,17 @@ # Optimized for low-memory systems, read whole file with blocksize=0 def md5sum(filename, blocksize=65536): hash = hashlib.md5() - with open(filename, "rb") as f: - for block in iter(lambda: f.read(blocksize), ""): - hash.update(block) - return hash.hexdigest().strip() + + try: + with open(filename, "rb") as f: + for block in iter(lambda: f.read(blocksize), ""): + hash.update(block) + return hash.hexdigest().strip() + except IOError as e: + if e.errno == errno.EACCES: + sys.stderr.write("Permission denied: %s\n"%(filename)) + return "00000000000000000000000000000000" + pass def mod_datetime(filename): t = os.path.getmtime(filename) return datetime.datetime.fromtimestamp(t) From afe5413310e4550ac8595439c3514c40f63f6de1 Mon Sep 17 00:00:00 2001 From: elcrp96 Date: Mon, 14 Sep 2015 22:52:50 +0100 Subject: [PATCH 8/8] Add option to include Symlink The inclusion of symlink targets is now optional. Symlinks will be ignored unless -s is specified. 
--- md5deep.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/md5deep.py b/md5deep.py index 05e3b4f..c7f0453 100755 --- a/md5deep.py +++ b/md5deep.py @@ -49,6 +49,7 @@ def usage(): print "-f - speed up hash calculations, using more memory." print "-0 - Uses a NULL character (/0) to terminate each line instead of a newline. Useful for processing filenames with strange characters." print "-jnn - Controls multi-threading. By default the program will create one producer thread to scan the file system and one hashing thread per CPU core. Multi-threading causes output filenames to be in non-deterministic order, as files that take longer to hash will be delayed while they are hashed. If a deterministic order is required, specify -j0 to disable multi-threading." + print "-s - includes the targets of symlinks in the cargo, by default they are ignored." print "-t yyyymmddThhmmss - include only files modified after the timestamp provided." def formatOutput(hash, path): @@ -107,6 +108,7 @@ def calcMD5(i, q): opt_endofline = "\n" opt_files = [] opt_threads = multiprocessing.cpu_count() + opt_symlink = False opt_timestamp ="" if len(sys.argv) == 1: @@ -142,6 +144,9 @@ def calcMD5(i, q): elif i.startswith('-j'): opt_threads = int(i[2:]) continue + elif i == '-s': + opt_symlink = True + continue elif i == '-t': opt_timestamp = datetime.datetime.strptime( next(it), "%Y%m%dT%H%M%S" ) if not opt_timestamp: @@ -184,15 +189,16 @@ def calcMD5(i, q): for (directory, _, files) in os.walk(start): for f in files: path = os.path.join(directory, f) - if opt_hashtable: - hashlist.append((path, md5sum(path, md5blocklen))) - elif not opt_timestamp or (mod_datetime(path) > opt_timestamp and opt_timestamp): - if opt_threads: - # Add it to the queue - file_queue.put(path) - else: - # Threading disabled - formatOutput(md5sum(path, md5blocklen), path) + if (not(opt_symlink and os.path.islink(path))): + if opt_hashtable: + hashlist.append((path, 
md5sum(path, md5blocklen))) + elif not opt_timestamp or (mod_datetime(path) > opt_timestamp and opt_timestamp): + if opt_threads: + # Add it to the queue + file_queue.put(path) + else: + # Threading disabled + formatOutput(md5sum(path, md5blocklen), path) # With the hashlist built, compare to the negative/posative match list, or print