From 32ccee486fe406911ad5792c59b1a3e65a330cfd Mon Sep 17 00:00:00 2001
From: elcrp96 <chris@pates.me.uk>
Date: Thu, 30 Jul 2015 18:49:23 +0100
Subject: [PATCH 1/8] matching and pipelining

Add positive matching (-M & -m).
Fix negative matching (-X & -x).
Make -n an option rather than always on.
When no matching is required, bypass the hash table so that very large
file sets can be processed without running out of RAM.
---
 md5deep.py | 93 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 66 insertions(+), 27 deletions(-)
diff --git a/md5deep.py b/md5deep.py
index 75051fb..45a43f1 100755
--- a/md5deep.py
+++ b/md5deep.py
@@ -1,11 +1,14 @@
 #!/usr/bin/env python
 # MIT License, (c) Joshua Wright jwright@willhackforsushi.com
 # https://github.com/joswr1ght/md5deep
-import os, sys, hashlib
+import os, sys, hashlib, re
 
 # Reproduce this output with slashes consistent for Windows systems
 #ba2812a436909554688154be461d976c  A\SEC575-Clown-Chat\nvram
 
+# file regex
+md5Regex = re.compile(r'^(?P<hash>[a-f0-9]{32})  (?P<path>(/)?([^/\0]+(/)?)+)\n$')
+
 # Optimized for low-memory systems, read whole file with blocksize=0
 def md5sum(filename, blocksize=65536):
     hash = hashlib.md5()
@@ -17,30 +20,43 @@ def md5sum(filename, blocksize=65536):
 def usage():
     print "Usage: md5deep.py [OPTIONS] [FILES]"
     print "-r        - recursive mode, all subdirectories are traversed."
+    print "-n        - During any of the matching modes (-m,-M,-x,or -X), displays only the filenames of any known hashes that were not matched by any of the input files."
+    print "-M <file> - enables matching mode."
+    print "-m <file> - as above."
     print "-X <file> - enables negative matching mode."
+    print "-x <file> - as above."
+    print "-n        - used with -MmXx so only file name outputed."
+    print "-f        - speed up hash calculations, using more memory."
     print "-f        - speed up hash calculations, using more memory."
     print "-0        - Uses a NULL character (/0) to terminate each line instead of a newline. Useful for processing filenames with strange characters."
 
-def validate_hashes(hashfile, hashlist):
+def formatOutput(hash, path):
+    if opt_nameonly:
+        print "%s%s"%(path,  opt_endofline)
+    else:
+        print "%s  %s%s"%(hash, path, opt_endofline)
+
+
+def validate_hashes(hashfile, hashlist, mode):
     # Open file and build a new hashlist
     hashlistrec = []
     with open(hashfile, "r") as f:
         for line in f:
-            filehash,filename = line.rstrip().split("  ")
-            # Convert to platform covention directory separators
-            filename = normfname(filename)
-            # Add entry to hashlistrec
-            hashlistrec.append((filename, filehash))
-        for diff in list(set(hashlistrec) - set(hashlist)):
-            # Replicate "-n" md5deep functionality; print only the filename
-            # if the file is missing in the filename list; print the hash
-            # of the current file if it is different from the negative match
-            # file.
-            if (not os.path.isfile(diff[0])):
-                # File from negative match list is missing, just print filename
-                print winfname(diff[0])
-            else:
-                print diff[0] + "  " + winfname(diff[1])
+            hashpair = md5Regex.match(line)
+            if hashpair:
+               filehash = hashpair.group('hash')
+               filename = hashpair.group('path')
+               # Convert to platform covention directory separators
+               filename = normfname(hashpair.group('path'))
+               # Add entry to hashlistrec
+               hashlistrec.append((filename, filehash))
+
+        if mode == "neg":
+            for diff in list(set(hashlist) - set(hashlistrec)):
+                formatOutput(diff[1], normfname(diff[0]))
+        elif mode == "pos":
+            for inter in list(set(hashlistrec) & set(hashlist)):
+                formatOutput(inter[1], normfname(inter[0]))
 
 # Produce a Windows-style filename
 def winfname(filename):
@@ -58,8 +74,11 @@ def normfname(filename):
     
     opt_recursive = None
     opt_negmatch = None
+    opt_match = None
+    opt_nameonly = None
+    opt_hashtable = None
     opt_fast = None
-    opt_null = None
+    opt_endofline = ""
     opt_files = []
 
     if len(sys.argv) == 1:
@@ -73,16 +92,25 @@ def normfname(filename):
             opt_recursive = True
             continue
         elif i == '-0':
-            opt_null = True
+            opt_endofline = "\0"
             continue
         elif i == '-f':
             opt_fast = True
-        elif i == '-X':
+        elif i == '-X' or i == '-x':
             opt_negmatch = next(it)
             if not os.path.isfile(opt_negmatch):
                 sys.stdout.write("Cannot open negative match file %s\n"%opt_negmatch)
                 sys.exit(-1)
             continue
+        elif i == '-M' or i == '-m':
+            opt_match = next(it)
+            if not os.path.isfile(opt_match):
+                sys.stdout.write("Cannot open match file %s\n"%opt_match)
+                sys.exit(-1)
+            continue
+        elif i == '-n' and (opt_negmatch or opt_match):
+            opt_nameonly = True
+            continue
         else:
             opt_files.append(i)
 
@@ -92,6 +120,14 @@ def normfname(filename):
         # Default to optimize for low-memory systems
         md5blocklen=65536
 
+    # If we are not doing matching then we by-pass the hashtable
+    # this saves RAM and allows us to process much larger filesystems
+    if opt_negmatch or opt_match:
+        opt_hashtable = True
+    else:
+        opt_hashtable = False
+
+
     # Build a list of (hash,filename) for each file, regardless of specified 
     # options
     hashlist = []
@@ -106,16 +142,19 @@ def normfname(filename):
             for (directory, _, files) in os.walk(start):
                 for f in files:
                     path = os.path.join(directory, f)
-                    hashlist.append((path, md5sum(path, md5blocklen)))
+                    if opt_hashtable:
+                       hashlist.append((path, md5sum(path, md5blocklen)))
+                    else:
+	               formatOutput(md5sum(path, md5blocklen),  path)
+                       
 
-    # With the hashlist built, compare to the negative match list, or print
+    # With the hashlist built, compare to the negative/posative match list, or print
     # the results.
     if opt_negmatch:
-        validate_hashes(opt_negmatch, hashlist)
+        validate_hashes(opt_negmatch, hashlist, "neg")
+    elif opt_match:
+        validate_hashes(opt_match, hashlist, "pos")
     else:
         # Just print out the list with Windows-syle filenames
         for hash in hashlist:
-           if opt_null:
-              print "%s  %s\0"%(hash[1],winfname(hash[0]))
-           else:
-              print "%s  %s"%(hash[1],winfname(hash[0]))
+           formatOutput(hash[1],normfname(hash[0]))

From 100a1cc3d9c0e15ce68897ad75222c764113f146 Mon Sep 17 00:00:00 2001
From: elcrp96 <chris@pates.me.uk>
Date: Fri, 31 Jul 2015 17:26:37 +0100
Subject: [PATCH 2/8] Added -jnn support

Controls multi-threading. By default the program will create one
producer thread to scan the file system and one hashing thread per CPU
core. Multi-threading causes output filenames to be in
non-deterministic order, as files that take longer to hash will be
delayed while they are hashed. If a deterministic order is required,
specify -j0 to disable multi-threading
---
 md5deep.py | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/md5deep.py b/md5deep.py
index 45a43f1..a74320c 100755
--- a/md5deep.py
+++ b/md5deep.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python
 # MIT License, (c) Joshua Wright jwright@willhackforsushi.com
 # https://github.com/joswr1ght/md5deep
-import os, sys, hashlib, re
+import os, sys, hashlib, re, multiprocessing
+from Queue import Queue
+from threading import Thread
 
 # Reproduce this output with slashes consistent for Windows systems
 #ba2812a436909554688154be461d976c  A\SEC575-Clown-Chat\nvram
@@ -9,6 +11,8 @@
 # file regex
 md5Regex = re.compile(r'^(?P<hash>[a-f0-9]{32})  (?P<path>(/)?([^/\0]+(/)?)+)\n$')
 
+file_queue = Queue()
+
 # Optimized for low-memory systems, read whole file with blocksize=0
 def md5sum(filename, blocksize=65536):
     hash = hashlib.md5()
@@ -29,6 +33,7 @@ def usage():
     print "-f        - speed up hash calculations, using more memory."
     print "-f        - speed up hash calculations, using more memory."
     print "-0        - Uses a NULL character (/0) to terminate each line instead of a newline. Useful for processing filenames with strange characters."
+    print "-jnn      - Controls multi-threading. By default the program will create one producer thread to scan the file system and one hashing thread per CPU core. Multi-threading causes output filenames to be in non-deterministic order, as files that take longer to hash will be delayed while they are hashed. If a deterministic order is required, specify -j0 to disable multi-threading."
 
 def formatOutput(hash, path):
     if opt_nameonly:
@@ -58,10 +63,6 @@ def validate_hashes(hashfile, hashlist, mode):
             for inter in list(set(hashlistrec) & set(hashlist)):
                 formatOutput(inter[1], normfname(inter[0]))
 
-# Produce a Windows-style filename
-def winfname(filename):
-    return filename.replace("/","\\")
-
 # Normalize filename based on platform
 def normfname(filename):
     if os.name == 'nt': # Windows
@@ -69,6 +70,12 @@ def normfname(filename):
     else:
         return filename.replace("\\","/")
 
+# Worker thread function
+def calcMD5(i, q):
+    while True:
+        path = q.get()
+        formatOutput(md5sum(path), path)
+        q.task_done()
 
 if __name__ == '__main__':
     
@@ -80,6 +87,7 @@ def normfname(filename):
     opt_fast = None
     opt_endofline = ""
     opt_files = []
+    opt_threads = multiprocessing.cpu_count()
 
     if len(sys.argv) == 1:
         usage()
@@ -111,6 +119,9 @@ def normfname(filename):
         elif i == '-n' and (opt_negmatch or opt_match):
             opt_nameonly = True
             continue
+        elif i.startswith('-j'):
+            opt_threads = int(i[2:])
+            continue
         else:
             opt_files.append(i)
 
@@ -127,7 +138,12 @@ def normfname(filename):
     else:
         opt_hashtable = False
 
-
+    if opt_threads:
+        for i in range(opt_threads):
+            worker = Thread(target=calcMD5, args=(i, file_queue))
+            worker.setDaemon(True)
+            worker.start()
+ 
     # Build a list of (hash,filename) for each file, regardless of specified 
     # options
     hashlist = []
@@ -144,13 +160,19 @@ def normfname(filename):
                     path = os.path.join(directory, f)
                     if opt_hashtable:
                        hashlist.append((path, md5sum(path, md5blocklen)))
+                    elif opt_threads:
+                       # Add it to the queue
+                       file_queue.put(path)     
                     else:
+                       # Threading disabled
 	               formatOutput(md5sum(path, md5blocklen),  path)
                        
 
     # With the hashlist built, compare to the negative/posative match list, or print
     # the results.
-    if opt_negmatch:
+    if opt_threads:
+        file_queue.join()
+    elif opt_negmatch:
         validate_hashes(opt_negmatch, hashlist, "neg")
     elif opt_match:
         validate_hashes(opt_match, hashlist, "pos")

From 96801a3f483c4c693cbb99f594c6324588d6668a Mon Sep 17 00:00:00 2001
From: elcrp96 <chris@pates.me.uk>
Date: Fri, 31 Jul 2015 17:55:23 +0100
Subject: [PATCH 3/8] bugfix

md5sum was called without blocklen being specified, so it ran slowly.
---
 md5deep.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/md5deep.py b/md5deep.py
index a74320c..10c5553 100755
--- a/md5deep.py
+++ b/md5deep.py
@@ -41,7 +41,6 @@ def formatOutput(hash, path):
     else:
         print "%s  %s%s"%(hash, path, opt_endofline)
 
-
 def validate_hashes(hashfile, hashlist, mode):
     # Open file and build a new hashlist
     hashlistrec = []
@@ -74,7 +73,7 @@ def normfname(filename):
 def calcMD5(i, q):
     while True:
         path = q.get()
-        formatOutput(md5sum(path), path)
+	formatOutput(md5sum(path, md5blocklen),  path)
         q.task_done()
 
 if __name__ == '__main__':

From 49195c3d2589ab1f802bf99a84b61377ec0f296e Mon Sep 17 00:00:00 2001
From: elcrp96 <chris@pates.me.uk>
Date: Fri, 7 Aug 2015 17:09:45 +0100
Subject: [PATCH 4/8] Memory Management

Add an explicit limit on the number of entries in the Queue, as on some
very large filesystems the queue caused the process to run out of RAM.
---
 md5deep.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/md5deep.py b/md5deep.py
index 10c5553..42a3362 100755
--- a/md5deep.py
+++ b/md5deep.py
@@ -5,13 +5,16 @@
 from Queue import Queue
 from threading import Thread
 
+# To stop the queue from consuming all the RAM available
+MaxQueue = 1000
+
 # Reproduce this output with slashes consistent for Windows systems
 #ba2812a436909554688154be461d976c  A\SEC575-Clown-Chat\nvram
 
 # file regex
 md5Regex = re.compile(r'^(?P<hash>[a-f0-9]{32})  (?P<path>(/)?([^/\0]+(/)?)+)\n$')
 
-file_queue = Queue()
+file_queue = Queue(MaxQueue)
 
 # Optimized for low-memory systems, read whole file with blocksize=0
 def md5sum(filename, blocksize=65536):

From 4e221a0e18deffa09fd65e3c861e6085f0518a28 Mon Sep 17 00:00:00 2001
From: elcrp96 <chris@pates.me.uk>
Date: Mon, 10 Aug 2015 16:31:11 +0100
Subject: [PATCH 5/8] added -t yyyymmddThhmmss

Added support for processing only files changed since a given datetime.
---
 md5deep.py | 46 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/md5deep.py b/md5deep.py
index 42a3362..6c02160 100755
--- a/md5deep.py
+++ b/md5deep.py
@@ -4,6 +4,7 @@
 import os, sys, hashlib, re, multiprocessing
 from Queue import Queue
 from threading import Thread
+import time, datetime
 
 # To stop the queue from consuming all the RAM available
 MaxQueue = 1000
@@ -12,7 +13,7 @@
 #ba2812a436909554688154be461d976c  A\SEC575-Clown-Chat\nvram
 
 # file regex
-md5Regex = re.compile(r'^(?P<hash>[a-f0-9]{32})  (?P<path>(/)?([^/\0]+(/)?)+)\n$')
+md5FileRegex = re.compile(r'^(?P<hash>[a-f0-9]{32})  (?P<path>(/)?([^/\0]+(/)?)+)\n$')
 
 file_queue = Queue(MaxQueue)
 
@@ -22,7 +23,12 @@ def md5sum(filename, blocksize=65536):
     with open(filename, "rb") as f:
         for block in iter(lambda: f.read(blocksize), ""):
             hash.update(block)
-    return hash.hexdigest()
+    return hash.hexdigest().strip()
+
+def mod_datetime(filename):
+    t = os.path.getmtime(filename)
+    return datetime.datetime.fromtimestamp(t)
+
 
 def usage():
     print "Usage: md5deep.py [OPTIONS] [FILES]"
@@ -34,22 +40,26 @@ def usage():
     print "-x <file> - as above."
     print "-n        - used with -MmXx so only file name outputed."
     print "-f        - speed up hash calculations, using more memory."
-    print "-f        - speed up hash calculations, using more memory."
     print "-0        - Uses a NULL character (/0) to terminate each line instead of a newline. Useful for processing filenames with strange characters."
     print "-jnn      - Controls multi-threading. By default the program will create one producer thread to scan the file system and one hashing thread per CPU core. Multi-threading causes output filenames to be in non-deterministic order, as files that take longer to hash will be delayed while they are hashed. If a deterministic order is required, specify -j0 to disable multi-threading."
+    print "-t yyyymmddThhmmss - include only files modified after the timestamp provided."
 
 def formatOutput(hash, path):
+    hash = hash.replace(" ","")
+    path = path.replace("\r","")
+    path = path.replace("\n","")
+
     if opt_nameonly:
-        print "%s%s"%(path,  opt_endofline)
+        sys.stdout.write("%s%s"%(path,  opt_endofline))
     else:
-        print "%s  %s%s"%(hash, path, opt_endofline)
+        sys.stdout.write("%s  %s%s"%(hash, path, opt_endofline))
 
 def validate_hashes(hashfile, hashlist, mode):
     # Open file and build a new hashlist
     hashlistrec = []
     with open(hashfile, "r") as f:
         for line in f:
-            hashpair = md5Regex.match(line)
+            hashpair = md5FileRegex.match(line)
             if hashpair:
                filehash = hashpair.group('hash')
                filename = hashpair.group('path')
@@ -87,9 +97,10 @@ def calcMD5(i, q):
     opt_nameonly = None
     opt_hashtable = None
     opt_fast = None
-    opt_endofline = ""
+    opt_endofline = "\n"
     opt_files = []
     opt_threads = multiprocessing.cpu_count()
+    opt_timestamp =""
 
     if len(sys.argv) == 1:
         usage()
@@ -102,7 +113,7 @@ def calcMD5(i, q):
             opt_recursive = True
             continue
         elif i == '-0':
-            opt_endofline = "\0"
+            opt_endofline = "\0\n"
             continue
         elif i == '-f':
             opt_fast = True
@@ -124,6 +135,12 @@ def calcMD5(i, q):
         elif i.startswith('-j'):
             opt_threads = int(i[2:])
             continue
+        elif i == '-t':
+            opt_timestamp = datetime.datetime.strptime( next(it), "%Y%m%dT%H%M%S" )
+            if not opt_timestamp:
+                sys.stdout.write("Is not valid ISO timestamp %s\n"%opt_timestampe)
+                sys.exit(-1)
+            continue
         else:
             opt_files.append(i)
 
@@ -162,12 +179,13 @@ def calcMD5(i, q):
                     path = os.path.join(directory, f)
                     if opt_hashtable:
                        hashlist.append((path, md5sum(path, md5blocklen)))
-                    elif opt_threads:
-                       # Add it to the queue
-                       file_queue.put(path)     
-                    else:
-                       # Threading disabled
-	               formatOutput(md5sum(path, md5blocklen),  path)
+                    elif not opt_timestamp or (mod_datetime(path) > opt_timestamp and opt_timestamp):
+                       if opt_threads:
+                          # Add it to the queue
+                          file_queue.put(path)     
+                       else:
+                          # Threading disabled
+	                  formatOutput(md5sum(path, md5blocklen),  path)
                        
 
     # With the hashlist built, compare to the negative/posative match list, or print

From 7e097e69d414bead80587d7cd8236a57f3bf645e Mon Sep 17 00:00:00 2001
From: elcrp96 <chris@pates.me.uk>
Date: Wed, 19 Aug 2015 12:23:43 +0100
Subject: [PATCH 6/8] -0 option bug fix

Removed trailing newline when -0 specified.
---
 md5deep.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/md5deep.py b/md5deep.py
index 6c02160..a2b7b06 100755
--- a/md5deep.py
+++ b/md5deep.py
@@ -113,7 +113,7 @@ def calcMD5(i, q):
             opt_recursive = True
             continue
         elif i == '-0':
-            opt_endofline = "\0\n"
+            opt_endofline = "\0"
             continue
         elif i == '-f':
             opt_fast = True

From 45f50c033417973de47d64014d3889241e79b495 Mon Sep 17 00:00:00 2001
From: elcrp96 <chris@pates.me.uk>
Date: Mon, 14 Sep 2015 12:58:57 +0100
Subject: [PATCH 7/8] Error Handling

Added additional handling for IOErrors such as existence and permissions
---
 md5deep.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/md5deep.py b/md5deep.py
index a2b7b06..05e3b4f 100755
--- a/md5deep.py
+++ b/md5deep.py
@@ -4,7 +4,7 @@
 import os, sys, hashlib, re, multiprocessing
 from Queue import Queue
 from threading import Thread
-import time, datetime
+import time, datetime, errno
 
 # To stop the queue from consuming all the RAM available
 MaxQueue = 1000
@@ -20,10 +20,17 @@
 # Optimized for low-memory systems, read whole file with blocksize=0
 def md5sum(filename, blocksize=65536):
     hash = hashlib.md5()
-    with open(filename, "rb") as f:
-        for block in iter(lambda: f.read(blocksize), ""):
-            hash.update(block)
-    return hash.hexdigest().strip()
+  
+    try: 
+        with open(filename, "rb") as f:
+            for block in iter(lambda: f.read(blocksize), ""):
+                hash.update(block)
+        return hash.hexdigest().strip()
+    except IOError as e:
+        if e.errno == errno.EACCES:
+            sys.stderr.write("Permission denied: %s\n"%(filename))
+        return "00000000000000000000000000000000"
+        pass
 
 def mod_datetime(filename):
     t = os.path.getmtime(filename)

From afe5413310e4550ac8595439c3514c40f63f6de1 Mon Sep 17 00:00:00 2001
From: elcrp96 <chris@pates.me.uk>
Date: Mon, 14 Sep 2015 22:52:50 +0100
Subject: [PATCH 8/8] Add option to include Symlink

The inclusion of symlink targets is now optional. Symlinks will be
ignored unless -s is specified.
---
 md5deep.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/md5deep.py b/md5deep.py
index 05e3b4f..c7f0453 100755
--- a/md5deep.py
+++ b/md5deep.py
@@ -49,6 +49,7 @@ def usage():
     print "-f        - speed up hash calculations, using more memory."
     print "-0        - Uses a NULL character (/0) to terminate each line instead of a newline. Useful for processing filenames with strange characters."
     print "-jnn      - Controls multi-threading. By default the program will create one producer thread to scan the file system and one hashing thread per CPU core. Multi-threading causes output filenames to be in non-deterministic order, as files that take longer to hash will be delayed while they are hashed. If a deterministic order is required, specify -j0 to disable multi-threading."
+    print "-s        - includes the targets of symlinks in the cargo, by default they are ignored."
     print "-t yyyymmddThhmmss - include only files modified after the timestamp provided."
 
 def formatOutput(hash, path):
@@ -107,6 +108,7 @@ def calcMD5(i, q):
     opt_endofline = "\n"
     opt_files = []
     opt_threads = multiprocessing.cpu_count()
+    opt_symlink = False
     opt_timestamp =""
 
     if len(sys.argv) == 1:
@@ -142,6 +144,9 @@ def calcMD5(i, q):
         elif i.startswith('-j'):
             opt_threads = int(i[2:])
             continue
+        elif i == '-s':
+            opt_symlink = True
+            continue
         elif i == '-t':
             opt_timestamp = datetime.datetime.strptime( next(it), "%Y%m%dT%H%M%S" )
             if not opt_timestamp:
@@ -184,15 +189,16 @@ def calcMD5(i, q):
             for (directory, _, files) in os.walk(start):
                 for f in files:
                     path = os.path.join(directory, f)
-                    if opt_hashtable:
-                       hashlist.append((path, md5sum(path, md5blocklen)))
-                    elif not opt_timestamp or (mod_datetime(path) > opt_timestamp and opt_timestamp):
-                       if opt_threads:
-                          # Add it to the queue
-                          file_queue.put(path)     
-                       else:
-                          # Threading disabled
-	                  formatOutput(md5sum(path, md5blocklen),  path)
+                    if (not(opt_symlink and os.path.islink(path))):
+                        if opt_hashtable:
+                            hashlist.append((path, md5sum(path, md5blocklen)))
+                        elif not opt_timestamp or (mod_datetime(path) > opt_timestamp and opt_timestamp):
+                            if opt_threads:
+                                # Add it to the queue
+                                file_queue.put(path)     
+                            else:
+                                # Threading disabled
+	                        formatOutput(md5sum(path, md5blocklen),  path)
                        
 
     # With the hashlist built, compare to the negative/posative match list, or print