From b3bf93261f37c87a08f96c1c0fc5037ef0b94d8d Mon Sep 17 00:00:00 2001
From: serbriri <serbriri@gmail.com>
Date: Tue, 4 Feb 2014 17:44:37 +0100
Subject: [PATCH 1/5] Modified malwaredl function

- Switch the import to the bs4 (BeautifulSoup 4) package
- Rewrite malwaredl to iterate over <item> elements with find_all
---
 mwcrawler.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/mwcrawler.py b/mwcrawler.py
index 19d0d56..d2defab 100644
--- a/mwcrawler.py
+++ b/mwcrawler.py
@@ -17,9 +17,9 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 # Requirements:
-# - BeautifulSoup 3.0.8
+# - BeautifulSoup 4
 
-from BeautifulSoup import BeautifulSoup as bs
+from bs4 import BeautifulSoup as bs
 import sys
 import hashlib
 import re
@@ -103,20 +103,21 @@ def decisor(url):
 			file.close
 			print "-- Saved file type %s with md5: %s" % (filetype,md5)
 
+# Process MalwareDomainList URLs
+#
 def malwaredl(soup):
 	print "- Fetching from Malware Domain List"
-	mdl=[]
-	for row in soup('description'):
-		mdl.append(row)
-	del mdl[0]
 	mdl_sites=[]
-	for row in mdl:
-		site = re.sub('&amp;','&',str(row).split()[1]).replace(',','')
-		if site == '-':
-			mdl_sites.append(re.sub('&amp;','&',str(row).split()[4]).replace(',',''))
-		else:
-			mdl_sites.append(site)
-	print "-- Found %s urls" % len(mdl)
+	
+	for i in soup.find_all('item'):
+		site_url = (i.description.string).split()[1].replace(',','')
+		
+		if site_url == '-':
+			site_url = (i.description.string).split()[4].replace(',','')
+		
+		mdl_sites.append(site_url)
+	
+	print "-- Found %s urls" % len(mdl_sites)
 	for row in mdl_sites:
 		decisor(row)
 
@@ -198,4 +199,4 @@ def sacour(soup):
 	vxvault(parse('http://vxvault.siri-urz.net/URL_List.php'))
 	malc0de(parse('http://malc0de.com/rss'))
 	malwarebl(parse('http://www.malwareblacklist.com/mbl.xml'))
-	sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)))
\ No newline at end of file
+	sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)))

From 50285002a51f26c7f78f598f1984719102ab103a Mon Sep 17 00:00:00 2001
From: serbriri <serbriri@gmail.com>
Date: Wed, 5 Feb 2014 12:13:49 +0100
Subject: [PATCH 2/5] Modified vxvault, malc0de, malwarebl and main

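- Rewrite the vxvault, malc0de and malwarebl functions
- Wrap each source call in the main block in its own try/except
- Re-indent the file from tabs to four spaces and move the download
  folder, HTTP timeout and Thug path into module-level constants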
---
 mwcrawler.py | 325 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 182 insertions(+), 143 deletions(-)

diff --git a/mwcrawler.py b/mwcrawler.py
index d2defab..bbddff2 100644
--- a/mwcrawler.py
+++ b/mwcrawler.py
@@ -17,186 +17,225 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 # Requirements:
-# - BeautifulSoup 4
+# - BeautifulSoup 4 (bs4), upgraded from BeautifulSoup 3.0.8
 
 from bs4 import BeautifulSoup as bs
+
 import sys
 import hashlib
 import re
 import urllib2
 import magic
-import os 
+import os
 import socket
 import datetime
 
+UNSORTED_FOLDER = '/opt/malware/unsorted'
+HTTP_TIMEOUT = 15
+THUG_PATH = '/opt/thug/src'
+
 # By default thug analyis is disabled
-isthug	= False
+isthug = False
 
 # variable for date value manipulation
-now		= datetime.datetime.now()
+now = datetime.datetime.now()
 str(now)
 
 # maximum wait time of http gets
-timeout	= 15
-socket.setdefaulttimeout(timeout)
+socket.setdefaulttimeout(HTTP_TIMEOUT)
 
 # load thug function, also checks if thug is installed
 def loadthug():
-	try:
-		sys.path.append('/opt/thug/src')
-		import thug
-		isthug = True
-		print "- Thug module loaded for html analysis"
-	except ImportError:
-		print "- No Thug module found, html code inspection won't be available"
+    try:
+        sys.path.append(THUG_PATH)
+        import thug
+
+        isthug = True
+        print "- Thug module loaded for html analysis"
+    except ImportError:
+        print "- No Thug module found, html code inspection won't be available"
 
 # determine file type for correct archival
 def gettype(file):
-	ms = magic.open(magic.MAGIC_NONE)
-	ms.load()
-	return ms.buffer(file)
+    ms = magic.open(magic.MAGIC_NONE)
+    ms.load()
+    return ms.buffer(file)
 
 # beautifulsoup parser
 def parse(url):
-	request = urllib2.Request(url)
-	request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)')
-	try:
-		http = bs(urllib2.urlopen(request))
-	except:
-		print "- Error parsing %s" % (url)
-		return
-	return http
+    request = urllib2.Request(url)
+    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)')
+    try:
+        http = bs(urllib2.urlopen(request))
+    except:
+        print "- Error parsing %s" % (url)
+        return
+    return http
+
 
 def decisor(url):
-	if not re.match('http',url):
-		url = 'http://'+url
-
-	try:
-		url_dl = urllib2.urlopen(url).read()
-	except Exception, e:
-		#print "-- Error: %s" % e
-		return
-
-	filetype = gettype(url_dl).split(' ')[0]
-	md5      = hashlib.md5(url_dl).hexdigest()
-
-	if (filetype == 'HTML'):
-		if isthug:
-			print "-- Thug candidate: HTML code in %s" % url
-
-			try:
-				thug.Thug([url])()
-			except Exception, e:
-				print "- Thug error: %s" % e
-				return
-
-	else:
-		dest = '/opt/malware/unsorted/'+filetype
-		fpath = dest+'/'+str(md5)
-
-		if not os.path.exists(dest):
-			os.makedirs(dest)
-
-		if not os.path.exists(fpath):
-			file = open(fpath, 'wb')
-			file.write(url_dl)
-			file.close
-			print "-- Saved file type %s with md5: %s" % (filetype,md5)
-
-# Process MalwareDomainList URLs
+    if not re.match('http', url):
+        url = 'http://' + url
+
+    try:
+        url_dl = urllib2.urlopen(url).read()
+    except Exception, e:
+        #print "-- Error: %s" % e
+        return
+
+    filetype = gettype(url_dl).split(' ')[0]
+    md5 = hashlib.md5(url_dl).hexdigest()
+
+    if (filetype == 'HTML'):
+        if isthug:
+            print "-- Thug candidate: HTML code in %s" % url
+
+            try:
+                thug.Thug([url])()
+            except Exception, e:
+                print "- Thug error: %s" % e
+                return
+
+    else:
+        dest = UNSORTED_FOLDER + filetype
+        fpath = dest + '/' + str(md5)
+
+        if not os.path.exists(dest):
+            os.makedirs(dest)
+
+        if not os.path.exists(fpath):
+            file = open(fpath, 'wb')
+            file.write(url_dl)
+            file.close
+            print "-- Saved file type %s with md5: %s" % (filetype, md5)
+
+# Process links from MalwareDomainList
 #
+# Modified for better performance and clarity ------------------------------------------------------------------
 def malwaredl(soup):
-	print "- Fetching from Malware Domain List"
-	mdl_sites=[]
-	
-	for i in soup.find_all('item'):
-		site_url = (i.description.string).split()[1].replace(',','')
-		
-		if site_url == '-':
-			site_url = (i.description.string).split()[4].replace(',','')
-		
-		mdl_sites.append(site_url)
-	
-	print "-- Found %s urls" % len(mdl_sites)
-	for row in mdl_sites:
-		decisor(row)
+    print "[+] Fetching from Malware Domain List"
+    mdl_sites = []
+    for i in soup.find_all('item'):
+        site_url = (i.description.string).split()[1].replace(',', '')
 
-def vxvault(soup):
-	print "- Fetching from VXVault"
-	vxv=[]
-	for row in soup('pre'):
-		vxv = row.string.split('\r\n')
-	del vxv[:4]
-	del vxv[-1]
-	print "-- Found %s urls" % len(vxv)
-	for row in vxv:
-		decisor(row)
+        if site_url != '-':
+            mdl_sites.append(site_url)
+        else:
+            mdl_sites.append((i.description.string).split()[4].replace(',', ''))
+
+    print "\t [+] Found %s urls" % len(mdl_sites)
+    for row in mdl_sites:
+        decisor(row)
 
+# Process links from VXVault
+#
+# Modified for better performance and clarity ------------------------------------------------------------------
+def vxvault(soup):
+    print "[+] Fetching from VXVault"
+    vxv = []
+    for row in soup('pre'):
+        vxv = row.string.split('\n')
+    vxv = vxv[4:]
+    print "\t [+] Found %s urls" % len(vxv)
+    for row in vxv:
+        decisor(row)
+
+# Process links from Malc0de
+#
+# Modified for better performance and clarity ------------------------------------------------------------------
 def malc0de(soup):
-	print "- Fetching from Malc0de"
-	mlc=[]
-	for row in soup('description'):
-		mlc.append(row)
-	del mlc[0]
-	mlc_sites=[]
-	for row in mlc:
-		site = re.sub('&amp;','&',str(row).split()[1]).replace(',','')
-		mlc_sites.append(site)
-	print "-- Found %s urls" % len(mlc_sites)
-	for row in mlc_sites:
-		decisor(row)
+    print "[+] Fetching from Malware Domain List"
+    mlc_sites = []
+    for i in soup.find_all('item'):
+        site_url = "http://" + re.sub('&amp;', '&', i.description.string.split()[1]).replace(',', '')
+        mlc_sites.append(site_url)
+
+    print "\t [+] Found %s urls" % len(mlc_sites)
+    for row in mlc_sites:
+        decisor(row)
 
+# Process links from Malware Black List
+#
+# Modified for better performance and clarity ------------------------------------------------------------------
 def malwarebl(soup):
-	print "- Fetching from Malware Black List"
-	mbl=[]
-	for row in soup('description'):
-		site = str(row).split()[1].replace(',','')
-		mbl.append(site)
-	print "-- Found %s urls" % len(mbl)
-	for row in mbl:
-		decisor(row)
+    print "- Fetching from Malware Black List"
+    mbl_sites = []
+
+    for i in soup.find_all('item'):
+        site_url = "http://" + re.sub('&amp;', '&', i.description.string.split()[1]).replace(',', '')
+        mbl_sites.append(site_url)
+
+    print "\t [+] Found %s urls" % len(mbl_sites)
+    for row in mbl_sites:
+        decisor(row)
+
 
 def minotaur(soup):
-	print "- Fetching from NovCon Minotaur"
-	min=[]
-	for row in soup('td'):
-		try:
-			if re.match('http',row.string):
-				min.append(row.string)
-		except:
-			pass
-	print "-- Found %s urls" % len(min)
-	for row in min: 
-		decisor(row)
+    print "- Fetching from NovCon Minotaur"
+    min = []
+    for row in soup('td'):
+        try:
+            if re.match('http', row.string):
+                min.append(row.string)
+        except:
+            pass
+    print "-- Found %s urls" % len(min)
+    for row in min:
+        decisor(row)
+
 
 def sacour(soup):
-	print "- Fetching from Sacour.cn"
-	for url in soup('a'):
-		min=[]
-		if re.match('list/',url['href']):
-			suburl = parse('http://www.sacour.cn/'+url['href'])
-			for text in suburl('body'):
-				for urls in text.contents:
-					if re.match('http://',str(urls)):
-						min.append(str(urls))
-		if len(min) > 0:
-			print "-- Found %s urls in %s" % (len(min),url['href'])
-			for row in min:
-				decisor(row)
+    print "- Fetching from Sacour.cn"
+    for url in soup('a'):
+        min = []
+        if re.match('list/', url['href']):
+            suburl = parse('http://www.sacour.cn/' + url['href'])
+            for text in suburl('body'):
+                for urls in text.contents:
+                    if re.match('http://', str(urls)):
+                        min.append(str(urls))
+        if len(min) > 0:
+            print "-- Found %s urls in %s" % (len(min), url['href'])
+            for row in min:
+                decisor(row)
+
 
 if __name__ == "__main__":
-	print "Malware Parser v0.4"
-
-	try:
-		if sys.argv[1] == '-t':
-			loadthug()
-	except:
-		print "- Thug analysis not enabled (use -t to enable thug)"
-
-	#source list
-	minotaur(parse('http://minotauranalysis.com/malwarelist-urls.aspx'))
-	malwaredl(parse('http://www.malwaredomainlist.com/hostslist/mdl.xml'))
-	vxvault(parse('http://vxvault.siri-urz.net/URL_List.php'))
-	malc0de(parse('http://malc0de.com/rss'))
-	malwarebl(parse('http://www.malwareblacklist.com/mbl.xml'))
-	sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)))
+    print "Malware Parser v0.4"
+
+    try:
+        if sys.argv[1] == '-t':
+            loadthug()
+    except:
+        print "- Thug analysis not enabled (use -t to enable thug)"
+
+    #source list
+    try:
+        minotaur(parse('http://minotauranalysis.com/malwarelist-urls.aspx'))
+    except:
+        print "[ERROR] Unexpected error processing Minotaur List"
+
+    try:
+        malwaredl(parse('http://www.malwaredomainlist.com/hostslist/mdl.xml'))
+    except:
+        print "[ERROR] Unexpected error processing Malware Domain List"
+
+    try:
+        vxvault(parse('http://vxvault.siri-urz.net/URL_List.php'))
+    except:
+        print "[ERROR] Unexpected error processing VXVault List"
+
+    try:
+        malc0de(parse('http://malc0de.com/rss'))
+    except:
+        print "[ERROR] Unexpected error processing Malc0de List"
+
+    try:
+        malwarebl(parse('http://www.malwareblacklist.com/mbl.xml'))
+    except
+    print "[ERROR] Unexpected error processing Malware Black List"
+
+try:
+    sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)))
+except:
+    print "[ERROR] Unexpected error processing Sacour List"

From 1f48d2c8c775cfe37f85653bf17d4643fb62fe59 Mon Sep 17 00:00:00 2001
From: serbriri <serbriri@gmail.com>
Date: Thu, 6 Feb 2014 18:13:13 +0100
Subject: [PATCH 3/5] Fix except syntax and indentation in main block

---
 mwcrawler.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mwcrawler.py b/mwcrawler.py
index bbddff2..4cf27b9 100644
--- a/mwcrawler.py
+++ b/mwcrawler.py
@@ -232,10 +232,10 @@ def sacour(soup):
 
     try:
         malwarebl(parse('http://www.malwareblacklist.com/mbl.xml'))
-    except
-    print "[ERROR] Unexpected error processing Malware Black List"
+    except:
+        print "[ERROR] Unexpected error processing Malware Black List"
 
-try:
-    sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)))
-except:
-    print "[ERROR] Unexpected error processing Sacour List"
+    try:
+        sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)))
+    except:
+        print "[ERROR] Unexpected error processing Sacour List"

From 7e3c2c796603e5e15dd76bad5b840d56dd491328 Mon Sep 17 00:00:00 2001
From: serbriri <serbriri@gmail.com>
Date: Thu, 6 Feb 2014 18:14:55 +0100
Subject: [PATCH 4/5] Reformat the saved-file status message in decisor

---
 mwcrawler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mwcrawler.py b/mwcrawler.py
index 4cf27b9..6214401 100644
--- a/mwcrawler.py
+++ b/mwcrawler.py
@@ -107,7 +107,7 @@ def decisor(url):
             file = open(fpath, 'wb')
             file.write(url_dl)
             file.close
-            print "-- Saved file type %s with md5: %s" % (filetype, md5)
+            print "\t\t [*] Saved file type %s with md5: %s" % (filetype, md5)
 
 # Process links from MalwareDomainList
 #

From cf25038ed93d4cf21013f49249ac09ac8e89023e Mon Sep 17 00:00:00 2001
From: serbriri <serbriri@gmail.com>
Date: Thu, 6 Feb 2014 18:29:17 +0100
Subject: [PATCH 5/5] Fix source name printed by malc0de

---
 mwcrawler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mwcrawler.py b/mwcrawler.py
index 6214401..eb73a4d 100644
--- a/mwcrawler.py
+++ b/mwcrawler.py
@@ -144,7 +144,7 @@ def vxvault(soup):
 #
 # Modified for better performance and clarity ------------------------------------------------------------------
 def malc0de(soup):
-    print "[+] Fetching from Malware Domain List"
+    print "[+] Fetching from MalC0de List"
     mlc_sites = []
     for i in soup.find_all('item'):
         site_url = "http://" + re.sub('&amp;', '&', i.description.string.split()[1]).replace(',', '')