From b3bf93261f37c87a08f96c1c0fc5037ef0b94d8d Mon Sep 17 00:00:00 2001 From: serbriri Date: Tue, 4 Feb 2014 17:44:37 +0100 Subject: [PATCH 1/5] Modified malwaredl function - Using new bs4 BeautifulSoup version - Modified malwaredl function --- mwcrawler.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/mwcrawler.py b/mwcrawler.py index 19d0d56..d2defab 100644 --- a/mwcrawler.py +++ b/mwcrawler.py @@ -17,9 +17,9 @@ # along with this program. If not, see . # # Requirements: -# - BeautifulSoup 3.0.8 +# - BeautifulSoup 4 -from BeautifulSoup import BeautifulSoup as bs +from bs4 import BeautifulSoup as bs import sys import hashlib import re @@ -103,20 +103,21 @@ def decisor(url): file.close print "-- Saved file type %s with md5: %s" % (filetype,md5) +# Process MalwareDomainList URLs +# def malwaredl(soup): print "- Fetching from Malware Domain List" - mdl=[] - for row in soup('description'): - mdl.append(row) - del mdl[0] mdl_sites=[] - for row in mdl: - site = re.sub('&','&',str(row).split()[1]).replace(',','') - if site == '-': - mdl_sites.append(re.sub('&','&',str(row).split()[4]).replace(',','')) - else: - mdl_sites.append(site) - print "-- Found %s urls" % len(mdl) + + for i in soup.find_all('item'): + site_url = (i.description.string).split()[1].replace(',','') + + if site_url == '-': + site_url = (i.description.string).split()[4].replace(',','') + + mdl_sites.append(site_url) + + print "-- Found %s urls" % len(mdl_sites) for row in mdl_sites: decisor(row) @@ -198,4 +199,4 @@ def sacour(soup): vxvault(parse('http://vxvault.siri-urz.net/URL_List.php')) malc0de(parse('http://malc0de.com/rss')) malwarebl(parse('http://www.malwareblacklist.com/mbl.xml')) - sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year))) \ No newline at end of file + sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year))) From 50285002a51f26c7f78f598f1984719102ab103a Mon Sep 17 00:00:00 2001 From: serbriri Date: Wed, 5 Feb 2014 12:13:49 +0100 Subject: [PATCH 2/5] Modified vxvault, malc0de, malwarebl and main - Modified vxvault, malc0de and malwarebl functions - Added try statements in main function --- mwcrawler.py | 325 ++++++++++++++++++++++++++++----------------------- 1 file changed, 182 insertions(+), 143 deletions(-) diff --git a/mwcrawler.py b/mwcrawler.py index d2defab..bbddff2 100644 --- a/mwcrawler.py +++ b/mwcrawler.py @@ -17,186 +17,225 @@ # along with this program. If not, see . # # Requirements: -# - BeautifulSoup 4 +# - BeautifulSoup 3.0.8 (Upgraded to BeautifulSoup 4 ) from bs4 import BeautifulSoup as bs + import sys import hashlib import re import urllib2 import magic -import os +import os import socket import datetime +UNSORTED_FOLDER = '/opt/malware/unsorted' +HTTP_TIMEOUT = 15 +THUG_PATH = '/opt/thug/src' + # By default thug analyis is disabled -isthug = False +isthug = False # variable for date value manipulation -now = datetime.datetime.now() +now = datetime.datetime.now() str(now) # maximum wait time of http gets -timeout = 15 -socket.setdefaulttimeout(timeout) +socket.setdefaulttimeout(HTTP_TIMEOUT) # load thug function, also checks if thug is installed def loadthug(): - try: - sys.path.append('/opt/thug/src') - import thug - isthug = True - print "- Thug module loaded for html analysis" - except ImportError: - print "- No Thug module found, html code inspection won't be available" + try: + sys.path.append(THUG_PATH) + import thug + + isthug = True + print "- Thug module loaded for html analysis" + except ImportError: + print "- No Thug module found, html code inspection won't be available" # determine file type for correct archival def gettype(file): - ms = magic.open(magic.MAGIC_NONE) - ms.load() - return ms.buffer(file) + ms = magic.open(magic.MAGIC_NONE) + ms.load() + return ms.buffer(file) # beautifulsoup parser def parse(url): - request = urllib2.Request(url) - request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)') - try: - http = bs(urllib2.urlopen(request)) - except: - print "- Error parsing %s" % (url) - return - return http + request = urllib2.Request(url) + request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)') + try: + http = bs(urllib2.urlopen(request)) + except: + print "- Error parsing %s" % (url) + return + return http + def decisor(url): - if not re.match('http',url): - url = 'http://'+url - - try: - url_dl = urllib2.urlopen(url).read() - except Exception, e: - #print "-- Error: %s" % e - return - - filetype = gettype(url_dl).split(' ')[0] - md5 = hashlib.md5(url_dl).hexdigest() - - if (filetype == 'HTML'): - if isthug: - print "-- Thug candidate: HTML code in %s" % url - - try: - thug.Thug([url])() - except Exception, e: - print "- Thug error: %s" % e - return - - else: - dest = '/opt/malware/unsorted/'+filetype - fpath = dest+'/'+str(md5) - - if not os.path.exists(dest): - os.makedirs(dest) - - if not os.path.exists(fpath): - file = open(fpath, 'wb') - file.write(url_dl) - file.close - print "-- Saved file type %s with md5: %s" % (filetype,md5) - -# Process MalwareDomainList URLs + if not re.match('http', url): + url = 'http://' + url + + try: + url_dl = urllib2.urlopen(url).read() + except Exception, e: + #print "-- Error: %s" % e + return + + filetype = gettype(url_dl).split(' ')[0] + md5 = hashlib.md5(url_dl).hexdigest() + + if (filetype == 'HTML'): + if isthug: + print "-- Thug candidate: HTML code in %s" % url + + try: + thug.Thug([url])() + except Exception, e: + print "- Thug error: %s" % e + return + + else: + dest = UNSORTED_FOLDER + filetype + fpath = dest + '/' + str(md5) + + if not os.path.exists(dest): + os.makedirs(dest) + + if not os.path.exists(fpath): + file = open(fpath, 'wb') + file.write(url_dl) + file.close + print "-- Saved file type %s with md5: %s" % (filetype, md5) + +# Process links from MalwareDomainList # +# Modified for better performance and clarity ------------------------------------------------------------------ def malwaredl(soup): - print "- Fetching from Malware Domain List" - mdl_sites=[] - - for i in soup.find_all('item'): - site_url = (i.description.string).split()[1].replace(',','') - - if site_url == '-': - site_url = (i.description.string).split()[4].replace(',','') - - mdl_sites.append(site_url) - - print "-- Found %s urls" % len(mdl_sites) - for row in mdl_sites: - decisor(row) + print "[+] Fetching from Malware Domain List" + mdl_sites = [] + for i in soup.find_all('item'): + site_url = (i.description.string).split()[1].replace(',', '') -def vxvault(soup): - print "- Fetching from VXVault" - vxv=[] - for row in soup('pre'): - vxv = row.string.split('\r\n') - del vxv[:4] - del vxv[-1] - print "-- Found %s urls" % len(vxv) - for row in vxv: - decisor(row) + if site_url != '-': + mdl_sites.append(site_url) + else: + mdl_sites.append((i.description.string).split()[4].replace(',', '')) + + print "\t [+] Found %s urls" % len(mdl_sites) + for row in mdl_sites: + decisor(row) +# Process links from VXVault +# +# Modified for better performance and clarity ------------------------------------------------------------------ +def vxvault(soup): + print "[+] Fetching from VXVault" + vxv = [] + for row in soup('pre'): + vxv = row.string.split('\n') + vxv = vxv[4:] + print "\t [+] Found %s urls" % len(vxv) + for row in vxv: + decisor(row) + +# Process links from Malc0de +# +# Modified for better performance and clarity ------------------------------------------------------------------ def malc0de(soup): - print "- Fetching from Malc0de" - mlc=[] - for row in soup('description'): - mlc.append(row) - del mlc[0] - mlc_sites=[] - for row in mlc: - site = re.sub('&','&',str(row).split()[1]).replace(',','') - mlc_sites.append(site) - print "-- Found %s urls" % len(mlc_sites) - for row in mlc_sites: - decisor(row) + print "[+] Fetching from Malware Domain List" + mlc_sites = [] + for i in soup.find_all('item'): + site_url = "http://" + re.sub('&', '&', i.description.string.split()[1]).replace(',', '') + mlc_sites.append(site_url) + + print "\t [+] Found %s urls" % len(mlc_sites) + for row in mlc_sites: + decisor(row) +# Process links from Malware Black List +# +# Modified for better performance and clarity ------------------------------------------------------------------ def malwarebl(soup): - print "- Fetching from Malware Black List" - mbl=[] - for row in soup('description'): - site = str(row).split()[1].replace(',','') - mbl.append(site) - print "-- Found %s urls" % len(mbl) - for row in mbl: - decisor(row) + print "- Fetching from Malware Black List" + mbl_sites = [] + + for i in soup.find_all('item'): + site_url = "http://" + re.sub('&', '&', i.description.string.split()[1]).replace(',', '') + mbl_sites.append(site_url) + + print "\t [+] Found %s urls" % len(mbl_sites) + for row in mbl_sites: + decisor(row) + def minotaur(soup): - print "- Fetching from NovCon Minotaur" - min=[] - for row in soup('td'): - try: - if re.match('http',row.string): - min.append(row.string) - except: - pass - print "-- Found %s urls" % len(min) - for row in min: - decisor(row) + print "- Fetching from NovCon Minotaur" + min = [] + for row in soup('td'): + try: + if re.match('http', row.string): + min.append(row.string) + except: + pass + print "-- Found %s urls" % len(min) + for row in min: + decisor(row) + def sacour(soup): - print "- Fetching from Sacour.cn" - for url in soup('a'): - min=[] - if re.match('list/',url['href']): - suburl = parse('http://www.sacour.cn/'+url['href']) - for text in suburl('body'): - for urls in text.contents: - if re.match('http://',str(urls)): - min.append(str(urls)) - if len(min) > 0: - print "-- Found %s urls in %s" % (len(min),url['href']) - for row in min: - decisor(row) + print "- Fetching from Sacour.cn" + for url in soup('a'): + min = [] + if re.match('list/', url['href']): + suburl = parse('http://www.sacour.cn/' + url['href']) + for text in suburl('body'): + for urls in text.contents: + if re.match('http://', str(urls)): + min.append(str(urls)) + if len(min) > 0: + print "-- Found %s urls in %s" % (len(min), url['href']) + for row in min: + decisor(row) + if __name__ == "__main__": - print "Malware Parser v0.4" - - try: - if sys.argv[1] == '-t': - loadthug() - except: - print "- Thug analysis not enabled (use -t to enable thug)" - - #source list - minotaur(parse('http://minotauranalysis.com/malwarelist-urls.aspx')) - malwaredl(parse('http://www.malwaredomainlist.com/hostslist/mdl.xml')) - vxvault(parse('http://vxvault.siri-urz.net/URL_List.php')) - malc0de(parse('http://malc0de.com/rss')) - malwarebl(parse('http://www.malwareblacklist.com/mbl.xml')) - sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year))) + print "Malware Parser v0.4" + + try: + if sys.argv[1] == '-t': + loadthug() + except: + print "- Thug analysis not enabled (use -t to enable thug)" + + #source list + try: + minotaur(parse('http://minotauranalysis.com/malwarelist-urls.aspx')) + except: + print "[ERROR] Unexpected error processing Minotaur List" + + try: + malwaredl(parse('http://www.malwaredomainlist.com/hostslist/mdl.xml')) + except: + print "[ERROR] Unexpected error processing Malware Domain List" + + try: + vxvault(parse('http://vxvault.siri-urz.net/URL_List.php')) + except: + print "[ERROR] Unexpected error processing VXVault List" + + try: + malc0de(parse('http://malc0de.com/rss')) + except: + print "[ERROR] Unexpected error processing Malc0de List" + + try: + malwarebl(parse('http://www.malwareblacklist.com/mbl.xml')) + except + print "[ERROR] Unexpected error processing Malware Black List" + +try: + sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year))) +except: + print "[ERROR] Unexpected error processing Sacour List" From 1f48d2c8c775cfe37f85653bf17d4643fb62fe59 Mon Sep 17 00:00:00 2001 From: serbriri Date: Thu, 6 Feb 2014 18:13:13 +0100 Subject: [PATCH 3/5] Update mwcrawler.py --- mwcrawler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mwcrawler.py b/mwcrawler.py index bbddff2..4cf27b9 100644 --- a/mwcrawler.py +++ b/mwcrawler.py @@ -232,10 +232,10 @@ def sacour(soup): try: malwarebl(parse('http://www.malwareblacklist.com/mbl.xml')) - except - print "[ERROR] Unexpected error processing Malware Black List" + except: + print "[ERROR] Unexpected error processing Malware Black List" -try: - sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year))) -except: - print "[ERROR] Unexpected error processing Sacour List" + try: + sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year))) + except: + print "[ERROR] Unexpected error processing Sacour List" From 7e3c2c796603e5e15dd76bad5b840d56dd491328 Mon Sep 17 00:00:00 2001 From: serbriri Date: Thu, 6 Feb 2014 18:14:55 +0100 Subject: [PATCH 4/5] Update mwcrawler.py --- mwcrawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwcrawler.py b/mwcrawler.py index 4cf27b9..6214401 100644 --- a/mwcrawler.py +++ b/mwcrawler.py @@ -107,7 +107,7 @@ def decisor(url): file = open(fpath, 'wb') file.write(url_dl) file.close - print "-- Saved file type %s with md5: %s" % (filetype, md5) + print "\t\t [*] Saved file type %s with md5: %s" % (filetype, md5) # Process links from MalwareDomainList # From cf25038ed93d4cf21013f49249ac09ac8e89023e Mon Sep 17 00:00:00 2001 From: serbriri Date: Thu, 6 Feb 2014 18:29:17 +0100 Subject: [PATCH 5/5] Update mwcrawler.py --- mwcrawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwcrawler.py b/mwcrawler.py index 6214401..eb73a4d 100644 --- a/mwcrawler.py +++ b/mwcrawler.py @@ -144,7 +144,7 @@ def vxvault(soup): # # Modified for better performance and clarity ------------------------------------------------------------------ def malc0de(soup): - print "[+] Fetching from Malware Domain List" + print "[+] Fetching from MalC0de List" mlc_sites = [] for i in soup.find_all('item'): site_url = "http://" + re.sub('&', '&', i.description.string.split()[1]).replace(',', '')