From 0b8a17b3b85336b824efb0f65e4108863f31b3e4 Mon Sep 17 00:00:00 2001 From: "P.S.Narayanan" Date: Thu, 29 Jun 2017 15:11:50 +0530 Subject: [PATCH] Update v1.0.0 (#17) * Update README.md * Major Update 1.0.0 * Fixed Unittest --- README.md | 3 ++- modules/getweblinks.py | 31 +++++++++++++++++++++---------- tests/test_getweblinks.py | 7 ++++--- torBot.py | 24 +++++++++++++++++++----- 4 files changed, 46 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 0bf1606a..a21afc69 100755 --- a/README.md +++ b/README.md @@ -45,7 +45,8 @@ ## A python web crawler for Deep and Dark Web. -[![Build Status](https://travis-ci.org/DedSecInside/TorBoT.svg?branch=master)](https://travis-ci.org/DedSecInside/TorBoT)[![](https://img.shields.io/badge/Donate-Bitcoin-blue.svg?style=flat-square)](https://blockchain.info/address/14st7SzDbQZuu8fpQ74x477WoRJ7gpHFaj) +[![Build Status](https://travis-ci.org/DedSecInside/TorBoT.svg?branch=master)](https://travis-ci.org/DedSecInside/TorBoT) +[![](https://img.shields.io/badge/Donate-Bitcoin-blue.svg?style=flat-square)](https://blockchain.info/address/14st7SzDbQZuu8fpQ74x477WoRJ7gpHFaj) [![forthebadge](http://forthebadge.com/images/badges/built-with-love.svg)](http://forthebadge.com) [![forthebadge](http://forthebadge.com/images/badges/made-with-python.svg)](http://forthebadge.com) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 50965591..4e882f61 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -6,18 +6,24 @@ import bs4 """Get all onion links from the website""" -def getLinks(soup): +def getLinks(soup,ext): _soup_instance = bs4.BeautifulSoup - extensions = ['.onion','.onion/'] + extensions = [] + if ext: + for e in ext: + extensions.append(e) if isinstance(type(soup), type(_soup_instance)): websites = [] for link in soup.find_all('a'): web_link = link.get('href') if web_link != None: - if 'http' in web_link: - for extension in extensions: - if web_link.endswith(extension): - websites.append(web_link) + if ('http' in web_link or 'https' in web_link): + if ext: + for exten in extensions: + if web_link.endswith(exten): + websites.append(web_link) + else: + websites.append(web_link) else: pass """Pretty print output as below""" @@ -25,10 +31,15 @@ def getLinks(soup): print (bcolors.OKGREEN+'Websites Found - '+bcolors.ENDC+str(len(websites))) print ('-------------------------------') for web in websites: - if (urllib.request.urlopen(web).getcode() == 200): - print (web) - else : - print(bcolors.On_Red+web +bcolors.ENDC) + flag=1 + try: + urllib.request.urlopen(web) + except urllib.error.HTTPError as e: + if e.code: + print(bcolors.On_Red+web+bcolors.ENDC) + flag=0 + if flag: + print(web) return websites else: raise('Method parameter is not of instance bs4.BeautifulSoup') diff --git a/tests/test_getweblinks.py b/tests/test_getweblinks.py index 68265789..c0df6c20 100644 --- a/tests/test_getweblinks.py +++ b/tests/test_getweblinks.py @@ -13,12 +13,13 @@ class getLinksTestCase(unittest.TestCase): def setUp(self): self.held, sys.stdout = sys.stdout, StringIO() + self.maxDiff=None def test_print_links(self): #data = "\nWebsites Found - 7\n-------------------------------\nhttp://ads.wsrs.net/www/delivery/ck.php?n=MyIP856a6b4\nhttp://ads.wsrs.net/www/delivery/ck.php?n=MyIPbf5d683\nhttp://aff.ironsocket.com/SH7L\nhttp://aff.ironsocket.com/SH7L\nhttp://ads.wsrs.net/www/delivery/ck.php?n=MyIPdb5f512\nhttp://wsrs.net/\nhttp://cmsgear.com/\n" - data = "\n"+bcolors.OKGREEN+"Websites Found - "+bcolors.ENDC+"0\n-------------------------------\n" - - getweblinks.getLinks(soup) + data = "\n"+bcolors.OKGREEN+"Websites Found - "+bcolors.ENDC+"1\n-------------------------------\nhttp://cmsgear.com/\n" + ext = ['.com/'] + getweblinks.getLinks(soup,ext) self.assertEqual(sys.stdout.getvalue(),data) diff --git a/torBot.py b/torBot.py index 62f31e5f..c665f6e7 100644 --- a/torBot.py +++ b/torBot.py @@ -63,7 +63,7 @@ def header(): print( " / /_/ __ \/ __ \/ /_ ____/_ __/ ") print( " / __/ / / / /_/ / __ \/ __ \/ / ") print( " / /_/ /_/ / _, _/ /_/ / /_/ / / ") - print( " \__/\____/_/ |_/_.___/\____/_/ V 0.0.3") + print( " \__/\____/_/ |_/_.___/\____/_/ V 1.0.0") print(bcolors.FAIL+bcolors.On_Black) print("#######################################################") print("# TorBot - A python Tor Crawler #") @@ -74,12 +74,26 @@ def header(): def main(): - header() + parser = argparse.ArgumentParser() + parser.add_argument("-q","--quiet",action="store_true") + parser.add_argument("-u","--url",help="Specifiy a website link to crawl") + parser.add_argument("-m","--mail",action="store_true", help="Get e-mail addresses from the crawled sites.") + parser.add_argument("-e","--extension",action='append',dest='extension',default=[],help="Specifiy additional website extensions to the list(.com or .org etc)") + args = parser.parse_args() + if args.quiet == 0: + header() print ("Tor Ip Address :") + link = args.url + ext = 0 + ext = args.extension a = readPage("https://check.torproject.org/",1) - b = readPage("http://torlinkbgs6aabns.onion/") - getMails(b) - getLinks(b) + if link: + b = readPage(link) + else: + b = readPage("http://torlinkbgs6aabns.onion/") + if args.mail: + getMails(b) + getLinks(b,ext) print ("\n\n") return 0