Skip to content

Commit

Permalink
Merge pull request #144 from KingAkeem/multi_thread_tree
Browse files Browse the repository at this point in the history
Using multi-threading for tree generation with links and adding documentation
  • Loading branch information
PSNAppz authored Oct 27, 2018
2 parents 65bee14 + a9d945d commit 1fd77b7
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 97 deletions.
29 changes: 19 additions & 10 deletions modules/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from ete3 import Tree, TreeStyle, TextFace, add_face_to_node
from .link import LinkNode
from .utils import multi_thread

class LinkTree:
"""
Expand All @@ -17,8 +18,8 @@ class LinkTree:
tld (bool): Decides whether or not to use additional top-level-domains besides .tor
stop_depth (int): Depth of which to stop searching for links
"""
def __init__(self, root_node, *, tld=False, stop_depth=1):
self._tree = build_tree(root_node, tld=tld, stop=stop_depth)
def __init__(self, root_node, *, stop_depth=1):
self._tree = build_tree(root_node, stop=stop_depth)

def __len__(self):
return len(self._tree)
Expand Down Expand Up @@ -66,10 +67,11 @@ def initialize_tree(root_node):
to_visit (list): Children of root node
"""
root = Tree(name=root_node.name)
children = root_node.get_children()
children = root_node.links
return root, children

def build_tree(link, *, tld, stop=1, rec=0, to_visit=None, tree=None):

def build_tree(link=None, *, stop=1, rec=0, to_visit=None, tree=None):
"""
Builds tree using Breadth First Search. You can specify stop depth.
Rec & tree arguments are used for recursion.
Expand Down Expand Up @@ -97,25 +99,32 @@ def build_tree(link, *, tld, stop=1, rec=0, to_visit=None, tree=None):
# If recursion is 0 then sub_tree will be root
return sub_tree if rec == 0 else tree

children_to_visit = list()
for link_name in to_visit:
def visit_nodes(link):
children_to_visit = list()
try:
node = LinkNode(link_name, tld=tld)
node = LinkNode(link)
except (ValueError, ConnectionError, HTTPError):
continue
return None

link_node = sub_tree.add_child(name=node.name)
link_children = node.get_children()
link_children = node.links
# No need to find children if we aren't going to visit them
if stop != rec + 1:
for child in link_children:
link_node.add_child(name=child)
children_to_visit.append(child)

if stop != rec + 1:
return children_to_visit

return to_visit

next_nodes = multi_thread(to_visit, visit_nodes)
rec += 1

# If we've reached stop depth then return tree
if stop == rec:
return sub_tree

new_tree = tree.add_child(sub_tree)
return build_tree(to_visit, tld=tld, stop=stop, rec=rec, tree=new_tree)
return build_tree(to_visit=next_nodes, stop=stop, rec=rec, tree=new_tree)
120 changes: 85 additions & 35 deletions modules/link.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,75 @@
import re
"""
This module is used to create a LinkNode that can be consumed by a LinkTree
and contains useful Link methods
"""
import requests
import requests.exceptions
import validators

from bs4 import BeautifulSoup
from .utils import multi_thread
from .color import color

def get_emails(node):
    """Finds all emails associated with node.

    Each child of ``node`` is asked for its ``href``. An href containing
    ``mailto`` is split on ``:`` and the piece after the first colon is kept
    when ``LinkNode.valid_email`` accepts it.

    Args:
        node (LinkNode): node used to get emails from
    Returns:
        emails (list): list of emails
    """
    emails = []
    for child in node.children:
        link = child.get('href')
        if not link or 'mailto' not in link:
            continue
        email_addr = link.split(':')
        # The length test must run first: an href such as "mailto" (no colon)
        # splits into a single element, and indexing [1] before checking the
        # length raised IndexError.
        if len(email_addr) > 1 and LinkNode.valid_email(email_addr[1]):
            emails.append(email_addr[1])
    return emails


def get_links(node):
    """Finds all links associated with node.

    Every child of ``node`` is handed to ``multi_thread`` together with a
    helper that extracts the child's ``href``.

    Args:
        node (LinkNode): node used to get links from
    Returns:
        links (list): the result of ``multi_thread`` over the node's children
            (presumably the list of links; confirm against ``utils.multi_thread``)
    """
    def retrieve_link(child):
        # Hand back the href only when it exists and LinkNode.valid_link
        # accepts it; otherwise signal "nothing here" with None.
        href = child.get('href')
        is_usable = href and LinkNode.valid_link(href)
        return href if is_usable else None

    return multi_thread(node.children, retrieve_link)


class LinkNode:
"""Represents link node in a link tree
Attributes:
link (str): link to be used as node
"""

def __init__(self, link, *, tld=False):
def __init__(self, link):
# If link has invalid form, throw an error
if not self.valid_link(link):
raise ValueError("Invalid link format.")

self.tld = tld
self._children = []
self._emails = []
self._links = []

# Attempts to connect to link, throws an error if link is unreachable
try:
self.response = requests.get(link)
except (requests.exceptions.ChunkedEncodingError, requests.exceptions.HTTPError, requests.exceptions.ConnectionError, ConnectionError) as err:
except (requests.exceptions.ChunkedEncodingError,
requests.exceptions.HTTPError,
requests.exceptions.ConnectionError,
ConnectionError) as err:
raise err

self._node = BeautifulSoup(self.response.text, 'html.parser')
Expand All @@ -30,43 +80,43 @@ def __init__(self, link, *, tld=False):
self.name = self._node.title.string
self.status = color(link, 'green')

def get_emails(self):
if self._emails:
return self._emails

children = self._node.find_all('a')
email_nodes = []
for child in children:
link = child.get('href')
if link and 'mailto' in link:
email_addr = link.split(':')
if self.valid_email(email_addr[1]) and len(email_addr) > 1:
email_nodes.append(email_addr[1])
self._emails = email_nodes
return email_nodes

def get_children(self):
if self._children:
return self._children

children = self._node.find_all('a')
child_nodes = []
for child in children:
link = child.get('href')
if link and self.valid_link(link):
child_nodes.append(link)

self._children = child_nodes
return child_nodes
@property
def emails(self):
    """
    Getter for node emails.

    Returns the stored value when it is non-empty; otherwise asks
    ``get_emails`` for it, stores the result and returns it.
    """
    cached = self._emails
    if cached:
        return cached
    self._emails = get_emails(self)
    return self._emails

@property
def links(self):
    """
    Getter for node links.

    Returns the stored value when it is non-empty; otherwise asks
    ``get_links`` for it, stores the result and returns it.
    """
    cached = self._links
    if cached:
        return cached
    self._links = get_links(self)
    return self._links

@property
def children(self):
    """
    Getter for node children.

    Returns the stored value when it is non-empty; otherwise collects every
    ``a`` element from the parsed page via ``find_all``, stores the result
    and returns it.
    """
    cached = self._children
    if cached:
        return cached
    self._children = self._node.find_all('a')
    return self._children

@staticmethod
def valid_email(email):
if validators.email(email):
return True
return False
"""Static method used to validate emails"""
if validators.email(email):
return True
return False

@staticmethod
def valid_link(link):
    """Static method used to validate links.

    Returns True when ``validators.url(link)`` is truthy, False otherwise.
    """
    return bool(validators.url(link))
53 changes: 35 additions & 18 deletions modules/link_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,44 +9,56 @@
from .color import color

class LinkIO:

"""
This class is only used to interact with links
"""
@staticmethod
def display_children(link_node):
children = link_node.get_children()
sucess_msg = color(f'Links Found - {len(children)}', 'green')
def display_children(root):
"""
Static method to display status of child nodes
Args:
root (LinkNode): root of children to be displayed
"""
sucess_msg = color(f'Links Found - {len(root.links)}', 'green')
print(sucess_msg + '\n' + '---------------------------------')
multi_thread(children, LinkIO.display)
multi_thread(root.links, LinkIO.display)

@staticmethod
def read(link, *, response=False, show_msg=False, headers=None, schemes=None):
"""
Attempts to retrieve HTML from link
Args:
headers (dict)
schemes (list)
link (str): link to read
response (bool): determines if response is returned.
show_msg(bool): determines if message is displayed for connection
headers (dict): header for request, defaults to None
schemes (list): different schemes to attempt to use
Returns:
resp.text (str): html from page
str: html from page
requests.Response (optional): response returned from requests
"""
headers = {'User-Agent': 'XXXX-XXXXX-XXXX'} if not headers else headers
# Attempts to connect directly to site if no scheme is passed
if not schemes:
if show_msg:
print(f'Attempting to connect to {link}')
if LinkNode.valid_link(link):
node = LinkNode(link, tld=True)
node = LinkNode(link)
if response:
return node.response.text, node.response
return node.response.text

schemes = ['https://', 'http://'] if not schemes else schemes

# Attempt to use different schemes until one is successful
for scheme in schemes:
temp_url = scheme + link
if show_msg:
print(f'Attempting to connect to {link}')
if LinkNode.valid_link(temp_url):
node = LinkNode(temp_url, tld=True)
node = LinkNode(temp_url)
if response:
return node.response.text, node.response
return node.response.text
Expand All @@ -55,26 +67,31 @@ def read(link, *, response=False, show_msg=False, headers=None, schemes=None):
@staticmethod
def display(link):
"""
Prints the status of a link
Prints the status of a link based on its connection status
Args:
link (str): link to get status of
"""
if LinkNode.valid_link(link):
try:
node = LinkNode(link, tld=True)
node = LinkNode(link)
title = node.name
link_status = node.status
except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError, ConnectionError):
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError,
ConnectionError):
title = 'Not Found'
link_status = color(link, 'red')

print("%-80s %-30s" % (link_status, title))
status_msg = "%-80s %-30s" % (link_status, title)
print(status_msg)


@staticmethod
def display_ip():
"""Returns users tor ip address
"""
https://check.torproject.org/ tells you if you are using tor and it
displays your IP address which we scape and return
displays your IP address which we scrape and display
"""

page = LinkIO.read('https://check.torproject.org/', show_msg=True)
Expand Down
12 changes: 5 additions & 7 deletions modules/tests/test_getweblinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import pytest
import requests_mock

from bs4 import BeautifulSoup
from yattag import Doc
from ..link import LinkNode

Expand All @@ -24,7 +23,7 @@ def setup_html(test_links, *, fail=False):
with tag('body'):
for data in test_links:
if not fail:
line('a', 'test_anchor', href=data)
line('a', 'test_anchor', href=data)

return doc.getvalue()

Expand All @@ -45,7 +44,7 @@ def test_get_links_fail():
mock_connection.register_uri('GET', data, text=mock_html)
with pytest.raises(ValueError):
node = LinkNode(data)
result = node.get_children()
result = node.links
assert result == []

@pytest.fixture
Expand All @@ -61,11 +60,10 @@ def test_get_links_tor():
mock_html = setup_html(test_data)
mock_link = 'http://test.tor'
with requests_mock.Mocker() as mock_connection:
for data in test_data:
mock_connection.register_uri('GET', mock_link, text=mock_html)
mock_connection.register_uri('GET', mock_link, text=mock_html)

node = LinkNode(mock_link)
result = node.get_children()
result = node.links
assert result == test_data


Expand Down Expand Up @@ -93,7 +91,7 @@ def test_get_links_tld():
mock_connection.register_uri('GET', mock_url, text=mock_html)

node = LinkNode(mock_url)
links = node.get_children()
links = node.links
assert links == test_data


Expand Down
Loading

0 comments on commit 1fd77b7

Please sign in to comment.