GNU General Public License for more details.
"""

- import re
+ import argparse
+ import asyncio
+ import hashlib
+ import json
import os
+ import re
import sys
-
- import asyncio
from asyncio import Queue
- import argparse
+
import aiohttp
import cssutils
import yarl
from bs4 import BeautifulSoup


class Cloner(object):
-     def __init__(self, root):
+     def __init__(self, root, max_depth):
        self.visited_urls = []
        self.root = self.add_scheme(root)
+         self.max_depth = max_depth
+         self.moved_root = None
        if len(self.root.host) < 4:
            sys.exit('invalid taget {}'.format(self.root.host))
        self.target_path = '/opt/snare/pages/{}'.format(self.root.host)
@@ -39,6 +43,7 @@ def __init__(self, root):
            os.mkdir(self.target_path)

        self.new_urls = Queue()
+         self.meta = {}

    @staticmethod
    def add_scheme(url):
@@ -48,103 +53,147 @@ def add_scheme(url):
            new_url = yarl.URL('http://' + url)
        return new_url

-     @asyncio.coroutine
-     def process_link(self, url, check_host=False):
-         url = yarl.URL(url)
+     async def process_link(self, url, level, check_host=False):
+         try:
+             url = yarl.URL(url)
+         except UnicodeError:
+             return None
+         if url.scheme == ("data" or "javascript" or "file"):
+             return url.human_repr()
+         if not url.is_absolute():
+             if self.moved_root is None:
+                 url = self.root.join(url)
+             else:
+                 url = self.moved_root.join(url)
+
+         host = url.host
+
        if check_host:
-             if (url.host != self.root.host or url.fragment
-                     or url in self.visited_urls):
+             if (host != self.root.host and self.moved_root is None) or \
+                     url.fragment or \
+                     (self.moved_root is not None and host != self.moved_root.host):
                return None
-         if not url.is_absolute():
-             url = self.root.join(url)

-         yield from self.new_urls.put(url)
-         return url.relative().human_repr()
+         if url.human_repr() not in self.visited_urls and (level + 1) <= self.max_depth:
+             await self.new_urls.put((url, level + 1))
+
+         res = None
+         try:
+             res = url.relative().human_repr()
+         except ValueError:
+             print(url)
+         return res

-     @asyncio.coroutine
-     def replace_links(self, data):
+     async def replace_links(self, data, level):
        soup = BeautifulSoup(data, 'html.parser')

        # find all relative links
        for link in soup.findAll(href=True):
-             res = yield from self.process_link(link['href'], check_host=True)
+             res = await self.process_link(link['href'], level, check_host=True)
            if res is not None:
                link['href'] = res

        # find all images and scripts
        for elem in soup.findAll(src=True):
-             res = yield from self.process_link(elem['src'])
+             res = await self.process_link(elem['src'], level)
            if res is not None:
                elem['src'] = res

        # find all action elements
        for act_link in soup.findAll(action=True):
-             res = yield from self.process_link(act_link['action'])
+             res = await self.process_link(act_link['action'], level)
            if res is not None:
                act_link['action'] = res

        # prevent redirects
        for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
-             redir['value'] = yarl.URL(redir['value']).relative().human_repr()
+             if redir['value'] != "":
+                 redir['value'] = yarl.URL(redir['value']).relative().human_repr()

        return soup

-     @asyncio.coroutine
-     def get_body(self):
-         while not self.new_urls.empty():
-             current_url = yield from self.new_urls.get()
-             if current_url in self.visited_urls:
-                 continue
-             self.visited_urls.append(current_url)
-             if current_url.name:
-                 file_name = current_url.name
-             elif current_url.raw_path != '/':
-                 file_name = current_url.path.rsplit('/')[1]
-             else:
-                 file_name = 'index.html'
-             file_path = os.path.dirname(current_url.path)
-             if file_path == '/':
-                 file_path = self.target_path
-             else:
-                 file_path = os.path.join(self.target_path, file_path[1:])
+     def _make_filename(self, url):
+         host = url.host
+         if url.is_absolute():
+             file_name = url.relative().human_repr()
+         else:
+             file_name = url.human_repr()
+         if not file_name.startswith('/'):
+             file_name = "/" + file_name

-             print('path: ', file_path, 'name: ', file_name)
+         if file_name == '/' or file_name == "":
+             if host == self.root.host or (self.moved_root is not None and self.moved_root.host == host):
+                 file_name = '/index.html'
+             else:
+                 file_name = host
+         m = hashlib.md5()
+         m.update(file_name.encode('utf-8'))
+         hash_name = m.hexdigest()
+         return file_name, hash_name

-             if file_path and not os.path.exists(file_path):
-                 os.makedirs(file_path)
+     async def get_body(self, session):
+         while not self.new_urls.empty():
+             current_url, level = await self.new_urls.get()
+             if current_url.human_repr() in self.visited_urls:
+                 continue
+             self.visited_urls.append(current_url.human_repr())
+             file_name, hash_name = self._make_filename(current_url)
+             print('name: ', file_name)
+             self.meta[file_name] = {}

            data = None
+             content_type = None
            try:
                with aiohttp.Timeout(10.0):
-                     with aiohttp.ClientSession() as session:
-                         response = yield from session.get(current_url)
-                         data = yield from response.read()
-             except aiohttp.ClientError as client_error:
+                     response = await session.get(current_url)
+                     content_type = response.content_type
+                     data = await response.read()
+
+             except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                print(client_error)
            else:
-                 response.release()
-                 session.close()
+                 await response.release()
            if data is not None:
-                 if re.match(re.compile('.*\.(html|php)'), file_name):
-                     soup = yield from self.replace_links(data)
+                 self.meta[file_name]['hash'] = hash_name
+                 self.meta[file_name]['content_type'] = content_type
+                 if content_type == 'text/html':
+                     soup = await self.replace_links(data, level)
                    data = str(soup).encode()
-                 with open(os.path.join(file_path, file_name), 'wb') as index_fh:
+                 with open(os.path.join(self.target_path, hash_name), 'wb') as index_fh:
                    index_fh.write(data)
-                 if '.css' in file_name:
+                 if content_type == 'text/css':
                    css = cssutils.parseString(data)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
-                         if carved_url not in self.visited_urls:
-                             yield from self.new_urls.put(carved_url)
-
-     @asyncio.coroutine
-     def run(self):
-         yield from self.new_urls.put(self.root)
-         return (yield from self.get_body())
+                         if carved_url.human_repr() not in self.visited_urls:
+                             await self.new_urls.put((carved_url, level + 1))
+
+     async def get_root_host(self):
+         try:
+             with aiohttp.ClientSession() as session:
+                 resp = await session.get(self.root)
+                 if resp._url_obj.host != self.root.host:
+                     self.moved_root = resp._url_obj
+                 resp.close()
+         except aiohttp.errors.ClientError as err:
+             print("Can\'t connect to target host.")
+             exit(-1)
+
+     async def run(self):
+         session = aiohttp.ClientSession()
+         try:
+             await self.new_urls.put((self.root, 0))
+             await self.get_body(session)
+         except KeyboardInterrupt:
+             raise
+         finally:
+             with open(os.path.join(self.target_path, 'meta.json'), 'w') as mj:
+                 json.dump(self.meta, mj)
+             await session.close()


def main():
@@ -158,9 +207,14 @@ def main():
    loop = asyncio.get_event_loop()
    parser = argparse.ArgumentParser()
    parser.add_argument("--target", help="domain of the page to be cloned", required=True)
+     parser.add_argument("--max-depth", help="max depth of the cloning", required=False, default=sys.maxsize)
    args = parser.parse_args()
-     cloner = Cloner(args.target)
-     loop.run_until_complete(cloner.run())
+     try:
+         cloner = Cloner(args.target, int(args.max_depth))
+         loop.run_until_complete(cloner.get_root_host())
+         loop.run_until_complete(cloner.run())
+     except KeyboardInterrupt:
+         pass


if __name__ == '__main__':
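
The commit above changes the on-disk layout of cloned sites: every fetched resource is written under /opt/snare/pages/<host>/ using the MD5 hash of its path as the file name, a meta.json index records each path's hash and content type, and crawl depth is bounded by the new --max-depth option (for example, python clone.py --target example.com --max-depth 2, where the script name and domain are only illustrative). Below is a minimal sketch, not part of this commit, of how a consumer might resolve a request path against that layout; the resolve helper and the example host are hypothetical.

import hashlib
import json
import os

PAGES_ROOT = '/opt/snare/pages'  # same base directory the cloner writes to


def resolve(host, request_path):
    """Map a request path to the hashed file written by the cloner (hypothetical helper)."""
    target_dir = os.path.join(PAGES_ROOT, host)
    with open(os.path.join(target_dir, 'meta.json')) as mj:
        meta = json.load(mj)

    # The cloner stores the site root as '/index.html' (see _make_filename above).
    file_name = '/index.html' if request_path in ('', '/') else request_path
    entry = meta.get(file_name)
    if not entry or 'hash' not in entry:
        # Either the page was never queued or its fetch failed and left an empty entry.
        return None

    # Files on disk are named after the MD5 of the path, mirroring _make_filename().
    hash_name = hashlib.md5(file_name.encode('utf-8')).hexdigest()
    return os.path.join(target_dir, hash_name), entry.get('content_type')


if __name__ == '__main__':
    print(resolve('example.com', '/'))  # illustrative host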