Commit 8cc9cb1

Merge branch 'develop'

2 parents: 38789a8 + db7c35b

File tree

4 files changed (+259 / -152 lines)

clone.py

Lines changed: 114 additions & 60 deletions
@@ -14,23 +14,27 @@
 GNU General Public License for more details.
 """

-import re
+import argparse
+import asyncio
+import hashlib
+import json
 import os
+import re
 import sys
-
-import asyncio
 from asyncio import Queue
-import argparse
+
 import aiohttp
 import cssutils
 import yarl
 from bs4 import BeautifulSoup


 class Cloner(object):
-    def __init__(self, root):
+    def __init__(self, root, max_depth):
         self.visited_urls = []
         self.root = self.add_scheme(root)
+        self.max_depth = max_depth
+        self.moved_root = None
         if len(self.root.host) < 4:
             sys.exit('invalid taget {}'.format(self.root.host))
         self.target_path = '/opt/snare/pages/{}'.format(self.root.host)
@@ -39,6 +43,7 @@ def __init__(self, root):
             os.mkdir(self.target_path)

         self.new_urls = Queue()
+        self.meta = {}

     @staticmethod
     def add_scheme(url):
@@ -48,103 +53,147 @@ def add_scheme(url):
             new_url = yarl.URL('http://' + url)
         return new_url

-    @asyncio.coroutine
-    def process_link(self, url, check_host=False):
-        url = yarl.URL(url)
+    async def process_link(self, url, level, check_host=False):
+        try:
+            url = yarl.URL(url)
+        except UnicodeError:
+            return None
+        if url.scheme == ("data" or "javascript" or "file"):
+            return url.human_repr()
+        if not url.is_absolute():
+            if self.moved_root is None:
+                url = self.root.join(url)
+            else:
+                url = self.moved_root.join(url)
+
+        host = url.host
+
         if check_host:
-            if (url.host != self.root.host or url.fragment
-                    or url in self.visited_urls):
+            if (host != self.root.host and self.moved_root is None) or \
+                    url.fragment or \
+                    (self.moved_root is not None and host != self.moved_root.host):
                 return None
-        if not url.is_absolute():
-            url = self.root.join(url)

-        yield from self.new_urls.put(url)
-        return url.relative().human_repr()
+        if url.human_repr() not in self.visited_urls and (level + 1) <= self.max_depth:
+            await self.new_urls.put((url, level + 1))
+
+        res = None
+        try:
+            res = url.relative().human_repr()
+        except ValueError:
+            print(url)
+        return res

-    @asyncio.coroutine
-    def replace_links(self, data):
+    async def replace_links(self, data, level):
         soup = BeautifulSoup(data, 'html.parser')

         # find all relative links
         for link in soup.findAll(href=True):
-            res = yield from self.process_link(link['href'], check_host=True)
+            res = await self.process_link(link['href'], level, check_host=True)
             if res is not None:
                 link['href'] = res

         # find all images and scripts
         for elem in soup.findAll(src=True):
-            res = yield from self.process_link(elem['src'])
+            res = await self.process_link(elem['src'], level)
             if res is not None:
                 elem['src'] = res

         # find all action elements
         for act_link in soup.findAll(action=True):
-            res = yield from self.process_link(act_link['action'])
+            res = await self.process_link(act_link['action'], level)
             if res is not None:
                 act_link['action'] = res

         # prevent redirects
         for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
-            redir['value'] = yarl.URL(redir['value']).relative().human_repr()
+            if redir['value'] != "":
+                redir['value'] = yarl.URL(redir['value']).relative().human_repr()

         return soup

-    @asyncio.coroutine
-    def get_body(self):
-        while not self.new_urls.empty():
-            current_url = yield from self.new_urls.get()
-            if current_url in self.visited_urls:
-                continue
-            self.visited_urls.append(current_url)
-            if current_url.name:
-                file_name = current_url.name
-            elif current_url.raw_path != '/':
-                file_name = current_url.path.rsplit('/')[1]
-            else:
-                file_name = 'index.html'
-            file_path = os.path.dirname(current_url.path)
-            if file_path == '/':
-                file_path = self.target_path
-            else:
-                file_path = os.path.join(self.target_path, file_path[1:])
+    def _make_filename(self, url):
+        host = url.host
+        if url.is_absolute():
+            file_name = url.relative().human_repr()
+        else:
+            file_name = url.human_repr()
+        if not file_name.startswith('/'):
+            file_name = "/" + file_name

-            print('path: ', file_path, 'name: ', file_name)
+        if file_name == '/' or file_name == "":
+            if host == self.root.host or (self.moved_root is not None and self.moved_root.host == host):
+                file_name = '/index.html'
+            else:
+                file_name = host
+        m = hashlib.md5()
+        m.update(file_name.encode('utf-8'))
+        hash_name = m.hexdigest()
+        return file_name, hash_name

-            if file_path and not os.path.exists(file_path):
-                os.makedirs(file_path)
+    async def get_body(self, session):
+        while not self.new_urls.empty():
+            current_url, level = await self.new_urls.get()
+            if current_url.human_repr() in self.visited_urls:
+                continue
+            self.visited_urls.append(current_url.human_repr())
+            file_name, hash_name = self._make_filename(current_url)
+            print('name: ', file_name)
+            self.meta[file_name] = {}

             data = None
+            content_type = None
             try:
                 with aiohttp.Timeout(10.0):
-                    with aiohttp.ClientSession() as session:
-                        response = yield from session.get(current_url)
-                        data = yield from response.read()
-            except aiohttp.ClientError as client_error:
+                    response = await session.get(current_url)
+                    content_type = response.content_type
+                    data = await response.read()
+
+            except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                 print(client_error)
             else:
-                response.release()
-                session.close()
+                await response.release()
             if data is not None:
-                if re.match(re.compile('.*\.(html|php)'), file_name):
-                    soup = yield from self.replace_links(data)
+                self.meta[file_name]['hash'] = hash_name
+                self.meta[file_name]['content_type'] = content_type
+                if content_type == 'text/html':
+                    soup = await self.replace_links(data, level)
                     data = str(soup).encode()
-                with open(os.path.join(file_path, file_name), 'wb') as index_fh:
+                with open(os.path.join(self.target_path, hash_name), 'wb') as index_fh:
                     index_fh.write(data)
-                if '.css' in file_name:
+                if content_type == 'text/css':
                     css = cssutils.parseString(data)
                     for carved_url in cssutils.getUrls(css):
                         if carved_url.startswith('data'):
                             continue
                         carved_url = yarl.URL(carved_url)
                         if not carved_url.is_absolute():
                             carved_url = self.root.join(carved_url)
-                        if carved_url not in self.visited_urls:
-                            yield from self.new_urls.put(carved_url)
-
-    @asyncio.coroutine
-    def run(self):
-        yield from self.new_urls.put(self.root)
-        return (yield from self.get_body())
+                        if carved_url.human_repr() not in self.visited_urls:
+                            await self.new_urls.put((carved_url,level+1))
+
+    async def get_root_host(self):
+        try:
+            with aiohttp.ClientSession() as session:
+                resp = await session.get(self.root)
+                if resp._url_obj.host != self.root.host:
+                    self.moved_root = resp._url_obj
+                resp.close()
+        except aiohttp.errors.ClientError as err:
+            print("Can\'t connect to target host.")
+            exit(-1)
+
+    async def run(self):
+        session = aiohttp.ClientSession()
+        try:
+            await self.new_urls.put((self.root, 0))
+            await self.get_body(session)
+        except KeyboardInterrupt:
+            raise
+        finally:
+            with open(os.path.join(self.target_path, 'meta.json'), 'w') as mj:
+                json.dump(self.meta, mj)
+            await session.close()


 def main():
@@ -158,9 +207,14 @@ def main():
     loop = asyncio.get_event_loop()
     parser = argparse.ArgumentParser()
     parser.add_argument("--target", help="domain of the page to be cloned", required=True)
+    parser.add_argument("--max-depth", help="max depth of the cloning", required=False, default=sys.maxsize)
     args = parser.parse_args()
-    cloner = Cloner(args.target)
-    loop.run_until_complete(cloner.run())
+    try:
+        cloner = Cloner(args.target, int(args.max_depth))
+        loop.run_until_complete(cloner.get_root_host())
+        loop.run_until_complete(cloner.run())
+    except KeyboardInterrupt:
+        pass


 if __name__ == '__main__':
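
With this merge, the cloner runs on async/await instead of @asyncio.coroutine, honours a crawl depth limit, follows a relocated root host via get_root_host(), and stores each fetched resource under the MD5 hash of its name while recording the original name and content type in meta.json. Below is a minimal driver sketch of the new entry points, mirroring what main() now does; the target domain and depth are examples only, and the run assumes /opt/snare/pages is writable and the host is reachable:

    import asyncio

    from clone import Cloner

    loop = asyncio.get_event_loop()
    cloner = Cloner('example.com', max_depth=2)       # depth limit added in this commit
    loop.run_until_complete(cloner.get_root_host())   # detect whether the root host has moved
    loop.run_until_complete(cloner.run())             # crawl, rewrite links, write hash-named files and meta.json

From the command line this corresponds roughly to: python clone.py --target example.com --max-depth 2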

converter.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import os
+import hashlib
+from os import walk
+import mimetypes
+import json
+import shutil
+
+
+class Converter:
+    def __init__(self):
+        self.meta = {}
+
+    def convert(self, path):
+        files_to_convert = []
+
+        for (dirpath, dirnames, filenames) in walk(path):
+            for fn in filenames:
+                files_to_convert.append(os.path.join(dirpath, fn))
+
+        for fn in files_to_convert:
+            path_len = len(path)
+            file_name = fn[path_len:]
+            m = hashlib.md5()
+            m.update(fn.encode('utf-8'))
+            hash_name = m.hexdigest()
+            self.meta[file_name] = {'hash': hash_name, 'content_type': mimetypes.guess_type(file_name)[0]}
+            shutil.copyfile(fn, os.path.join(path, hash_name))
+            os.remove(fn)
+
+        with open(os.path.join(path, 'meta.json'), 'w') as mj:
+            json.dump(self.meta, mj)
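
converter.py is a new helper for migrating an already cloned page directory to the same hash-named layout that clone.py now produces: it walks the directory, renames each file to the MD5 hex digest of its path, guesses the content type with mimetypes, and writes the mapping to meta.json. A short usage sketch follows; the page path below is only an example and must already contain a previously cloned site:

    from converter import Converter

    Converter().convert('/opt/snare/pages/example.com')
    # meta.json in that directory now maps each file's original relative name
    # to its hash-based replacement and guessed content type, for example:
    # {"/index.html": {"hash": "<md5 hex digest>", "content_type": "text/html"}}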
