GNU General Public License for more details.
"""

- import re
+ import argparse
+ import asyncio
+ import hashlib
+ import json
import os
+ import re
import sys
-
- import asyncio
from asyncio import Queue
- import argparse
+
import aiohttp
import cssutils
import yarl
from bs4 import BeautifulSoup


class Cloner(object):
-     def __init__(self, root):
+     def __init__(self, root, max_depth):
        self.visited_urls = []
        self.root = self.add_scheme(root)
+         self.max_depth = max_depth
+         self.moved_root = None
        if len(self.root.host) < 4:
            sys.exit('invalid taget {}'.format(self.root.host))
        self.target_path = '/opt/snare/pages/{}'.format(self.root.host)
@@ -39,6 +43,7 @@ def __init__(self, root):
            os.mkdir(self.target_path)

        self.new_urls = Queue()
+         self.meta = {}

    @staticmethod
    def add_scheme(url):
@@ -48,103 +53,147 @@ def add_scheme(url):
            new_url = yarl.URL('http://' + url)
        return new_url

-     @asyncio.coroutine
-     def process_link(self, url, check_host=False):
-         url = yarl.URL(url)
+     async def process_link(self, url, level, check_host=False):
+         try:
+             url = yarl.URL(url)
+         except UnicodeError:
+             return None
+         if url.scheme == ("data" or "javascript" or "file"):
+             return url.human_repr()
+         if not url.is_absolute():
+             if self.moved_root is None:
+                 url = self.root.join(url)
+             else:
+                 url = self.moved_root.join(url)
+
+         host = url.host
+
        if check_host:
-             if (url.host != self.root.host or url.fragment
-                     or url in self.visited_urls):
+             if (host != self.root.host and self.moved_root is None) or \
+                     url.fragment or \
+                     (self.moved_root is not None and host != self.moved_root.host):
                return None
-         if not url.is_absolute():
-             url = self.root.join(url)

-         yield from self.new_urls.put(url)
-         return url.relative().human_repr()
+         if url.human_repr() not in self.visited_urls and (level + 1) <= self.max_depth:
+             await self.new_urls.put((url, level + 1))
+
+         res = None
+         try:
+             res = url.relative().human_repr()
+         except ValueError:
+             print(url)
+         return res

-     @asyncio.coroutine
-     def replace_links(self, data):
+     async def replace_links(self, data, level):
        soup = BeautifulSoup(data, 'html.parser')

        # find all relative links
        for link in soup.findAll(href=True):
-             res = yield from self.process_link(link['href'], check_host=True)
+             res = await self.process_link(link['href'], level, check_host=True)
            if res is not None:
                link['href'] = res

        # find all images and scripts
        for elem in soup.findAll(src=True):
-             res = yield from self.process_link(elem['src'])
+             res = await self.process_link(elem['src'], level)
            if res is not None:
                elem['src'] = res

        # find all action elements
        for act_link in soup.findAll(action=True):
-             res = yield from self.process_link(act_link['action'])
+             res = await self.process_link(act_link['action'], level)
            if res is not None:
                act_link['action'] = res

        # prevent redirects
        for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
-             redir['value'] = yarl.URL(redir['value']).relative().human_repr()
+             if redir['value'] != "":
+                 redir['value'] = yarl.URL(redir['value']).relative().human_repr()

        return soup

-     @asyncio.coroutine
-     def get_body(self):
-         while not self.new_urls.empty():
-             current_url = yield from self.new_urls.get()
-             if current_url in self.visited_urls:
-                 continue
-             self.visited_urls.append(current_url)
-             if current_url.name:
-                 file_name = current_url.name
-             elif current_url.raw_path != '/':
-                 file_name = current_url.path.rsplit('/')[1]
-             else:
-                 file_name = 'index.html'
-             file_path = os.path.dirname(current_url.path)
-             if file_path == '/':
-                 file_path = self.target_path
-             else:
-                 file_path = os.path.join(self.target_path, file_path[1:])
+     def _make_filename(self, url):
+         host = url.host
+         if url.is_absolute():
+             file_name = url.relative().human_repr()
+         else:
+             file_name = url.human_repr()
+         if not file_name.startswith('/'):
+             file_name = "/" + file_name

-             print('path: ', file_path, 'name: ', file_name)
+         if file_name == '/' or file_name == "":
+             if host == self.root.host or (self.moved_root is not None and self.moved_root.host == host):
+                 file_name = '/index.html'
+             else:
+                 file_name = host
+         m = hashlib.md5()
+         m.update(file_name.encode('utf-8'))
+         hash_name = m.hexdigest()
+         return file_name, hash_name

-             if file_path and not os.path.exists(file_path):
-                 os.makedirs(file_path)
+     async def get_body(self, session):
+         while not self.new_urls.empty():
+             current_url, level = await self.new_urls.get()
+             if current_url.human_repr() in self.visited_urls:
+                 continue
+             self.visited_urls.append(current_url.human_repr())
+             file_name, hash_name = self._make_filename(current_url)
+             print('name: ', file_name)
+             self.meta[file_name] = {}

            data = None
+             content_type = None
            try:
                with aiohttp.Timeout(10.0):
-                     with aiohttp.ClientSession() as session:
-                         response = yield from session.get(current_url)
-                         data = yield from response.read()
-             except aiohttp.ClientError as client_error:
+                     response = await session.get(current_url)
+                     content_type = response.content_type
+                     data = await response.read()
+
+             except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                print(client_error)
            else:
-                 response.release()
-                 session.close()
+                 await response.release()
            if data is not None:
-                 if re.match(re.compile('.*\.(html|php)'), file_name):
-                     soup = yield from self.replace_links(data)
+                 self.meta[file_name]['hash'] = hash_name
+                 self.meta[file_name]['content_type'] = content_type
+                 if content_type == 'text/html':
+                     soup = await self.replace_links(data, level)
                    data = str(soup).encode()
-                 with open(os.path.join(file_path, file_name), 'wb') as index_fh:
+                 with open(os.path.join(self.target_path, hash_name), 'wb') as index_fh:
                    index_fh.write(data)
-                 if '.css' in file_name:
+                 if content_type == 'text/css':
                    css = cssutils.parseString(data)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
-                         if carved_url not in self.visited_urls:
-                             yield from self.new_urls.put(carved_url)
-
-     @asyncio.coroutine
-     def run(self):
-         yield from self.new_urls.put(self.root)
-         return (yield from self.get_body())
+                         if carved_url.human_repr() not in self.visited_urls:
+                             await self.new_urls.put((carved_url, level + 1))
+
+     async def get_root_host(self):
+         try:
+             with aiohttp.ClientSession() as session:
+                 resp = await session.get(self.root)
+                 if resp._url_obj.host != self.root.host:
+                     self.moved_root = resp._url_obj
+                 resp.close()
+         except aiohttp.errors.ClientError as err:
+             print("Can\'t connect to target host.")
+             exit(-1)
+
+     async def run(self):
+         session = aiohttp.ClientSession()
+         try:
+             await self.new_urls.put((self.root, 0))
+             await self.get_body(session)
+         except KeyboardInterrupt:
+             raise
+         finally:
+             with open(os.path.join(self.target_path, 'meta.json'), 'w') as mj:
+                 json.dump(self.meta, mj)
+             await session.close()


def main():
@@ -158,9 +207,14 @@ def main():
    loop = asyncio.get_event_loop()
    parser = argparse.ArgumentParser()
    parser.add_argument("--target", help="domain of the page to be cloned", required=True)
+     parser.add_argument("--max-depth", help="max depth of the cloning", required=False, default=sys.maxsize)
    args = parser.parse_args()
-     cloner = Cloner(args.target)
-     loop.run_until_complete(cloner.run())
+     try:
+         cloner = Cloner(args.target, int(args.max_depth))
+         loop.run_until_complete(cloner.get_root_host())
+         loop.run_until_complete(cloner.run())
+     except KeyboardInterrupt:
+         pass


if __name__ == '__main__':
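
The commit above changes the on-disk layout of cloned sites: every fetched resource is written under /opt/snare/pages/<host>/ using the MD5 hash of its path as the file name, a meta.json index records each path's hash and content type, and crawl depth is bounded by the new --max-depth option (for example, python clone.py --target example.com --max-depth 2, where the script name and domain are only illustrative). Below is a minimal sketch, not part of this commit, of how a consumer might resolve a request path against that layout; the resolve helper and the example host are hypothetical.

import hashlib
import json
import os

PAGES_ROOT = '/opt/snare/pages'  # same base directory the cloner writes to


def resolve(host, request_path):
    """Map a request path to the hashed file written by the cloner (hypothetical helper)."""
    target_dir = os.path.join(PAGES_ROOT, host)
    with open(os.path.join(target_dir, 'meta.json')) as mj:
        meta = json.load(mj)

    # The cloner stores the site root as '/index.html' (see _make_filename above).
    file_name = '/index.html' if request_path in ('', '/') else request_path
    entry = meta.get(file_name)
    if not entry or 'hash' not in entry:
        # Either the page was never queued or its fetch failed and left an empty entry.
        return None

    # Files on disk are named after the MD5 of the path, mirroring _make_filename().
    hash_name = hashlib.md5(file_name.encode('utf-8')).hexdigest()
    return os.path.join(target_dir, hash_name), entry.get('content_type')


if __name__ == '__main__':
    print(resolve('example.com', '/'))  # illustrative host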