-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
183 lines (142 loc) · 5.35 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# -*- coding: utf-8 -*-
"""
Image crawler (Python 2): environment setup, path configuration and imports.

Created on Thu Mar 12 12:26:14 2015
@author: Thibault
"""
# Import libs
import os
import sys
import hashlib
# Python 2 only: reload(sys) re-exposes setdefaultencoding (removed by
# site.py) so implicit str<->unicode conversions use utf-8, not ascii.
reload(sys)
sys.setdefaultencoding('utf8')
# Make the project's bundled packages importable; Windows-style separators,
# and relative to the *current working directory* — run from the repo root.
current_dir = os.getcwd()
sys.path.append(current_dir + "\\lib")
sys.path.append(current_dir + "\\src")
# Import libs
import urllib2 as urllib
from ISHtmlParser import ISHTMLParser
from PIL import Image
class Url():
    """A crawl target: a page address plus the depth at which it was found.

    Callers read the ``url`` and ``deep`` attributes directly.
    """

    def __init__(self, url='', deep=1):
        self.deep = deep
        self.url = url
class File():
    """A minimal FIFO queue.

    ``file`` is the backing list of pending items (oldest first) and
    ``nb_elements`` mirrors its length; both are read directly by callers.
    """

    def __init__(self):
        self.file = []
        self.nb_elements = 0

    def take(self):
        """Remove and return the oldest queued item, or None when empty."""
        if self.length() <= 0:
            return None
        self.nb_elements -= 1
        return self.file.pop(0)

    def add(self, p):
        """Append an item at the back of the queue."""
        self.nb_elements += 1
        self.file.append(p)

    def length(self):
        """Number of items currently queued."""
        return self.nb_elements
class Program():
    """Breadth-first web crawler that downloads the images referenced by the
    visited pages and keeps only the high-definition ones (width > 1200 px).

    register_path -- directory/prefix where downloaded images are written
    max_pages     -- maximum number of pages to explore (coerced to int)
    """

    def __init__(self, register_path="", max_pages=15):
        self.url_visited = []       # md5 digests of pages already explored
        self.url_waiting = File()   # FIFO of page URLs still to visit
        self.register_path = register_path
        self.img_visited = []       # md5 digests of images already fetched
        self.img_waiting = File()   # FIFO of image URLs still to download
        # int() so a value taken straight from sys.argv (a string) still
        # compares correctly against the integer page counter — in Python 2
        # `int < str` is always True, which silently disabled the page cap.
        self.max_pages = int(max_pages)
        # Counters
        self.nb_pages = 0
        self.nb_images = 0
        self.img_to_remove = []     # downloaded files rejected by verif_img

    def delete_imgs(self):
        """Delete every downloaded image that failed the HD check."""
        for path in self.img_to_remove:
            # print() call (not py2 print statement) for consistency with
            # the rest of the file; identical output under Python 2.
            print("Remove : " + str(path))
            os.remove(path)

    def explore_img(self):
        """Download every queued image, then delete the non-HD ones."""
        print("")
        print("Telechargement des images")
        while self.img_waiting.length() > 0:
            url = self.img_waiting.take()
            try:
                # Accept a 3- or 4-character suffix (".jpg" / ".jpeg").
                # Length guards fix the uncaught IndexError the old
                # `url[-4]` raised on URLs shorter than 5 characters.
                extension = ""
                if len(url) >= 4 and url[-4] == ".":
                    extension = url[-4:]
                elif len(url) >= 5 and url[-5] == ".":
                    extension = url[-5:]
                if extension and extension not in [".exe", ".com", ".html", ".htm", ".css", ".less", ".net"]:
                    path_img = self.register_path + str(self.nb_images) + extension
                    # Close the response and the file even when a read or
                    # write fails (the old code leaked both on error).
                    response = urllib.urlopen(url)
                    try:
                        data = response.read()
                    finally:
                        response.close()
                    f = open(path_img, 'wb')
                    try:
                        f.write(data)
                    finally:
                        f.close()
                    print("Download : " + str(url))
                    # Remember the image so it is never fetched twice.
                    self.img_visited.append(self.md5(url))
                    # Queue for deletion if it does not meet the HD criterion.
                    if not self.verif_img(path_img):
                        self.img_to_remove.append(path_img)
                    self.nb_images += 1
            except urllib.HTTPError:
                pass    # unreachable resource: skip this image
            except urllib.URLError:
                pass    # malformed / unresolvable URL: skip
            except ValueError:
                pass    # urlopen rejects relative URLs: skip
        print("")
        print("Supression des images obsolètes")
        self.delete_imgs()

    def verif_img(self, path_img):
        """Return True when the file at path_img is an HD image (width > 1200)."""
        try:
            im = Image.open(path_img)
            (width, height) = im.size
            return width > 1200
        except Exception:
            # Unreadable or corrupt file: treat as not HD so it is deleted.
            return False

    def main(self, main_url):
        """Crawl starting from main_url until max_pages pages are explored."""
        if main_url is None:
            print("No arguments specified")
            return
        self.url_waiting.add(main_url)
        while self.nb_pages < self.max_pages \
                and self.url_waiting.length() > 0:
            url = self.url_waiting.take()
            print("Exploration de : " + url)
            result = self.visite(url)
            # Download the images referenced by that page.
            self.explore_img()
            # Only count pages that were explored without error.
            if result:
                self.nb_pages += 1
        print("Done !")

    def visite(self, website):
        """Parse one page, queue its links and images; True on success."""
        try:
            parser = ISHTMLParser(website)
            source = urllib.urlopen(website).read()
            parser.feed(source)
            # Remember this page so it is never visited twice.
            self.url_visited.append(self.md5(website))
            # Queue newly discovered links (only while under the page cap).
            for url in parser.url_container:
                if self.nb_pages < self.max_pages:
                    if self.md5(url) not in self.url_visited \
                            and url not in self.url_waiting.file:
                        self.url_waiting.add(url)
            # Queue newly discovered images.
            for img in parser.img_container:
                if self.md5(img) not in self.img_visited \
                        and img not in self.img_waiting.file:
                    self.img_waiting.add(img)
            return True
        except urllib.HTTPError:
            return False
        except urllib.URLError:
            return False
        except ValueError:
            return False

    def md5(self, text):
        """Hex md5 digest of text; non-bytes input is utf-8 encoded first
        (hashlib rejects non-ascii unicode without an explicit encode)."""
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        return hashlib.md5(text).hexdigest()
#### PROGRAM ####
# Usage: main.py <start_url> [<register_path>] [<max_pages>]
# Guarded so importing this module does not start a crawl.
if __name__ == "__main__":
    argv = sys.argv
    main_url = argv[1] if len(argv) >= 2 else None
    register_path = argv[2] if len(argv) >= 3 else "images/"
    # Fixes: argv[3] only exists when len(argv) >= 4 (the old `>= 3` test
    # raised IndexError with exactly three args), and the value must be an
    # int for the page-count comparison to work.
    nb_pages = int(argv[3]) if len(argv) >= 4 else 15
    # Crawl and collect the images
    main = Program(register_path, nb_pages)
    main.main(main_url)