-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetbooru.py
executable file
·292 lines (258 loc) · 9.07 KB
/
getbooru.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/env python3
# Author: Jan 'jarainf' Rathner <jan@rathner.net>
from xml.dom import minidom
from urllib.request import urlopen, urlretrieve, URLError
from urllib.parse import quote_plus
from hashlib import md5 as md5_hasher
import os
import sys, getopt
import signal
GELBOORU = 'http://gelbooru.com/index.php?page=dapi&s=post&q=index'
USAGE = '''\tgetbooru 1.0
A simple API-based Gelbooru crawler.
Author: Jan 'jarainf' Rathner
http://github.com/jarainf/getbooru
Usage: getbooru [options] [tags]
Options:
-h --help Display this help page and exit.
-a --artist Filter by given artist.
-d --destination Specify a destination (default: cwd).
--delete-latest Delete the newest file on SIGINT/SIGTERM.
This is meant to prevent broken files caused
by incomplete downloads.
WARNING: Only use if downloading to a dedicated folder.
THIS MAY DELETE FILES YOU OR YOUR PC CREATED!
-f --file-format Specify a file format by its extension
(gif, jpg, jpeg, png)
Add a "-" in front to exclude this type.
-n --number-of-files Process n posts.
--no-verification File checksums will not be verified.
-q --quiet Suppress output.
-r --rating Filter by rating (e.g. "safe", "questionable", "explicit").
-s --size Filter by resolution (e.g. "1920x1080").
--score Filter by score (e.g. "<20", "20", ">20", ">=20", "<=20").
-t --total Guarantee that n pictures (see -n) will be downloaded.
-v --verbose Verbose mode.
'''
def _parseURL(url, total = False, n = 0):
global duplicates
global xmlerrors
images = []
if verbose:
print('Retrieving XML...')
xml = _getContent(url)
if xml is None:
print('XML data could not be retrieved.\nExiting...')
return False
try:
parsedXml = minidom.parseString(xml)
except:
xmlerrors += 1
print('XML was malformed, couldn\'t continue.\nExiting...')
return False
posts = parsedXml.getElementsByTagName('post')
if posts == []:
return False
for i in posts:
images += {(i.attributes['id'].value, i.attributes['file_url'].value, i.attributes['md5'].value)}
for id, location, md5 in images:
filetype = location.rsplit('.', 1)[1]
if filetype not in fformat:
continue
file = '%s.%s' % (id, filetype)
filename = os.path.join(destination, '%s' % (file))
if os.path.isfile(filename):
if verbose:
print('File "%s" is duplicate.' % (file))
if md5_checking and not _check_md5(filename, md5):
_delete_file(filename, 'File "%s" differs in checksum, but could not be deleted.' % (file))
print('File "%s" differs in checksum, redownloading.' % (file))
image = _getImage(location, filename, md5)
duplicates += 1
continue
image = _getImage(location, filename, md5)
if not downloads < n and total:
return False
return True
def _getContent(url):
global xmlerrors
data = None
try:
with urlopen(url) as response:
return response.read()
except URLError as e:
xmlerrors += 1
if not quiet:
print('URL: %s could not be opened.' % (url))
return data
def _getImage(url, file, md5 = '', errors = 0):
global downloads
global urlerrors
if verbose:
print('Downloading "%s"...' % (url))
try:
urlretrieve(url, file)
downloads += 1
except URLError as e:
urlerrors += 1
_delete_file(file, 'Image: %s could not be retrieved.' % (url))
if md5_checking and not _check_md5(file, md5):
_delete_file(file, 'Download "%s" differs in checksum, redownloading.' % (url))
errors += 1
if errors >= 5:
print('Download of "%s" failed checksum 5 times, aborting.' % (url))
_getImage(url, file, md5, errors)
def _usage():
print(USAGE)
def _signals(signum = None, frame = None):
global destination
global delete_latest
if delete_latest:
latest = _latest_changed_file(destination)
_delete_file(latest, 'Could not delete latest changed file. You should probably stop using Windows.')
print('Interrupted, terminating...')
sys.exit(0)
def _latest_changed_file(dir):
def list_files(dir):
for dirpath, dirnames, filenames in os.walk(dir):
for filename in filenames:
yield os.path.join(dirpath, filename)
return max(list_files(dir), key=os.path.getmtime)
def _check_md5(file, checksum):
m = md5_hasher()
m.update(open(file, 'rb').read())
return checksum == m.hexdigest()
def _delete_file(file, output):
try:
os.remove(file)
except:
if not quiet:
print(output)
else:
pass
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], 'a:d:f:hn:qr:s:tv', ['rating=', 'help', 'quiet', 'size=' ,'score=', 'artist=', 'file-format=', 'total', 'number-of-files=', 'destination=', 'delete-latest', 'no-verification', 'verbose'])
except getopt.GetoptError as e:
print(e)
_usage()
sys.exit(1)
global quiet
global delete_latest
global downloads
global duplicates
global xmlerrors
global urlerrors
global fformat
global destination
global md5_checking
global verbose
delete_latest = False
quiet = False
downloads = 0
duplicates = 0
xmlerrors = 0
urlerrors = 0
fformat = ('jpg', 'jpeg', 'gif', 'png')
destination = os.curdir
total = False
tags = '%s ' % (' '.join(args))
pages = 1
last = 0
inf = False
md5_checking = True
verbose = False
for opt, arg in opts:
if opt in ('-a', '--artist'):
tags += '%s ' % (arg)
elif opt in ('-d', '--destination'):
if arg != '':
if not os.path.exists(arg):
try:
os.makedirs(arg)
except:
print('Failed to create directory \"%s\", exiting.' % (destination))
sys.exit(1)
destination = arg
elif opt == '--delete-latest':
delete_latest = True
elif opt in ('-f', '--file-format'):
if arg in ('jpeg', 'jpg'):
fformat = ('jpeg', 'jpg')
elif arg == 'png':
fformat = 'png'
elif arg == 'gif':
fformat = 'gif'
elif arg in ('-jpg', '-jpeg'):
fformat = ('gif', 'png')
elif arg == '-png':
fformat = ('jpg', 'jpeg', 'gif')
elif arg == '-gif':
fformat = ('jpg', 'jpeg', 'png')
else:
print('Argument: \"%s\" not recognized, possible arguments:' % (arg))
print('jpg, jpeg, gif, png, -jpg, -jpeg, -gif, -png')
print('Note that \"jpg\" and \"jpeg\" are the same format and specifying one of them will also download the other.')
elif opt in ('-h', '--help'):
_usage()
sys.exit(0)
elif opt in ('-n', '--number-of-files'):
if arg != '' and arg.isdigit():
pages = int(arg) // 100
last = int(arg) % 100
elif arg == 'inf':
inf = True
else:
print('You have to specify a number for \"-n\", using default (100).')
pages = 1
last = 0
elif opt == '--no-verification':
md5_checking = False
elif opt in ('-q', '--quiet'):
quiet = True
elif opt in ('-r', '--rating'):
tags += 'rating:%s ' % (arg)
elif opt in ('-s', '--size'):
sarg = arg.split('x')
if len(sarg) == 2 and arg.replace('x', '').isdigit():
tags += 'width:%s height:%s ' % (sarg[0], sarg[1])
else:
print('Invalid size formatting - should be \"$WIDTHx$HEIGHT\" but got: %s\nExiting...' % (arg))
elif opt in ('-t', '--total'):
total = True
elif opt == '--score':
tags += 'score:%s ' % (arg)
elif opt in ('-v', '--verbose'):
verbose = True
else:
sys.exit(1)
for sig in [signal.SIGTERM, signal.SIGINT]:
signal.signal(sig, _signals)
url = '%s&tags=%s' % (GELBOORU, quote_plus(tags))
if not inf and (not total or fformat == ('jpg', 'jpeg', 'gif', 'png')):
done = False
for i in (j for j in range(pages) if not done):
done = not _parseURL('%s&limit=100&pid=%d' % (url, i))
if last != 0 and not done:
_parseURL('%s&limit=%d&pid=%d' % (url, last, pages))
elif not inf:
done = False
i = 0
while not done:
done = not _parseURL('%s&limit=100&pid=%d' % (url, i), True, pages * 100 + last)
i += 1
if (xmlerrors + urlerrors) == 0:
print('Done, total downloads: %03d.' % downloads)
else:
print('Done, total downloads: %03d. XML-Errors: %03d, URL-Errors: %03d' % (downloads, xmlerrors, urlerrors))
sys.exit(0)
else:
i = 0
while _parseURL('%s&limit=100&pid=%s' % (url, str(i))):
i += 1
if (xmlerrors + urlerrors) == 0:
print('Done, total downloads: %03d. Duplicates: %03d. Total: %03d.' % (downloads, duplicates, duplicates + downloads))
else:
print('Done, total downloads: %03d. Duplicates: %03d. Total: %03d. XML-Errors: %03d, URL-Errors: %03d.' % (downloads, duplicates, duplicates + downloads, xmlerrors, urlerrors))
if __name__ == "__main__":
main()