-
Notifications
You must be signed in to change notification settings - Fork 0
/
checkio_downloader.py
244 lines (205 loc) · 10.5 KB
/
checkio_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/usr/bin/env python3
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
####################################################################################
#
# A Python script to download solutions from https://checkio.org
#
# tested on Python 3.6+
# requires: Beautiful Soup (https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
#
# Christoph Haunschmidt, started 2019-09
import argparse
import datetime
import os
import re
import string
import sys
import urllib.parse
import urllib.request
from collections import defaultdict, namedtuple
from bs4 import BeautifulSoup
# Request headers sent by get_url(). The User-Agent is a desktop Chrome
# string; presumably the site does not serve the default urllib agent
# (NOTE(review): confirm).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
}


def get_url(url):
    """Fetch *url* with the module's ``HEADERS`` and return the body.

    Args:
        url: The URL to request.

    Returns:
        The raw, undecoded response body as ``bytes``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (e.g. ``URLError``).
    """
    request = urllib.request.Request(url, headers=HEADERS)
    # Use the response as a context manager so the underlying connection is
    # closed deterministically instead of whenever it gets garbage collected.
    with urllib.request.urlopen(request) as resp:
        return resp.read()
def get_schema_and_domain_from_url(url):
    """Return the ``scheme://netloc`` prefix of *url*.

    Path, query string and fragment are dropped.
    """
    parts = urllib.parse.urlsplit(url)
    scheme, netloc = parts.scheme, parts.netloc
    return f'{scheme}://{netloc}'
class CheckIOSolution:
    """A single shared CheckiO solution, identified by its share URL.

    The fields named in ``FIELDS_BY_URL`` are extracted from the URL in
    ``__init__``. The fields named in ``FIELDS_BY_HTML`` are filled in lazily:
    the solution page is downloaded and parsed by ``process_url()`` the first
    time one of them is read.
    """

    class InvalidFormatKeyError(Exception):
        """Raised by ``filename()`` when the format string uses an unknown key."""

    FIELDS_BY_HTML = set('html posted_at solution_source solution_title mission_title solution_category'.split())
    FIELDS_BY_URL = set('url extension mission_title_slug user_name solution_title_slug solution_hash'.split())
    ALL_FIELDS = FIELDS_BY_HTML | FIELDS_BY_URL
    # Keys allowed in a filename format string (everything except the bulky
    # page HTML and the solution source itself).
    FORMAT_STR_DICT_FIELDS = ALL_FIELDS - {'html', 'solution_source'}

    # NOTE(review): the class ``[a-z0-f]`` contains the code-point range
    # ``0``..``f``, which also covers some punctuation, not only hex digits.
    # It is kept as-is so exactly the same URLs are accepted as before.
    SOLUTION_URL_RE = re.compile(r'^(?:http(s)?:\/\/)?'
                                 r'(?P<extension>\w+)\.checkio\.org/mission/'
                                 r'(?P<mission_title_slug>[\w\-]+)/publications/'
                                 r'(?P<user_name>[^/]+)/[^/]+/'
                                 r'(?P<solution_title_slug>[^/]+)/share/'
                                 r'(?P<solution_hash>[a-z0-f]+)/?$', re.IGNORECASE)

    DEFAULT_FORMAT_STR = '{mission_title_slug}.{extension}'

    def __init__(self, url):
        """Parse *url* into its URL-derived fields; no network access happens.

        Raises:
            ValueError: If *url* does not match ``SOLUTION_URL_RE``.
        """
        m = CheckIOSolution.SOLUTION_URL_RE.match(url)
        if not m:
            raise ValueError(f'Is not a valid solution URL: {url}')
        url_parts = m.groupdict()
        for field in CheckIOSolution.FIELDS_BY_URL:
            # 'url' is not a regex group; it gets '' here and is set below.
            setattr(self, field, url_parts.get(field, ''))
        self.url = url
        self.extension = self.extension.lower()
        self._processed = False

    def process_url(self):
        """Download the solution page and fill in the ``FIELDS_BY_HTML`` values.

        The values are read from the page's ``<noscript>`` section. An
        ``IndexError`` propagates if an expected element is missing.
        """
        self.html = get_url(self.url)
        soup = BeautifulSoup(self.html, features='html.parser')
        date_str = soup.select('noscript p[style="text-align: center;"]')[0].get_text(strip=True)
        # NOTE(review): '%B' parses the month name according to the current
        # locale, so this expects a locale with English month names.
        self.posted_at = datetime.datetime.strptime(date_str, '%B %d, %Y').date()
        links = soup.select('noscript p > a')
        self.mission_title = links[1].get_text(strip=True)
        self.solution_title = soup.select('noscript p > b')[0].get_text(strip=True)
        self.solution_category = links[0].get_text(strip=True)
        self.solution_source = soup.select('noscript pre[class^=brush]')[0].get_text()
        self._processed = True

    def __getattr__(self, name):
        """Lazily load HTML-derived fields on first access.

        Python calls this only when normal attribute lookup has failed. For
        a ``FIELDS_BY_HTML`` name the page is processed once and the lookup
        is retried; any other missing name raises ``AttributeError``.
        """
        if name in CheckIOSolution.FIELDS_BY_HTML and not self._processed:
            self.process_url()
        return self.__getattribute__(name)

    def filename(self, format_str=None):
        """Return the output filename built from *format_str*.

        Args:
            format_str: A ``str.format`` style template whose keys must be
                in ``FORMAT_STR_DICT_FIELDS``. Falsy values select
                ``DEFAULT_FORMAT_STR``.

        Raises:
            CheckIOSolution.InvalidFormatKeyError: If an unknown key is used.
        """
        format_str = format_str or CheckIOSolution.DEFAULT_FORMAT_STR
        # Formatter.parse() yields field_name=None for literal text that is
        # not followed by a replacement field (e.g. the '.py' in '{x}.py').
        # Those entries are not keys and must not be validated or looked up.
        needed_fields = {
            field_name
            for _, field_name, _, _ in string.Formatter().parse(format_str)
            if field_name is not None
        }
        invalid_fields = needed_fields - CheckIOSolution.FORMAT_STR_DICT_FIELDS
        if invalid_fields:
            # Sorted so the message is deterministic.
            raise CheckIOSolution.InvalidFormatKeyError(
                'Invalid format key: "{}"'.format('", "'.join(sorted(invalid_fields))))
        return format_str.format_map({field: getattr(self, field) for field in needed_fields})

    @property
    def source_code(self):
        """The solution's source text (triggers the page download if needed)."""
        return self.solution_source

    def __str__(self):
        """Return the URL-derived fields as ``name="value"`` lines, sorted by name."""
        return '\n'.join(
            '{0}="{1}"'.format(field, getattr(self, field))
            for field in sorted(self.FIELDS_BY_URL)
        )
class CheckIODownloader:
    """Download the solution(s) behind a CheckiO share URL into a directory.

    *url* is either a single solution share URL or a user's shared-solutions
    URL. The list of solutions to fetch is resolved in ``__init__`` (which
    requests the overview page for a user URL); ``process_solution_urls()``
    then writes the files and collects statistics in ``self.stats``.
    """

    SolutionMeta = namedtuple('SolutionMeta', 'url mission_title solution_title')

    USER_SOLUTIONS_URL_RE = re.compile(r'^(?:http(s)?:\/\/)?'
                                       r'(?P<extension>\w+)\.checkio\.org/user/'
                                       r'(?P<user_name>[^/]+)/solutions/share/'
                                       r'(?P<user_hash>[a-z0-f]+)/?$', re.IGNORECASE)

    def __init__(self, url, output_directory, filename_format='', overwrite=False, dry_run=False):
        """Store the options and resolve *url* into ``self.solutions_meta``.

        Args:
            url: Single solution URL or user solutions share URL.
            output_directory: Directory the solution files are written to.
            filename_format: Format string passed to
                ``CheckIOSolution.filename()``; empty selects its default.
            overwrite: Overwrite files that already exist.
            dry_run: Report what would happen without writing any file.

        Raises:
            ValueError: If *url* matches neither supported URL form.
        """
        self.url = url
        self.output_directory = output_directory
        self.filename_format_str = filename_format
        self.overwrite = overwrite
        self.dry_run = dry_run
        self.stats = defaultdict(set)
        self.solutions_meta = []
        self.solutions = []
        self.parse_solution_urls()

    def parse_solution_urls(self):
        """Fill ``self.solutions_meta`` from ``self.url`` and return the list.

        A single solution URL yields one entry built from the URL alone. A
        user solutions URL is downloaded and one entry is created per
        progress row found on the page.

        Raises:
            ValueError: If ``self.url`` matches neither URL pattern.
        """
        self.solutions_meta = []
        m = CheckIOSolution.SOLUTION_URL_RE.match(self.url)
        if m:
            self.solutions_meta = [CheckIODownloader.SolutionMeta(
                url=self.url, mission_title=m.group('mission_title_slug'),
                solution_title=m.group('solution_title_slug'))]
            # Return the list here as well, so both URL forms behave alike.
            return self.solutions_meta

        if not CheckIODownloader.USER_SOLUTIONS_URL_RE.match(self.url):
            raise ValueError(f'Not a user solutions URL: {self.url}')

        soup = BeautifulSoup(get_url(self.url), features='html.parser')
        for row in soup.select('div.block_progress.block_progress__container'):
            anchor = row.select('div.block_progress_main.block_progress__row a')[0]
            url = anchor['href']
            if url.startswith('/'):
                # Site-relative link: prefix scheme and host of the page URL.
                url = get_schema_and_domain_from_url(self.url) + url
            mission_title = row.select('span.block_progress_task.block_progress__row')[0].get_text(strip=True)
            self.solutions_meta.append(CheckIODownloader.SolutionMeta(
                url=url, solution_title=anchor.get_text(strip=True), mission_title=mission_title))
        return self.solutions_meta

    @staticmethod
    def write_file(output_file_full_path, content):
        """Write *content* to the given path; empty content writes nothing."""
        if content:
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent non-ASCII characters in a solution's source.
            with open(output_file_full_path, 'w', encoding='utf-8') as f:
                f.write(content)

    def process_solution_urls(self):
        """Download every solution in ``self.solutions_meta`` and print progress.

        Each solution is recorded in ``self.stats`` under 'existing', 'new',
        'overwritten' or 'error'. In a dry run the same messages and
        statistics are produced but no file is written. An invalid filename
        format key terminates the program with exit status 1.
        """
        total = len(self.solutions_meta)
        for i, solution_meta in enumerate(self.solutions_meta, start=1):
            try:
                title = '{0.mission_title} | {0.solution_title}'.format(solution_meta)[:50]
                # Flush so the prefix is visible while the download is running.
                print(f'[ {i:>4} / {total:<4} ] [ {title:50} ] ... ', end='', flush=True)
                solution = CheckIOSolution(solution_meta.url)
                to_file = os.path.join(self.output_directory, solution.filename(format_str=self.filename_format_str))
                if os.path.exists(to_file):
                    self.stats['existing'].add(solution)
                    if self.overwrite:
                        if not self.dry_run:
                            self.write_file(to_file, solution.source_code)
                        print('OK - already existing, overwritten')
                        self.stats['overwritten'].add(solution)
                    else:
                        print('OK - already existing, skipped')
                else:
                    if not self.dry_run:
                        self.write_file(to_file, solution.source_code)
                    print('OK - new')
                    self.stats['new'].add(solution_meta)
                self.solutions.append(solution)
            except CheckIOSolution.InvalidFormatKeyError as excp:
                print(f'FAIL: {excp}, QUITTING')
                sys.exit(1)
            except Exception as excp:
                print(f'FAIL: {excp}')
                # Record the metadata, not `solution`: when the constructor
                # itself failed, `solution` is unbound or still refers to the
                # previous iteration's object.
                self.stats['error'].add(solution_meta)

    def __str__(self):
        """Return a multi-line summary of the collected statistics."""
        result = ['CheckIODownloader Statistics{}'.format('' if not self.dry_run else ' (DRY RUN)')]
        result.append('Existing: {}'.format(len(self.stats.get('existing', []))))
        result.append('New: {}'.format(len(self.stats.get('new', []))))
        result.append('Overwritten: {}'.format(len(self.stats.get('overwritten', []))))
        result.append('Errors: {}'.format(len(self.stats.get('error', []))))
        return '\n'.join(result)
def main():
    """Parse the command line, run the downloader and print its statistics.

    Returns:
        The process exit status: 0 on success, 1 if setting up or running
        the downloader raised an exception.
    """
    parser = argparse.ArgumentParser(
        description='Parses and downloads CheckiO solutions',
        epilog='Written by Christoph Haunschmidt')

    parser.add_argument('url', metavar='CHECKIO_URL',
                        help='shareable user solutions (at checkio.org, go to "Profile" > "Progress" '
                             'and copy the share URL) or single solution URL')

    parser.add_argument('-o', '--output-directory', default=os.getcwd(),
                        help='output directory for writing solution source files '
                             '(default: the current working directory)')

    # The format string is the complete filename: nothing is appended to it,
    # so the help text says the extension is part of the format.
    parser.add_argument('--filename-format', default=CheckIOSolution.DEFAULT_FORMAT_STR,
                        metavar='FORMAT_IN_PYTHON_FORMAT_SYNTAX',
                        help="filename format for the solution files (including the extension). This is in Python's "
                             'string formatting syntax. All values are strings except "posted_at", which is of type '
                             '"datetime.date". Possible keys are: '
                             '"{}" (default: "%(default)s")'.format(
                                 '", "'.join(sorted(CheckIOSolution.FORMAT_STR_DICT_FIELDS))))

    parser.add_argument('--overwrite', action='store_true', default=False,
                        help='overwrite existing files')

    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='dry run, do not (over-)write files')

    args = parser.parse_args()

    try:
        # The argparse destinations match the CheckIODownloader keyword names.
        checkio_downloader = CheckIODownloader(**vars(args))
        checkio_downloader.process_solution_urls()
        # Statistics go to stderr, separate from the per-solution progress
        # lines printed on stdout.
        print(checkio_downloader, file=sys.stderr)
    except Exception as e:
        print('Error: {}'.format(e), file=sys.stderr)
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())