scrape_shogitown.py
# Scrapes all tsume problems from shogitown.com and dumps them to links_shogitown.txt (one JSON object per line).
# Meant to be run on a local machine, not in deployment.
import asyncio
from pyppeteer import launch
import json
import re
urls = [
    'https://www.shogitown.com/beginner/1tetume/1tetume.html',
    'https://www.shogitown.com/beginner/tume/tume.html',
    'https://www.shogitown.com/beginner/5te_2004/5te_tume.html'
]
blacklist = [
    'https://www.shogitown.com/index.html',
    'https://www.shogitown.com/beginner/top-b.html',
    'https://www.shogitown.com/beginner/1tetume/1tetume01.html',
    'https://www.shogitown.com/beginner/1tetume/1tetume.html',
    'https://www.shogitown.com/beginner/tume/tume.html',
    'https://www.shogitown.com/beginner/5te_2004/5te_tume.html'
]
# If the program freezes or stops for any reason, copy the last printed URL here and run the program again to resume.
last = ''
async def scrape_shogitown():
    # Skip problems up to (and including) `last` so an interrupted run can resume.
    willSkip = True
    if not last:
        willSkip = False
    browser = await launch()
    fo = open('links_shogitown.txt', 'a')
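    # Walk each index page and collect the links it contains; anything not in the
    # navigation-page blacklist is treated as a problem page.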
    for url in urls:
        page = await browser.newPage()
        await page.goto(url)
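        # Evaluate JS in the page: collect every <a> href, resolved to an absolute URL
        # against the page's base URI.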
        links = await page.querySelectorAllEval('a', 'elems => elems.map(elem => elem.getAttribute("href")).map(elem => new URL(elem, document.baseURI).href)')
        await page.close()
        tsumeLinks = [link for link in links if link not in blacklist]
        print(tsumeLinks)
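        # Visit each problem page in turn, pulling the diagram images and the link to its answer page.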
        for x, tsumeLink in enumerate(tsumeLinks):
            if last == tsumeLink:
                willSkip = False
                continue
            if willSkip:
                continue
            page = await browser.newPage()
            await page.goto(tsumeLink)
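            # Collect every <img> src on the problem page as an absolute URL, keeping only
            # those ending in "gif" (the problem diagrams).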
            tsumeImageLinks = await page.querySelectorAllEval('img', 'elems => elems.map(elem => elem.getAttribute("src")).map(elem => new URL(elem, document.baseURI).href).filter(elem => elem.endsWith("gif"))')
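            # Find the anchor whose text is 解答を見る ("view the answer") and resolve its href.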
            answerPageLink = await page.querySelectorAllEval('a', 'elems => Array.from(elems).filter(elem => elem.innerHTML == "解答を見る").map(elem => elem.getAttribute("href")).map(elem => new URL(elem, document.baseURI).href)')
            await page.close()
            page = await browser.newPage()
            await page.goto(answerPageLink[0])
            htmlContent = await page.content()
            await page.close()
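            # The answer page lists blocks like 第N問 ("Problem N") <br> move sequence <br>
            # まで〇手詰 ("mate in N moves"); the regex captures just the move-sequence text.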
            answersRe = re.findall(r'第.*?問(?:<br>\n.*?)?<br>(\n.*?)((まで(1|3|5)手詰)?。|まで)<br>', htmlContent)
            answers = [answer[0] for answer in answersRe]
            if len(tsumeImageLinks) != len(answers):
                raise ValueError(f'Mismatch: {len(tsumeImageLinks)} diagrams vs {len(answers)} answers for {tsumeLink}')
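            # Pair each diagram with its answer and append one JSON object per problem
            # (question image URL, answer text, and the index page it came from).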
            for tsumeImageLink, answer in zip(tsumeImageLinks, answers):
                jsonObj = {'question': tsumeImageLink, 'answer': answer.strip(), 'source': url}
                fo.write(json.dumps(jsonObj) + '\n')
            print(tsumeLink)
            print(x + 1, '/', len(tsumeLinks))
    await browser.close()
    fo.close()
    print('Done')
async def main():
    await scrape_shogitown()
asyncio.get_event_loop().run_until_complete(main())