-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
49 lines (39 loc) · 1.24 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import http.client
import zipfile
import pptx
from parsers import get_pptx_slides
# Host and path of the index page whose links are scraped.
base = "antares.cs.kent.edu"
req = "/~seminar/"


def _fetch_index(host, path):
    """Return the body of ``GET path`` on *host*, decoded as UTF-8.

    The response status is not inspected; whatever body the server sends
    back is returned.

    Raises:
        OSError / http.client.HTTPException: on connection or protocol
            failure.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    conn = http.client.HTTPConnection(host)
    try:
        conn.request("GET", path)
        return conn.getresponse().read().decode("utf-8")
    finally:
        # Release the socket even when the request or decode fails.
        conn.close()


def _slides_link(line):
    """Return the slides link found on one index line, or ``None`` to skip it.

    A line is accepted only when it contains ``<p><b>``, carries exactly two
    ``<a href="...">`` links, the second link mentions ``.pptx`` and the
    first link mentions ``abs`` or ``pdf``. Spaces inside the link are
    replaced with underscores before it is returned.
    """
    if "<p><b>" not in line:
        return None
    hrefs = [
        chunk.split('"')[0].replace(" ", "_")
        for chunk in line.split('<a href="')[1:]
    ]
    if len(hrefs) != 2:
        return None
    paper, slides = hrefs
    if ".pptx" not in slides:
        return None
    # An "abs" link used to be rewritten to "pdf" before the "pdf" check, so
    # either marker is enough for the line to qualify.
    if "abs" not in paper and "pdf" not in paper:
        return None
    return slides


def _write_record(out, slide):
    """Write one slide to *out* as ``<body>``, ``TITLE:``, ``<title>``, ``--SEP--``.

    The first line of *slide* is the title; everything after the first
    newline is the body (empty when the slide has a single line).
    """
    title, _, body = slide.partition("\n")
    out.write(f"{body}\nTITLE:\n{title}\n--SEP--\n")


def main():
    """Scrape the index page and dump slide bodies/titles to the training file.

    Every qualifying slides link is handed to ``get_pptx_slides``; links for
    which it raises ``zipfile.BadZipFile`` are skipped, as are slides shorter
    than four characters. Output goes to ``train_data/summary_title.txt``,
    which is overwritten.
    """
    page = _fetch_index(base, req)
    # The context manager guarantees the file is flushed and closed, even if
    # a later download or parse step raises.
    with open("train_data/summary_title.txt", "w") as out:
        for line in page.split("\n"):
            link = _slides_link(line)
            if link is None:
                continue
            try:
                slides = get_pptx_slides(link, ask_desc=False)
            except zipfile.BadZipFile:
                # Not a readable .pptx archive; move on to the next entry.
                continue
            for slide in slides:
                if len(slide) < 4:
                    continue
                _write_record(out, slide)


if __name__ == "__main__":
    main()