scraper.py
import time
from datetime import datetime

import pytz
import requests
from requests_html import HTML

# Timestamp (Helsinki time) is recorded once at import and attached to each run.
tz = pytz.timezone("Europe/Helsinki")
now = datetime.now(tz)
timestamps = now.strftime("%d/%m/%Y %H:%M:%S")

base_url = "https://stackoverflow.com/questions/tagged/"
def clean_scraped_data(text, keyname=None):
    """Normalize the raw text of a single scraped field."""
    if keyname == "votes":
        return text.replace("\nvotes", "")
    if keyname == "tags":
        return text.replace("\n", ", ")
    # "summary" and any other field pass through unchanged.
    return text
def parse_tagged_page(html):
    """Parse one tag-listing page into a list of question dicts."""
    question_summaries = html.find(".s-post-summary")
    # "links" is derived from the summary element's id, not from a CSS class,
    # so it is filled in separately below.
    key_names = ["question", "summary", "answers", "votes", "tags"]
    classes_needed = [
        ".s-link",
        ".s-post-summary--content-excerpt",
        ".s-post-summary--stats-item.has-answers",
        ".s-post-summary--stats-item-number",
        ".tags",
    ]
    datas = []
    for q_el in question_summaries:
        question_data = {}
        # Element ids look like "question-summary-<question_id>".
        question_summary_id = q_el.attrs["id"].split("-")[2]
        for keyname, cls in zip(key_names, classes_needed):
            sub_el = q_el.find(cls, first=True)
            if keyname == "answers":
                if sub_el is None:
                    question_data["answers"] = 0
                else:
                    a_sub_el = sub_el.find(".s-post-summary--stats-item-number", first=True)
                    question_data["answers"] = a_sub_el.text
            elif sub_el is not None:  # skip fields whose element is missing
                question_data[keyname] = clean_scraped_data(sub_el.text, keyname=keyname)
        question_data["links"] = f"https://stackoverflow.com/questions/{question_summary_id}"
        datas.append(question_data)
    return datas
def extract_data_from_url(url):
    """Fetch a page and parse it; return [] on any non-2xx response."""
    r = requests.get(url)
    if r.status_code not in range(200, 300):
        return []
    html = HTML(html=r.text)
    return parse_tagged_page(html)
def scrape_tag(tag="python", query_filter="Unanswered", max_pages=1):
    """Scrape up to max_pages listing pages for a single tag."""
    datas = []
    for p in range(max_pages):
        url = f"{base_url}{tag}?tab={query_filter}&page={p + 1}"
        print(url)
        datas += extract_data_from_url(url)
        time.sleep(1.2)  # pause between requests to avoid hammering the site
    return datas
def extract_data(tags=None, max_pages=1):
    """Scrape every tag in `tags`, prefixing each run with its timestamp."""
    if tags is None:
        tags = ["python"]
    scraped_data = []
    for tag in tags:
        datas = scrape_tag(tag=tag, max_pages=max_pages)
        datas.insert(0, {"timestamps": timestamps})
        scraped_data += datas  # accumulate across tags instead of overwriting
    return scraped_data
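

# A minimal usage sketch, assuming the module is run directly: scrape one page
# of unanswered "python" questions and print the first few records. The
# pprint import and the [:3] slice are illustrative choices, not part of the
# scraping logic above.
if __name__ == "__main__":
    from pprint import pprint

    results = extract_data(tags=["python"], max_pages=1)
    pprint(results[:3])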