-
Notifications
You must be signed in to change notification settings - Fork 0
/
absence_data_scraper.py
67 lines (54 loc) · 2.32 KB
/
absence_data_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import json
from typing import Union, Any
import requests
from bs4 import BeautifulSoup
from constants import app_constants
from settings import app_settings
def process_html(content: bytes) -> dict[str, Any]:
    """Parse an attendance page and extract per-deputy presence records.

    Args:
        content: Raw HTML bytes of the attendance page.

    Returns:
        Dict with the page 'title' and 'data', a list of rows each holding
        'deputado' (name), 'partido' (party) and 'is_present' (bool).
    """
    attendance_data = []
    soup = BeautifulSoup(content, 'html5lib')
    title = soup.find('span', attrs={'id': app_constants.ATTENDANCE_PAGE_TITLE}).text
    # Attendance table container; each matching div row is one deputy's record.
    table = soup.find('div', attrs={'id': app_constants.ATTENDANCE_TABLE_ID})
    for row in table.findAll('div', attrs={'class': 'row margin_h0 margin-Top-15'}):
        person_name = row.find('a').text
        person_info = row.find_all('span', attrs={'class': 'TextoRegular'})
        attendance_data.append({
            'deputado': person_name,
            'partido': person_info[0].text,
            # Literal marker used by the page: 'Presença (P)' means present.
            'is_present': person_info[1].text == 'Presença (P)',
        })
    return {'title': title, 'data': attendance_data}
class AbsenceScraper:
    """Scrapes the attendance page for one bulletin id (bid) and either
    saves the parsed result to data/<bid>.json or returns it directly,
    depending on the app settings."""

    # Seconds before a request is aborted. Without an explicit timeout,
    # requests waits forever and the Timeout handler below can never fire.
    REQUEST_TIMEOUT = 30

    def __init__(self, bid):
        self.bid = bid
        self.url = app_constants.ATTENDANCE_TABLE_URL + bid
        self.is_save_files = app_settings.is_to_save_files

    def get_content(self):
        """Fetch the attendance page.

        Returns:
            The raw response body as bytes, or None if the request failed
            (the error is reported on stdout).
        """
        try:
            r = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
            print("Page exists.")
            return r.content
        except requests.exceptions.Timeout as e:
            print("Timeout error: {0}".format(e))
        except requests.exceptions.TooManyRedirects as e:
            print("Try a different URL: {0}".format(e))
        except requests.exceptions.RequestException as e:
            print("Unexpected error: {0}".format(e))

    def save_json(self, content: dict[str, Union[str, list[dict[str, Union[bool, Any]]]]]):
        """Write the processed attendance data to data/<bid>.json.

        Bug fix: the original called json.dumps() and then json.dump()ed the
        resulting string, double-encoding the payload so the file contained a
        JSON *string* instead of a JSON object. ensure_ascii=False keeps
        Portuguese names readable in the file.
        """
        with open('data/' + self.bid + '.json', 'w', encoding='utf-8') as fp:
            json.dump(content, fp, indent=4, ensure_ascii=False)

    def scrape_data(self):
        """Scrape and parse the page; save to disk or return the result.

        Returns:
            The parsed dict when saving is disabled; otherwise None (the
            result is written to data/<bid>.json, skipping existing files).
        """
        if not self.is_save_files:
            return process_html(self.get_content())
        target = 'data/' + self.bid + '.json'
        if os.path.exists(target):
            print('File already exists, no need to re-scrape it.')
            return
        page_content = self.get_content()
        if page_content is None:
            # Request failed (already reported by get_content); nothing to save.
            return
        self.save_json(process_html(page_content))