-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
80 lines (69 loc) · 3.67 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from config import *
from modules import pickle_helpers, parse_helpers
def main():
# save the text to a pickle file if we haven't already
pickle_helpers.save_text_from_html_div(
DATA_DIR, # directory that we're operating in
LAWS_HTML_FILENAME, # filename for the html file we want to read
LAWS_TEXT_FILENAME, # filename for the text list we will save
div_id=RESULTS_DIV_ID, # optional, id of the div we want within the HTML
force_update=False, # optional, set to True if you want to force a new file to be made
)
# read the pickle back in
text_list = pickle_helpers.read_pickle(LAWS_TEXT_PATH)
# create a list of law lists, e.g.
# [
# [law1_text1, law1_text2, law1_text3], # law1
# [law2_text1, law2_text2, law2_text3], # law2
# ...
# ]
last_law_start_index = 0
law_list = []
text_list_length = len(text_list)
for index, text in enumerate(text_list):
# the law header will be our anchor (i.e. 'AL HJR 298')
# so we can use that to parse each law, using STATES and two patterns from the config file
if index == text_list_length - 1: # but first, if we're at the end of the list, append the last law
next_law_start_index = index + 1
law_list.append(text_list[last_law_start_index:next_law_start_index])
text_split = text.split() # split the text by whitespace
# then see if we meet the conditions for a new law
if len(text_split) == 3: # there should be 3 elements to the law
# the first element should be in our states list
# the second element should be one or more UPPERCASE letters
# the third element should be one or more numbers
# if we meet those conditions, do things
if text_split[0] in STATES.keys() \
and INDEX_1_PATTERN.match(text_split[1]) \
and INDEX_2_PATTERN.match(text_split[2]):
next_law_start_index = index # the next law will start at this index
if last_law_start_index != 0: # if this isn't the start of the list, append the new law
# we append a list from the last_law_start_index to the next_law_start_index
law_list.append(text_list[last_law_start_index:next_law_start_index])
# set the next last law start index equal to the current next law start index.
# and, no, that phrasing doesn't make any sense. Just read it a few times and you'll get it.
last_law_start_index = next_law_start_index
# now we have a list of law-text-lists (law_list) and we need to parse them
parsed_laws = []
for law_text_list in law_list:
parsed_law = {
'id': parse_helpers.parse_id(law_text_list),
'status': parse_helpers.parse_status(law_text_list),
'summary': parse_helpers.parse_summary(law_text_list),
'date_of_last_action': parse_helpers.parse_date(law_text_list),
'date_status': parse_helpers.parse_date_status(law_text_list),
'author': parse_helpers.parse_author(law_text_list),
'party': parse_helpers.parse_party(law_text_list),
'add_authors': parse_helpers.parse_authors_add(law_text_list),
'add_author_party_count': parse_helpers.count_add_author_party(law_text_list),
'topics': parse_helpers.parse_topics(law_text_list),
}
parsed_laws.append(parsed_law)
for law in law_list:
if law[0] == 'IL H 5006':
print(law)
for law in parsed_laws:
if law['id'] == 'IL H 5006':
print(law)
if __name__ == '__main__':
main()