# data_comparer.py
import ast
import os
import warnings
from datetime import datetime

import pandas as pd


class HistoricDataComparer:
    '''
    A class to compare newly scraped data against a historical record set to
    identify novel entries.

    Attributes:
        historic_ids (set): Set of unique identifiers from previously observed data.
        current_datetime (str): Current date and time, formatted for file naming.
        current_date (str): Current date, formatted for data tagging.

    Constants:
        WP_IDS_FILEPATH (str): Filepath to the historic working paper identifiers.
            This is where the historic registry of paper IDs is stored.
        WP_DATA_FILEPATH (str): Filepath to the historic working paper data.
            This is where the titles, authors, abstracts, etc. of previously
            scraped working papers are stored.
        LOCAL_SCRAPE_OUTCOMES_FILEPATH (str): Filepath to the local scrape
            outcomes folder. This is where the data and ids of newly scraped
            working papers are stored. This folder is for local debugging and
            web scraping use only. It is not read by other portions of the
            repository.
    '''
    # Class-level constants
    # File path to the historic working paper identifiers
    WP_IDS_FILEPATH = 'data/wp_ids.txt'
    # File path to the historic working paper data
    WP_DATA_FILEPATH = 'data/wp_data.csv'
    # File path to the local scrape outcomes
    LOCAL_SCRAPE_OUTCOMES_FILEPATH = 'data/local_scrape_outcomes'
    def __init__(self):
        '''
        Initializes the HistoricDataComparer. Loads the historic set of
        identifiers from the file specified by WP_IDS_FILEPATH.
        '''
        # Load the contents of the historic working paper ids file,
        # which is stored as a Python set
        self.historic_ids = self.load_historic_ids()
        # Define the current date and time for use in the save_local_results()
        # and compare() methods
        self.current_datetime = datetime.now().strftime('%Y-%m-%d-%H%M')
        self.current_date = datetime.now().strftime('%m/%d/%Y')
    def load_historic_ids(self):
        '''
        Loads the set of historical identifiers from the file specified by
        WP_IDS_FILEPATH.

        Returns:
            set: A set of strings representing historic identifiers.

        Raises:
            FileNotFoundError: If the file specified by WP_IDS_FILEPATH does not exist.
        '''
        try:
            with open(self.WP_IDS_FILEPATH, 'r') as file:
                return ast.literal_eval(file.read())
        except FileNotFoundError:
            raise FileNotFoundError(f'File {self.WP_IDS_FILEPATH} not found.')
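    # The ids file is expected to contain a Python set literal, which
    # ast.literal_eval() parses back into a set without executing arbitrary
    # code. Hypothetical example contents of data/wp_ids.txt:
    #   {'wp-0001', 'wp-0002', 'wp-0003'}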
    def compare(self, df):
        '''
        Identifies novel entries by comparing the dataframe indices against
        the historic set.

        Parameters:
            df (pd.DataFrame): The dataframe with new data to compare.

        Returns:
            pd.DataFrame: A dataframe containing only the novel entries.
        '''
        # Define set of ids from recently scraped data
        recent_ids = set(df.index)
        # Determine the set of novel entry ids by subtracting the historic
        # set from the recent set
        novel_ids = recent_ids - self.historic_ids
        # Check if there are any novel entries
        if novel_ids:
            # Filter the recently scraped data for only the novel entries.
            # Copy so that adding the column below does not trigger a
            # SettingWithCopyWarning or mutate the caller's dataframe.
            novel_df = df.loc[list(novel_ids)].copy()
            # Add an estimated publication date column to the novel entries
            novel_df['est_PubDate'] = self.current_date
            return novel_df
        else:
            # Return an empty DataFrame if there are no novel entries, so the
            # method always returns a DataFrame
            return pd.DataFrame()
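    # Illustration with hypothetical ids: if df.index yields {'wp-1', 'wp-2'}
    # and historic_ids is {'wp-1'}, compare() returns only the 'wp-2' row,
    # tagged with today's date in its est_PubDate column.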
    def save_local_results(self, novel_df):
        '''
        Saves the novel entries in .csv, .txt, and .html formats into the
        directory specified by `LOCAL_SCRAPE_OUTCOMES_FILEPATH`.

        Note:
            Output files are saved in a directory specified by
            `LOCAL_SCRAPE_OUTCOMES_FILEPATH`, which is typically configured to
            be ignored by version control (.gitignore). This setup is intended
            for local debugging and temporary storage. See README for more
            details.

        Parameters:
            novel_df (pd.DataFrame): Data frame containing the novel entries,
                which are produced by applying the compare() method in this
                class to a data frame of recently scraped data.

        Outputs:
            .csv: Contains all novel entries with metadata. File is saved in
                the `LOCAL_SCRAPE_OUTCOMES_FILEPATH` directory with name
                `YYYY-MM-DD-HHMM-data.csv`.
            .html: Browser-viewable dashboard file with clickable links for
                each entry title. File is saved in the
                `LOCAL_SCRAPE_OUTCOMES_FILEPATH` directory with name
                `YYYY-MM-DD-HHMM-dashboard.html`.
            .txt: Contains the set of unique identifiers of the novel entries,
                formatted as a Python set for potential future debugging. File
                is saved in the `LOCAL_SCRAPE_OUTCOMES_FILEPATH` directory with
                name `YYYY-MM-DD-HHMM-ids.txt`.
        '''
        # Create a template filepath, which is used to save the .csv, .html,
        # and .txt files below
        filepath = os.path.join(self.LOCAL_SCRAPE_OUTCOMES_FILEPATH, self.current_datetime)
        # Save .csv
        # Write novel_df to file. Use utf-8 encoding to ensure special
        # characters are properly displayed.
        novel_df.to_csv(f'{filepath}-data.csv', encoding='utf-8')
        # Save .html
        # Copy novel_df and assign to df_html
        df_html = novel_df.copy()
        # Reset the index and drop the old index
        df_html = df_html.reset_index(drop=True)
        # Add hyperlinks to the titles
        df_html['Title'] = df_html.apply(lambda row: f'<a href="{row["Link"]}">{row["Title"]}</a>', axis=1)
        # Drop the 'Link' and 'Number' columns
        df_html = df_html.drop(['Link', 'Number'], axis=1)
        html_content = df_html.to_html(escape=False)
        # Write html_content to file. Use utf-8 encoding to ensure special
        # characters are properly displayed.
        with open(f'{filepath}-dashboard.html', 'w', encoding='utf-8') as file:
            file.write(html_content)
        # Save .txt
        # Define novel_ids using indices from novel_df
        novel_ids = set(novel_df.index)
        # Write the set of novel_ids to file
        with open(f'{filepath}-ids.txt', 'w') as file:
            file.write(str(novel_ids))
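    # For a run at a hypothetical timestamp of 2024-01-31 09:15, this method
    # would write 2024-01-31-0915-data.csv, 2024-01-31-0915-dashboard.html,
    # and 2024-01-31-0915-ids.txt into LOCAL_SCRAPE_OUTCOMES_FILEPATH.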
    def append_ids_to_historic(self, novel_df):
        '''
        Appends newly scraped working paper identifiers to the historic
        identifiers, then saves as a .txt file in `self.WP_IDS_FILEPATH`.
        File content is formatted as a Python set for easy retrieval and
        comparison.

        Parameters:
            novel_df (pd.DataFrame): Data frame containing the novel entries,
                which are produced by applying the compare() method in this
                class to a data frame of recently scraped data.
        '''
        # Define novel_ids using indices from novel_df
        novel_ids = set(novel_df.index)
        # Merge the novel ids into the historic set, then overwrite the file
        self.historic_ids |= novel_ids
        with open(self.WP_IDS_FILEPATH, 'w') as file:
            file.write(str(self.historic_ids))
    def append_data_to_historic(self, novel_df):
        '''
        Appends newly scraped working paper rows - containing Title, Author,
        Link, Date, Number, Abstract, est_PubDate, Source, and the unique
        identifier index - to the historic data. Saves as a .csv file in
        `self.WP_DATA_FILEPATH`.

        Parameters:
            novel_df (pd.DataFrame): Data frame containing the novel entries,
                which are produced by applying the compare() method in this
                class to a data frame of recently scraped data.
        '''
        try:
            # Read in the existing historic data
            existing_df = pd.read_csv(self.WP_DATA_FILEPATH)
            # Map the column order of the historic data to the column_order
            # variable
            column_order = existing_df.columns.tolist()
            # Apply the column order to novel_df
            novel_df = novel_df[column_order]
            # Append novel_df rows to the csv file at self.WP_DATA_FILEPATH
            novel_df.to_csv(self.WP_DATA_FILEPATH, mode='a', header=False, index=False, encoding='utf-8-sig')
        except FileNotFoundError:
            # If the file is not found, write a new file
            novel_df.to_csv(self.WP_DATA_FILEPATH, mode='w', header=True, index=False, encoding='utf-8-sig')
            warnings.warn(f'No existing file found at {self.WP_DATA_FILEPATH}. New file created.')
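

# Minimal usage sketch (a hedged example: the dataframe contents, identifier,
# and URL below are hypothetical, not real scraped data). Assumes the file at
# WP_IDS_FILEPATH already exists, since __init__ raises FileNotFoundError
# otherwise.
if __name__ == '__main__':
    comparer = HistoricDataComparer()
    # Hypothetical stand-in for a dataframe produced by the scraper; the
    # unique identifier serves as the index, as compare() expects.
    scraped_df = pd.DataFrame(
        {
            'Title': ['An Example Working Paper'],
            'Author': ['A. Author'],
            'Link': ['https://example.org/wp/0001'],
            'Date': ['01/01/2024'],
            'Number': ['0001'],
            'Abstract': ['An illustrative abstract.'],
            'Source': ['Example Source'],
        },
        index=['example-0001'],
    )
    novel_df = comparer.compare(scraped_df)
    if not novel_df.empty:
        comparer.save_local_results(novel_df)
        comparer.append_ids_to_historic(novel_df)
        comparer.append_data_to_historic(novel_df)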