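"""Fetch the Government of Canada news feeds (English and French), pair
entries across the two languages, and maintain a deduplicated
combined_news.csv.

Entries are matched on publication timestamp plus normalized minister
name, and each combined row is hashed so that repeated runs append only
genuinely new items.
"""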
import hashlib
import re

import pandas as pd
import requests

# Function to fetch JSON data from a URL
def fetch_json_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    return None

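# Expected payload shape, inferred from the field access below (the live
# canada.ca feed may carry additional fields):
#   {"data": [{"PUBDATE": "...", "TITLE": "<a href='...'>...</a>",
#              "MINISTER": "Hon. ...", "TEASER": "...", ...}]}
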
# Function to clean text by removing newline characters
def clean_text(text):
    # Guard against non-string values in the JSON payload
    return text.replace('\n', ' ').replace('\r', ' ') if isinstance(text, str) else text

# Function to separate text from URL in the TITLE fields, using a regex
# to pick apart the anchor markup embedded in the feed
def extract_title_url(entry):
    title_text_en = ""
    title_url_en = ""
    title_text_fr = ""
    title_url_fr = ""
    # Separate English title and URL
    if entry.get('TITLE_EN'):
        match_en = re.search(r"<a href='(.*?)'>(.*?)</a>", entry['TITLE_EN'])
        if match_en:
            title_url_en = match_en.group(1)
            title_text_en = match_en.group(2)
        else:
            title_text_en = entry['TITLE_EN']
    # Separate French title and URL
    if entry.get('TITLE_FR'):
        match_fr = re.search(r"<a href='(.*?)'>(.*?)</a>", entry['TITLE_FR'])
        if match_fr:
            title_url_fr = match_fr.group(1)
            title_text_fr = match_fr.group(2)
        else:
            title_text_fr = entry['TITLE_FR']
    entry['TITLE_TEXT_EN'] = clean_text(title_text_en)
    entry['TITLE_URL_EN'] = clean_text(title_url_en)
    entry['TITLE_TEXT_FR'] = clean_text(title_text_fr)
    entry['TITLE_URL_FR'] = clean_text(title_url_fr)
    return entry

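# Example (hypothetical entry): {"TITLE_EN": "<a href='https://example.gc.ca/x'>Title</a>"}
# yields TITLE_TEXT_EN == "Title" and TITLE_URL_EN == "https://example.gc.ca/x".
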
# Function to normalize Minister names by stripping the honorific prefix
def normalize_minister_name(name):
    if name.startswith("Hon. "):
        return name.replace("Hon. ", "")
    elif name.startswith("L'hon. "):
        return name.replace("L'hon. ", "")
    return name

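# Example (hypothetical name): normalize_minister_name("Hon. Jane Doe") -> "Jane Doe"
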
# Function to compute an MD5 hash of a row, used for duplicate detection
def hash_row(row):
    # astype(str) renders missing values as the string 'nan', so every
    # cell joins cleanly into one hashable string
    row_str = ','.join(row.astype(str))
    return hashlib.md5(row_str.encode()).hexdigest()

# Main function to fetch both feeds, pair the entries, and return the DataFrame
def main():
    url_en = "https://www.canada.ca/en/news.datatable.json"
    url_fr = "https://www.canada.ca/fr/nouvelles.datatable.json"
    # Fetch the JSON data
    data_en = fetch_json_data(url_en)
    data_fr = fetch_json_data(url_fr)
    if data_en is None or data_fr is None:
        print("Failed to fetch data from URLs.")
        return None
    news_en = data_en['data']
    news_fr = data_fr['data']
    # Normalize Minister names in both datasets and clean all text fields
    for entry in news_en:
        entry['MINISTER_EN'] = normalize_minister_name(clean_text(entry.get('MINISTER', '')))
        for key in entry:
            entry[key] = clean_text(entry[key])
    for entry in news_fr:
        entry['MINISTER_FR'] = normalize_minister_name(clean_text(entry.get('MINISTER', '')))
        for key in entry:
            entry[key] = clean_text(entry[key])
    # Pair each English entry with the French entry that shares its
    # publication timestamp and normalized minister name
    combined_data = []
    for en in news_en:
        match_fr = None
        for fr in news_fr:
            if (en.get("PUBDATE") == fr.get("PUBDATE") and
                    normalize_minister_name(en.get("MINISTER", "")) == normalize_minister_name(fr.get("MINISTER", ""))):
                match_fr = fr
                break
        if match_fr:
            combined_entry = {
                "PUBDATE": en.get("PUBDATE", ""),
                "TITLE_EN": en.get("TITLE", ""),
                "TEASER_EN": en.get("TEASER", ""),
                "ADDITIONAL_TOPICS_EN": en.get("ADDITIONAL_TOPICS", ""),
                "AUDIENCE_EN": en.get("AUDIENCE", ""),
                "TYPE_EN": en.get("TYPE", ""),
                "DEPT_EN": en.get("DEPT", ""),
                "LOCATION_EN": en.get("LOCATION", ""),
                "MINISTER_EN": en.get("MINISTER", ""),
                "TOPIC_EN": en.get("TOPIC", ""),
                "SUBJECT_EN": en.get("SUBJECT", ""),
                "TITLE_FR": match_fr.get("TITLE", ""),
                "TEASER_FR": match_fr.get("TEASER", ""),
                "ADDITIONAL_TOPICS_FR": match_fr.get("ADDITIONAL_TOPICS", ""),
                "AUDIENCE_FR": match_fr.get("AUDIENCE", ""),
                "TYPE_FR": match_fr.get("TYPE", ""),
                "DEPT_FR": match_fr.get("DEPT", ""),
                "LOCATION_FR": match_fr.get("LOCATION", ""),
                "MINISTER_FR": match_fr.get("MINISTER", ""),
                "TOPIC_FR": match_fr.get("TOPIC", ""),
                "SUBJECT_FR": match_fr.get("SUBJECT", "")
            }
            combined_data.append(extract_title_url(combined_entry))
    if not combined_data:
        print("No matching EN/FR entries found.")
        return None
    # Create a DataFrame
    df_combined = pd.DataFrame(combined_data)
    # Reorder the columns to put the separated TITLE_TEXT and TITLE_URL fields first
    ordered_columns = [
        "PUBDATE",
        "TITLE_TEXT_EN", "TITLE_URL_EN", "TITLE_TEXT_FR", "TITLE_URL_FR",
        "TEASER_EN", "TEASER_FR",
        "ADDITIONAL_TOPICS_EN", "ADDITIONAL_TOPICS_FR",
        "AUDIENCE_EN", "AUDIENCE_FR",
        "TYPE_EN", "TYPE_FR",
        "DEPT_EN", "DEPT_FR",
        "LOCATION_EN", "LOCATION_FR",
        "MINISTER_EN", "MINISTER_FR",
        "TOPIC_EN", "TOPIC_FR",
        "SUBJECT_EN", "SUBJECT_FR"
    ]
    # .copy() avoids pandas' SettingWithCopyWarning on the assignments below
    df_combined_ordered = df_combined[ordered_columns].copy()
    # Sort by PUBDATE descending
    df_combined_ordered['PUBDATE'] = pd.to_datetime(df_combined_ordered['PUBDATE'])
    df_combined_ordered.sort_values(by='PUBDATE', ascending=False, inplace=True)
    return df_combined_ordered

if __name__ == "__main__":
    new_data = main()
    if new_data is not None:
        # Load the existing CSV and merge, deduplicating on the row hash
        existing_csv_path = 'combined_news.csv'
        try:
            existing_data = pd.read_csv(existing_csv_path)
            existing_data['PUBDATE'] = pd.to_datetime(existing_data['PUBDATE'])
            # Backfill the 'hash' column if the existing data predates it
            if 'hash' not in existing_data.columns:
                existing_data['hash'] = existing_data.apply(hash_row, axis=1)
                print("Backfilled 'hash' column for existing data.")
            # Compute the hash for the new data
            new_data['hash'] = new_data.apply(hash_row, axis=1)
            # Combine old and new rows
            combined_data = pd.concat([existing_data, new_data], ignore_index=True)
            # Remove duplicates based on 'hash', keeping the newest copy
            combined_data.drop_duplicates(subset='hash', keep='last', inplace=True)
        except FileNotFoundError:
            # If the file does not exist, use the new data as the combined data
            combined_data = new_data
            combined_data['hash'] = combined_data.apply(hash_row, axis=1)
            print("Created new data with 'hash' column.")
        # Sort by PUBDATE descending
        combined_data.sort_values(by='PUBDATE', ascending=False, inplace=True)
        # Move the 'hash' column to the first position
        columns = combined_data.columns.tolist()
        columns.insert(0, columns.pop(columns.index('hash')))
        combined_data = combined_data[columns]
        # Save the updated CSV file
        combined_data.to_csv(existing_csv_path, index=False)
        print(f"Updated combined CSV saved to {existing_csv_path}")