-
Notifications
You must be signed in to change notification settings - Fork 1
/
post_wrangler.py
142 lines (118 loc) · 4.4 KB
/
post_wrangler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import requests
import json
import logging
import datetime
import re
# Literal prefix stripped from every response body before it is passed to
# json.loads in the functions below.
# NOTE(review): presumably the anti-JSON-hijacking guard the server prepends
# to its JSON responses — confirm against the API being scraped.
remove = '])}while(1);</x>'
# Headers sent with each requests.get call below; requests a JSON response.
headers = {"Accept": "application/json"}
def get_publ_list(url):
    """Collect the articles of a publication by walking its archive.

    Fetches the archive index at ``url``, expands it into per-year and then
    per-month archive URLs, and pulls the posts listed on each of them.

    Args:
        url: Archive URL of the publication. It is also handed to the helper
            functions as the base for the year / month URLs they build.

    Returns:
        The list of article dicts produced by ``pull_publication_list``.
        An empty list is returned when the initial request fails or when any
        stage of the walk raises, so callers always get a list back instead
        of an ``UnboundLocalError``.
    """
    # Bound up front so every exit path has something to return.
    article_list = []

    r = requests.get(url, headers=headers)
    if not r.ok:
        logging.warning('https request is failing 1')
        return article_list

    raw_json = json.loads(r.text.replace(remove, ''))

    # Each stage feeds the next one; stop at the first failure rather than
    # letting later stages trip over names that were never assigned.
    try:
        years_list = get_year_posts(raw_json, url)
    except Exception:
        logging.warning('failed obtaining publication years', exc_info=True)
        return article_list

    try:
        url_list = get_month_posts(years_list, url)
    except Exception:
        logging.warning('failed obtaining posts per month', exc_info=True)
        return article_list

    try:
        article_list = pull_publication_list(url_list, url)
    except Exception:
        logging.warning('failed pulling entire article list', exc_info=True)
        return []

    return article_list
def get_year_posts(raw_json, url):
    """Build one archive URL per year that has stories.

    Args:
        raw_json: Parsed archive response; the yearly buckets are read from
            ``raw_json['payload']['archiveIndex']['yearlyBuckets']``.
        url: Base archive URL that each year is appended to.

    Returns:
        A list of ``url + '/' + year`` strings. When no bucket has stories
        the list contains a single URL for the current calendar year.
    """
    buckets = raw_json['payload']['archiveIndex']['yearlyBuckets']
    archive_urls = []

    for bucket in buckets:
        if not bucket['hasStories']:
            logging.warning('could not find years')
            continue
        archive_urls.append(url + '/' + bucket['year'])

    # Nothing usable in the index: fall back to the current year.
    if not archive_urls:
        archive_urls.append(url + '/' + str(datetime.datetime.now().year))

    return archive_urls
def get_month_posts(years_list, url):
    """Expand yearly archive URLs into monthly archive URLs.

    Each URL in ``years_list`` is fetched. When the response lists monthly
    buckets, one ``url/year/month`` entry is produced per bucket; when the
    bucket list is empty the yearly URL itself is kept. Years whose request
    does not succeed contribute nothing.

    Args:
        years_list: Yearly archive URLs to fetch.
        url: Base archive URL used to build the monthly URLs.

    Returns:
        A flat list of archive URLs to pull posts from.
    """
    archive_urls = []

    for year_url in years_list:
        response = requests.get(year_url, headers=headers)
        if not response.ok:
            continue

        parsed = json.loads(response.text.replace(remove, ''))
        monthly = parsed['payload']['archiveIndex']['monthlyBuckets']

        if len(monthly) == 0:
            # No monthly breakdown: the yearly page is the one to read.
            archive_urls.append(year_url)
        else:
            archive_urls.extend(
                url + '/' + bucket['year'] + '/' + bucket['month']
                for bucket in monthly
            )

    return archive_urls
def pull_publication_list(url_list, url):
    """Fetch every archive URL and gather the posts it references.

    Posts are read from ``payload.references.Post`` of each response and
    de-duplicated by their ``id``; the first occurrence wins.

    Args:
        url_list: Archive URLs to fetch.
        url: Base archive URL (not used by this function).

    Returns:
        A list of dicts with the keys ``id``, ``article_title``,
        ``article_subtitle`` and ``article_slug``.
    """
    articles = []
    known_ids = set()

    for archive_url in url_list:
        response = requests.get(archive_url, headers=headers)
        if not response.ok:
            logging.warning('https request is failing in pull_entire_list')
            continue

        parsed = json.loads(response.text.replace(remove, ''))
        for post in parsed['payload']['references']['Post'].values():
            entry = {
                "id": post["id"],
                "article_title": post["title"],
                "article_subtitle": post["previewContent"]["subtitle"],
                "article_slug": post["uniqueSlug"],
            }
            if entry["id"] in known_ids:
                logging.info('duplication found, will not be included in array')
                continue
            known_ids.add(entry["id"])
            articles.append(entry)

    return articles
def pull_user_list(url2):
    """Fetch the latest posts listed under ``url2``.

    Requests ``url2 + '/latest?format=json&limit=1000'`` and reads the posts
    from ``payload.references.Post`` of the response.

    Args:
        url2: Base URL that the ``/latest`` query is appended to.

    Returns:
        A list of dicts with the keys ``id``, ``article_title``,
        ``article_subtitle`` and ``article_slug``; an empty list when the
        request does not succeed.
    """
    def _as_entry(post):
        # Keep only the fields the rest of the module works with.
        return {
            "id": post["id"],
            "article_title": post["title"],
            "article_subtitle": post["previewContent"]["subtitle"],
            "article_slug": post["uniqueSlug"],
        }

    latest_url = url2 + '/latest?format=json&limit=1000'
    response = requests.get(latest_url, headers=headers)
    if not response.ok:
        logging.warning('https request to pull user list is failing')
        return []

    parsed = json.loads(response.text.replace(remove, ''))
    posts = parsed["payload"]["references"]["Post"]
    return [_as_entry(posts[post_key]) for post_key in posts]
def get_solo_article(data):
    """Wrap a single article payload in the module's list-of-dicts shape.

    Args:
        data: Mapping whose ``"value"`` entry holds the article's ``id``,
            ``title``, ``content.subtitle`` and ``uniqueSlug``.

    Returns:
        A one-element list holding a dict with the keys ``id``,
        ``article_title``, ``article_subtitle`` and ``article_slug``.
    """
    return [
        {
            "id": data["value"]["id"],
            "article_title": data["value"]["title"],
            "article_subtitle": data["value"]["content"]["subtitle"],
            "article_slug": data["value"]["uniqueSlug"],
        }
    ]
def is_single_article(arti_url):
    """Return the article payload when ``arti_url`` points at one article.

    The URL qualifies when its last 12 characters are all alphanumeric and
    the fetched JSON reports that same string as ``payload.value.id``.

    Args:
        arti_url: URL to test.

    Returns:
        The ``payload`` dict of the response when the URL is a single
        article, otherwise ``False``. Every negative outcome returns
        ``False``: a URL of the wrong shape, a failed request, a body that
        is not JSON, a missing id, or an id mismatch.
    """
    # Cheap local check first: a URL that does not end in a 12-character
    # alphanumeric id cannot match, so there is no reason to hit the network.
    rgx = re.search(r'([a-zA-Z0-9]{12})', arti_url[-12:])
    if not rgx:
        return False

    r = requests.get(arti_url, headers=headers)
    if not r.ok:
        return False

    try:
        data = json.loads(r.text.replace(remove, ''))
    except ValueError:
        # json.JSONDecodeError subclasses ValueError; a non-JSON body (for
        # example an HTML page) is simply "not an article".
        return False

    try:
        post_id = data["payload"]["value"]["id"]
    except (KeyError, TypeError):
        # TypeError covers a payload/value that is present but not a mapping.
        return False

    if post_id != rgx.group():
        return False
    return data["payload"]