forked from shane-et-al/cville29
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcville29.py
94 lines (77 loc) · 3.34 KB
/
cville29.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import requests
from bs4 import BeautifulSoup
import string
import json
import re
def cleanup(s):
    """Normalize a scraped name or dish label for use as a lookup key.

    Curly apostrophes are replaced with straight ones, then surrounding
    whitespace and ASCII punctuation are removed from both ends.

    Args:
        s: Raw text taken from the page.

    Returns:
        The cleaned string (possibly empty).
    """
    trimmed = s.replace("’", "'").strip().strip(string.punctuation)
    # Removing punctuation can expose whitespace that sat between the text and
    # the punctuation (e.g. "Tavola ." -> "Tavola "); strip once more so such
    # variants collapse to the same key.
    return trimmed.strip()
# Dated article permalinks, e.g. charlottesville29.com/2020/01/31/<slug>/.
# Compiled once (raw string, dots escaped) instead of on every loop iteration.
_ARTICLE_HREF = re.compile(r"charlottesville29\.com/[0-9]{4}/[0-9]{2}/[0-9]{2}/")


def find_article(element):
    """Find the link to the original article that precedes ``element``.

    Looks at up to the seven closest ``<p>`` tags before ``element`` and
    returns the first dated-article anchor found in a paragraph that has no
    text of its own (i.e. the paragraph only wraps other tags).

    Args:
        element: A BeautifulSoup element for one entry on the page.

    Returns:
        The matching ``<a>`` tag, or ``None`` when no such link is found.
    """
    # limit=7 stops the backwards walk early; only the seven nearest
    # paragraphs are ever examined.
    for paragraph in element.find_all_previous("p", limit=7):
        anchor = paragraph.find("a", href=_ARTICLE_HREF)
        if anchor is None:
            continue
        # Skip paragraphs that carry direct text: a dated link inside running
        # prose is not treated as the article heading.
        if not paragraph.find_all(string=True, recursive=False):
            return anchor
    # If we can't find it, we can't find it. There are some egregious HTML
    # errors in here.
    return None
# Page to scrape.
url = "https://charlottesville29.com/five-finds-on-friday/"
# Use a timeout so a stalled connection cannot hang the script, and fail
# loudly on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, features="html.parser")
# styling for entries is *inconsistent*, so we'll use this regex to catch (hopefully) all of them
entries = soup.find_all('p', attrs={'style': re.compile("padding-left: [2-9]0px;")})

# Per-restaurant tallies, all keyed by normalized restaurant name.
restaurantcount = {}  # name -> number of times picked
restauranturl = {}    # name -> first link seen for that restaurant
restaurantrecs = {}   # name -> list of recommendation dicts
rnames = set()        # every normalized name encountered

# metadata.json must provide a "name_normalization" mapping of scraped name
# -> canonical name. Read it as UTF-8 regardless of the platform default.
with open("./metadata.json", "r", encoding="utf-8") as infile:
    METADATA = json.load(infile)
# Walk every candidate paragraph and tally the first linked restaurant in
# each numbered entry.
for element in entries:
    # Only numbered entries are of interest. Slicing instead of indexing keeps
    # an empty paragraph from raising IndexError.
    if not element.text[:1].isnumeric():
        continue

    article = find_article(element)
    if article:
        # Takes path components 3..5 of the link, which are YYYY/MM/DD when
        # the href is an absolute URL with a scheme (assumed; scheme-less
        # hrefs would shift the indices).
        date = "/".join(article["href"].split("/")[3:6])
        finder = article.text
    else:
        date = "unknown"
        finder = "unknown"

    # Limitations: Sometimes people recommend more than one restaurant; we'll only count the first one because it's harder to distinguish subsequent recommendations and regular links
    # Sometimes, there's no link (because there's no website, because of neglect, or because it's "wife's meatballs"), which we won't count at all.
    item = element.find('strong') or element.find('b')
    if not item:
        continue

    restaurant = item.find_next_sibling('a')
    if not restaurant:
        continue

    rname = cleanup(restaurant.text)
    # Map known spelling variants onto one canonical name.
    rname = METADATA["name_normalization"].get(rname, rname)
    rnames.add(rname)
    rurl = restaurant["href"]

    # Everything after the restaurant link is the recommendation blurb; drop
    # line breaks, a leading period and surrounding quotation marks.
    rec_text = "".join(s.text for s in restaurant.next_siblings)
    rec_text = rec_text.replace("\n", " ").strip().lstrip(".").strip()
    rec_text = rec_text.strip("“”\"")
    recommendation = {"item": cleanup(item.text), "date": date, "finder": finder, "recommendation": rec_text}

    restaurantcount[rname] = restaurantcount.get(rname, 0) + 1
    # Keep the first URL seen for a restaurant.
    restauranturl.setdefault(rname, rurl)
    restaurantrecs.setdefault(rname, []).append(recommendation)
# Rank restaurants by pick count, most-picked first. sorted() is stable, so
# ties keep the order in which the restaurants were first seen.
# NOTE: this rebinds restaurantcount from a dict to a list of (name, count).
restaurantcount = sorted(restaurantcount.items(), key=lambda x: x[1], reverse=True)

# One record per restaurant; a restaurant with no stored recommendations gets
# an empty list.
finds = [
    {
        "name": name,
        "url": restauranturl[name],
        "times picked": picks,
        "recommendations": restaurantrecs.get(name, []),
    }
    for name, picks in restaurantcount
]
with open("./finds.json", "w", encoding="utf-8") as out:
    json.dump(finds, out, indent=4, ensure_ascii=False)

# Alphabetical list of every restaurant name, one per line. Written as UTF-8
# to match finds.json, so non-ASCII names do not depend on the locale encoding.
rnames = sorted(rnames)
with open("rnames.txt", "w", encoding="utf-8") as out:
    out.writelines(n + "\n" for n in rnames)