-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathget_historic_likes.py
73 lines (64 loc) · 2.79 KB
/
get_historic_likes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# get_historic_likes.py
# Input: timemap.json, a timemap of a tweet (read from the current directory)
# Output: CSV rows (memento-datetime,likes) of historic tweet likes, printed to stdout; redirect to a file
# Lesley Frew
# October 5, 2022
# 1. Run memgator in JSON format to generate timemap.json
# memgator-windows-amd64 --format=JSON https://twitter.com/jk_rowling/status/1207646162813100033 > timemap.json
# 2. Run get_historic_likes.py and redirect to a file
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import pandas as pd
# Seconds allowed for connecting to an archive and for each wait on response
# data. Without a timeout, requests.get() can block indefinitely on a stalled
# archive and the whole run would hang on a single memento.
REQUEST_TIMEOUT = 60


def _attr(node, key):
    """Return the value of attribute ``key`` on a parsed node, or None.

    Entries of ``Tag.contents`` can be text nodes as well as tags. Nodes that
    do not expose ``attrs`` are reported as having no attributes instead of
    raising AttributeError, so stray text between elements cannot abort the
    scrape.
    """
    return (getattr(node, 'attrs', None) or {}).get(key)


# The memento records are read from row 'list' of the 'mementos' column.
jso = pd.read_json('timemap.json')
print('memento-datetime,likes')
# Walk the timemap's memento list back to front.
for memento in reversed(jso.loc['list']['mementos']):
    urim = memento['uri']
    # Need the tweet id for 2022 layout scraping: the run of digits that
    # follows the last "/status/" in the memento URI.
    tweet_id = re.sub(r"[^0-9].*$", "", re.sub(r".*/status/", "", urim))

    # Get the HTTP response for each memento. Mementos that cannot be fetched
    # (connection error, timeout, bad URL, ...) are skipped; only requests'
    # own errors are caught so that Ctrl-C still stops the run.
    try:
        resp = requests.get(urim, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        continue
    if 'memento-datetime' not in resp.headers:
        continue

    # Re-format memento-datetime - code from Shawn Jones/Hypercane
    mdt_date = datetime.strptime(resp.headers['memento-datetime'],
                                 "%a, %d %b %Y %H:%M:%S GMT")
    mdt = mdt_date.strftime('%Y-%m-%dT%H:%M:%SZ')

    # Get the HTML tree for each memento
    soup = BeautifulSoup(resp.text, features="lxml")

    # Try both the 2022 layout and pre-2022 layout to scrape likes
    found_count = False
    for div in soup.find_all('div'):
        # 2022 layout: an InteractionCounter <div> whose first four children
        # are, in order: the interaction type, its label, a value containing
        # the tweet id, and the count itself.
        if _attr(div, 'itemtype') != 'https://schema.org/InteractionCounter':
            continue
        if len(div.contents) <= 3:
            continue
        kind, label, target, count = div.contents[:4]
        if getattr(kind, 'name', None) != 'meta':
            continue
        if _attr(kind, 'content') != 'https://schema.org/LikeAction':
            continue
        if _attr(label, 'content') != 'Likes':
            continue
        # Only accept the counter that belongs to the tweet being tracked.
        target_content = _attr(target, 'content')
        if target_content is None or tweet_id not in target_content:
            continue
        like_count = _attr(count, 'content')
        if like_count is None:
            continue
        print(mdt + "," + like_count)
        found_count = True

    if not found_count:
        for ul in soup.find_all('ul'):
            # Pre-2022 layout - English only
            if _attr(ul, 'aria-label') != 'Retweeted and favorited by':
                continue
            for li in ul.contents:
                li2 = str(li).replace("\n", "")
                if 'likes' in li2 and 'data-tweet-stat-count' in li2:
                    # This did not parse into beautifulsoup as its own element
                    # Remove everything but the numeric like value
                    dtsc = re.sub(r'.*data-tweet-stat-count="', "", li2)
                    dtsc = re.sub(r"[^0-9].*$", "", dtsc)
                    print(mdt + "," + dtsc)
                    found_count = True