-
Notifications
You must be signed in to change notification settings - Fork 0
/
alt_scrape.py
64 lines (53 loc) · 1.6 KB
/
alt_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import requests
from bs4 import BeautifulSoup as bs
import time
def scrape_handles(iop, pages=5, timeout=30):
    """Collect the Twitter handles listed under an Altmetric details URL.

    Requests ``iop + "twitter/page:<n>"`` for ``n = 1 .. pages``, waiting
    five seconds between consecutive requests, then parses every response
    with BeautifulSoup (lxml parser) and gathers the ``.string`` of each
    element matched by the CSS selector ``.handle``.

    Args:
        iop: Base URL of the details page. The page path is appended
            verbatim, so the URL should end with a trailing slash.
        pages: Number of result pages to request (default 5, the value
            that used to be hard-coded).
        timeout: Seconds to wait for each HTTP response before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        A list of unique handle strings, in the order first encountered.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (including ``Timeout`` when the server does
            not answer within ``timeout`` seconds).
    """
    print("scraping IOP for twitter handles...")

    # Download every page up front; parsing happens afterwards.
    page_texts = []
    for n in range(1, pages + 1):
        if n != 1:
            time.sleep(5)  # let's be polite
        print(f"page {n}")
        response = requests.get(iop + f"twitter/page:{n}", timeout=timeout)
        page_texts.append(response.text)

    handles = []
    for text in page_texts:
        soup = bs(text, "lxml")
        # ".handle" is a class selector: elements with class="handle".
        for tag in soup.select(".handle"):
            # .string is None when the tag has more than one child node;
            # such entries carry no usable handle, so skip them.
            if tag.string is not None:
                handles.append(tag.string)

    # Deduplicate while keeping first-seen order (set() order is arbitrary).
    output = list(dict.fromkeys(handles))
    print("-" * 80)
    return output
def scrape_links(iop, pages=5, timeout=30):
    """Collect the tweet links listed under an Altmetric details URL.

    Requests ``iop + "twitter/page:<n>"`` for ``n = 1 .. pages``, waiting
    five seconds between consecutive requests, then parses every response
    with BeautifulSoup (lxml parser) and gathers the ``href`` of each
    ``<a>`` found inside a ``<time>`` element.

    Args:
        iop: Base URL of the details page. The page path is appended
            verbatim, so the URL should end with a trailing slash.
        pages: Number of result pages to request (default 5, the value
            that used to be hard-coded).
        timeout: Seconds to wait for each HTTP response before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        A list of unique ``href`` values, in the order first encountered.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (including ``Timeout`` when the server does
            not answer within ``timeout`` seconds).
    """
    print("scraping IOP for links to tweets...")

    # Download every page up front; parsing happens afterwards.
    page_texts = []
    for n in range(1, pages + 1):
        if n != 1:
            time.sleep(5)  # let's be polite
        print(f"page {n}")
        response = requests.get(iop + f"twitter/page:{n}", timeout=timeout)
        page_texts.append(response.text)

    hrefs = []
    for text in page_texts:
        soup = bs(text, "lxml")
        for time_tag in soup.select("time"):  # every <time> element
            for anchor in time_tag.select("a"):  # links nested inside it
                href = anchor.get("href")
                # .get() yields None for an <a> without an href attribute;
                # there is no link target to report in that case.
                if href is not None:
                    hrefs.append(href)

    # Deduplicate while keeping first-seen order (set() order is arbitrary).
    output = list(dict.fromkeys(hrefs))
    print("-" * 80)
    return output
if __name__ == "__main__":
    # Altmetric details URL whose Twitter listing is scraped.
    iop = "https://iop.altmetric.com/details/5152221/"

    # Print the handles first, then the tweet links, one entry per line.
    for handle in scrape_handles(iop):
        print(handle)
    for href in scrape_links(iop):
        print(href)