-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathweb_scraping.py
48 lines (46 loc) · 2.04 KB
/
web_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from bs4 import BeautifulSoup
import requests, lxml, os, json
import numpy
# Scrape Google Scholar search results page by page and save them as JSON.
#
# Every page is requested with an increasing "start" offset, each ".gs_ri"
# result container is parsed into a dict, and the collected list is written
# to Google_scholar_500_publications_run1.json once all pages are processed.

SCHOLAR_URL = "https://scholar.google.com"

# The query text and the request headers never change between pages, so they
# are built once instead of on every loop iteration.
QUERY = 'biodivers* OR "genetic diversity" OR "*omic diversity" OR "phylogenetic diversity" OR "population diversity" OR "species diversity" OR "ecosystem diversity" OR "functional diversity" OR "microbial diversity" OR "soil diversity" AND "Deep Learning"'
headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}


def _scholar_link(href):
    """Prefix a site-relative *href* with the Scholar host.

    Returns None when *href* is None so that a missing link is stored as a
    JSON null instead of the bogus string "https://scholar.google.comNone".
    """
    if href is None:
        return None
    return f"{SCHOLAR_URL}{href}"


def _parse_result(result):
    """Convert one ".gs_ri" result element into a record dict.

    Args:
        result: element returned by ``soup.select(".gs_ri")``.

    Returns:
        dict with the keys title, title_link, publication_info, snippet,
        cited_by, related_articles and all_article_versions.

    Raises:
        AttributeError, TypeError, KeyError: a required element (or its
            "href" attribute) is absent from this result.
    """
    title = result.select_one(".gs_rt").text
    title_link = result.select_one(".gs_rt a")["href"]
    publication_info = result.select_one(".gs_a").text
    snippet = result.select_one(".gs_rs").text
    cited_by = result.select_one("#gs_res_ccl_mid .gs_nph+ a")["href"]
    related_articles = result.select_one("a:nth-child(4)")["href"]

    # The "all versions" link is optional: a result without it is still kept,
    # with the field left as None.
    versions_tag = result.select_one("a~ a+ .gs_nph")
    all_article_versions = None if versions_tag is None else versions_tag.get("href")

    return {
        "title": title,
        "title_link": title_link,
        "publication_info": publication_info,
        "snippet": snippet,
        "cited_by": _scholar_link(cited_by),
        "related_articles": _scholar_link(related_articles),
        "all_article_versions": _scholar_link(all_article_versions),
    }


data = []
# NOTE(review): offsets 0..500 request 51 pages, while the output file name
# mentions 500 publications - confirm the intended upper bound.
for x in range(0, 510, 10):
    params = {
        "q": QUERY,
        "hl": "en",
        "as_ylo": 2015,
        "as_yhi": 2021,
        "start": str(x),
    }
    html = requests.get(
        f"{SCHOLAR_URL}/scholar", headers=headers, params=params, timeout=30
    ).text
    soup = BeautifulSoup(html, "lxml")
    for result in soup.select(".gs_ri"):
        try:
            record = _parse_result(result)
        except (AttributeError, TypeError, KeyError):
            # Best effort: a result lacking any required field is skipped.
            # Only lookup failures are caught, so Ctrl-C and real bugs are no
            # longer swallowed the way a bare ``except`` would.
            continue
        data.append(record)

with open("Google_scholar_500_publications_run1.json", "w", encoding="utf-8") as outfile:
    json.dump(data, outfile)