scraper.py
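"""Simple arXiv scraper.

Collects the list of arXiv subject categories from the homepage and, for a
chosen category, the most recent submissions (title, authors, abstract link,
PDF link).
"""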
import requests
from bs4 import BeautifulSoup
from helper_functions import *
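# Note: helper_functions is not part of this file; info_print is assumed to be
# a decorator that prints an informational message before running the decorated
# function. A minimal sketch of such a decorator, under that assumption:
#
#     def info_print(message):
#         def decorator(func):
#             def wrapper(*args, **kwargs):
#                 print(f"[Info] {message}")
#                 return func(*args, **kwargs)
#             return wrapper
#         return decorator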
base_url = "https://arxiv.org"
categories = {}
@info_print("Fetching categories")
def get_categories():
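    """Scrape the arXiv homepage and build a dict mapping lower-cased category
    names to the relative URL of each category's "recent" listing."""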
    # Fetch the HTML of the arXiv homepage
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")
global categories
categories = {} # Dictionary storing category names and corresponding links
# Get all unordered lists of the content div
content_div = soup.find(id="content")
ul_elements = content_div.find_all("ul")
# Iterate over every ul and check for valid category links
for ul in ul_elements:
li_elements = ul.find_all("li")
for li in li_elements:
            cat_title_link = li.find("a", id=True)  # id=True because the "About arXiv" links don't have ids
if cat_title_link is not None:
categories[cat_title_link.text.lower()] = li.find("a", string="recent").get("href")
return categories
@info_print("Fetching recents")
def get_recent(category, num=5):
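    """Return up to `num` recent submissions for `category` as a list of dicts
    with title, author names/links, abstract link and PDF link.

    Assumes get_categories() has been called first to populate `categories`.
    """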
    # Fetch the HTML of the category's "recent" listing
    # (the href stored by get_categories already points at the recent page)
    response = requests.get(base_url + categories[category.lower()])
soup = BeautifulSoup(response.content, "html.parser")
    # Get the <dl> element containing the recent submissions
paper_div = soup.find("dl")
submissions_links = paper_div.find_all("dt")
submissions_content = paper_div.find_all("dd")
    title_replacements = [["Title: ", ""], ["\n", ""], ["  ", " "]]  # Clean titles: strip the "Title:" prefix, newlines, and double spaces
all_submissions = []
    # Fetch info for every submission
for i, submission in enumerate(submissions_content):
sub_dict = {}
# Get Title
try:
title = submission.find("div", class_="list-title mathjax").text
for old, new in title_replacements:
title = title.replace(old, new)
sub_dict["title"] = title
        except Exception:
            print("[Error] Could not get title of paper")
            continue  # Don't add paper if the title can't be fetched
# Get Authors
try:
authors_div = submission.find("div", class_="list-authors")
authors_names = [link.text for link in authors_div.find_all("a")]
authors_links = [base_url + link.get("href") for link in authors_div.find_all("a")]
sub_dict["author_names"] = authors_names
sub_dict["author_links"] = authors_links
        except Exception:
            print("[Error] Could not get authors of paper")
            continue  # Don't add paper if the authors can't be fetched
# Get Links
try:
paper_link = submissions_links[i].find("a", title="Abstract").get("href")
sub_dict["paper_link"] = base_url + paper_link
        except Exception:
print("[Error] Could not get arxiv link of paper")
try:
pdf_link = submissions_links[i].find("a", title="Download PDF").get("href")
sub_dict["pdf_link"] = base_url + pdf_link
        except Exception:
print("[Error] Could not get pdf link of paper")
# Add fetched info to return list
all_submissions.append(sub_dict)
# Break loop if desired number of papers has been reached
if len(all_submissions) >= num:
break
return all_submissions
if __name__ == "__main__":
cats = get_categories()
print(cats)
recents = get_recent('astrophysics', num=5)
print(recents)
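# For reference, each entry returned by get_recent() is a dict with (at most)
# the keys "title", "author_names", "author_links", "paper_link" and "pdf_link".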