# Scraper for ICLR 2013/2014 listings on OpenReview, e.g.
# https://openreview.net/group?id=ICLR.cc/2017/conference
# The RENDERED page source must be saved to ICLR/ICLR_<year>.html beforehand.
import csv

import requests
from bs4 import BeautifulSoup

from Modules import modules
from ResearchPaper import ResearchPaper


class ICLR20132014Scraper:
    @staticmethod
    def scrape_iclr_2013_2014(url, year):
        # `url` is unused; the listing is read from the locally saved,
        # rendered page source instead.
        assert year == 2013 or year == 2014
        research_paper_object_list = []
        soup = BeautifulSoup(open("./ICLR/ICLR_{}.html".format(year)), "html.parser")
        raw_papers_list = soup.find_all("div", class_="note panel")
        for idx, raw_paper in enumerate(raw_papers_list):
            title = raw_paper.find(class_="note_content_title").text
            meta_rows = raw_paper.find_all(class_="meta_row")
            authors = meta_rows[0].find_all("a")
            # decision = meta_rows[3]
            # if 'ICLR 2017 Conference Reject' in decision.text:  # TODO
            #     continue
            base_url = "https://openreview.net"
            pdf_row = raw_paper.find(class_="title_pdf_row clearfix")
            pdf_url = pdf_row.find("a", href=True, title="Download PDF")["href"]
            if "http" not in pdf_url:
                # Relative hrefs need the OpenReview host prepended.
                # TODO this is a problem for a few that have full urls...
                pdf_url = base_url + pdf_url
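            # pdf_url now takes one of two shapes: an OpenReview link with an
            # "id=" query parameter, or a direct (typically arXiv) paper link.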
            try:
                unique_id = pdf_url.split("id=")[1]
            except IndexError:
                unique_id = pdf_url.split("/")[-1].split(".pdf")[0]
            abstract_url = pdf_url
            abs_src = requests.get(abstract_url).content
            # Use a separate soup so the listing-page soup is not clobbered.
            abs_soup = BeautifulSoup(abs_src, "lxml")
            abstract = abs_soup.find("blockquote", class_="abstract").text.strip()
            if abstract.startswith("Abstract:"):
                abstract = abstract.split("Abstract:")[1].strip()
            abstract = abstract.replace("\n", "")
            authors = [a.text for a in authors]
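            # arXiv serves the PDF at the abstract URL with "/abs/" swapped
            # for "/pdf/" and a ".pdf" suffix appended.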
            pdf_url = pdf_url.replace("/abs/", "/pdf/")
            pdf_url += ".pdf"
            try:
                paper_text = modules.pdf_string_from_url(pdf_url)
                this_paper = ResearchPaper(
                    unique_id, year, title, pdf_url, authors, paper_text, abstract
                )
                research_paper_object_list.append(this_paper)
                print(
                    "{} / {} {}".format(idx + 1, len(raw_papers_list), this_paper.title)
                )
            except KeyboardInterrupt:
                raise  # let Ctrl-C abort the run instead of being swallowed below
            except Exception as e:
                print("**** PAPER WITH TITLE [{}] FAILED ****".format(title))
                print(e)
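        # Persist every successfully scraped paper to iclr<year>.csv.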
        with open("iclr" + str(year) + ".csv", "w", newline="") as new_csv:
            writer = csv.writer(new_csv)
            header = [
                "unique_id",
                "year",
                "title",
                "authors",
                "pdf_url",
                "paper_text",
                "abstract",
            ]
            writer.writerow(header)
            for paper in research_paper_object_list:
                row = [
                    paper.unique_id,
                    paper.year,
                    paper.title,
                    paper.authors,
                    paper.pdf_url,
                    paper.paper_text,
                    paper.abstract,
                ]
                writer.writerow(row)
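

# A minimal usage sketch, assuming the rendered listing pages have been saved
# to ./ICLR/ICLR_2013.html and ./ICLR/ICLR_2014.html as noted above. The URL
# argument is illustrative only, since the scraper reads the saved files.
if __name__ == "__main__":
    for year in (2013, 2014):
        ICLR20132014Scraper.scrape_iclr_2013_2014(
            "https://openreview.net/group?id=ICLR.cc/{}".format(year), year
        )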