# NeurIPS proceedings pages from 1987 - 2018 share one format; the 2019
# accepted-papers page is laid out differently, so it gets its own scraper.
from PyPDF4 import PdfFileReader
from bs4 import BeautifulSoup
import requests
import io
import csv
from ResearchPaper import ResearchPaper
from Modules import modules
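# ResearchPaper (imported above) is assumed to be a plain record holding the
# seven constructor arguments used below - a sketch of that assumption, not the
# repository's actual definition:
#
#     class ResearchPaper:
#         def __init__(self, unique_id, year, title, pdf_url, authors,
#                      paper_text, abstract):
#             self.unique_id = unique_id
#             self.year = year
#             self.title = title
#             self.pdf_url = pdf_url
#             self.authors = authors
#             self.paper_text = paper_text
#             self.abstract = abstract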
class NeurIPS2019Scraper:
    @staticmethod
    def scrape_neurips_2019(url, year):
        # To scrape the live page instead of a saved copy, use:
        #   result = requests.get(url)
        #   soup = BeautifulSoup(result.content, "lxml")
        with open("./NeurIPS/neurips_2019_src.html") as src:
            soup = BeautifulSoup(src, "html.parser")
        accepted_papers = soup.find("h4").parent
        papers = accepted_papers.find_all("p")
        research_paper_object_list = []
        author_to_affil = {}
        # iterate through all paper entries and create ResearchPaper objects
        for idx, paper in enumerate(
            papers[2:]
        ):  # skip the first two <p> tags - they are instructional, not papers
            title = paper.find("b").text
            # Authors and affiliations render as "Name (Affil) · Name (Affil) · ..."
            authors_and_affils = paper.find("i").text.split(" · ")
            authors = []
            affils = []
            for author_and_affil in authors_and_affils:
                author_str, affil_str = author_and_affil.split(" (", 1)
                affil_str = affil_str[:-1]  # drop the trailing ")"
                authors.append(author_str)
                affils.append(affil_str)
                # assumes one affiliation per author; a repeat author is overwritten
                author_to_affil[author_str] = affil_str
            assert len(authors) == len(affils)
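            # Worked example (names illustrative, not from the page): an <i> string
            #   "Ada Lovelace (Analytical Engines) · Alan Turing (NPL)"
            # parses to
            #   authors = ["Ada Lovelace", "Alan Turing"]
            #   affils  = ["Analytical Engines", "NPL"]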
            # PDF-derived fields are not extracted for 2019; record placeholders
            abstract = "<none>"
            paper_text = "<none>"
            pdf_url = "<none>"
            unique_id = idx
            this_paper = ResearchPaper(
                unique_id, year, title, pdf_url, authors, paper_text, abstract
            )
            research_paper_object_list.append(this_paper)
            print("{} / {} ingested".format(idx + 1, len(papers) - 2))
        # write it all to CSV when everything is ingested;
        # newline="" keeps csv.writer from emitting blank rows on Windows
        with open("neurips" + str(year) + ".csv", "w", newline="") as new_csv:
            writer = csv.writer(new_csv)
            header = [
                "unique_id",
                "year",
                "title",
                "authors",
                "pdf_url",
                "paper_text",
                "abstract",
            ]
            writer.writerow(header)
            for paper in research_paper_object_list:
                row = [
                    paper.unique_id,
                    paper.year,
                    paper.title,
                    paper.authors,
                    paper.pdf_url,
                    paper.paper_text,
                    paper.abstract,
                ]
                writer.writerow(row)
with open("author_affiliations.csv", "w") as new_csv:
writer = csv.writer(new_csv)
header = ["author_name", "neurips_2019_affil"]
writer.writerow(header)
for author, affil in author_to_affil.items():
row = [author, affil]
writer.writerow(row)
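

# Minimal usage sketch, assuming the saved page copy exists at
# ./NeurIPS/neurips_2019_src.html. The url argument is only consulted once the
# commented-out requests path above is enabled; the URL below is illustrative.
if __name__ == "__main__":
    NeurIPS2019Scraper.scrape_neurips_2019(
        "https://neurips.cc/Conferences/2019", 2019
    )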