This repository has been archived by the owner on Nov 29, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Modules.py
59 lines (49 loc) · 1.61 KB
/
Modules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# NeurIPS webpages that contain paper proceedings from 1987 - 2018 are in the exact same format. This is the scraper.
import PyPDF4
from PyPDF4 import PdfFileReader
from bs4 import BeautifulSoup
import requests
import io
import csv
from ResearchPaper import ResearchPaper
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
class modules:
staticmethod
def get_pdf_from_fp(f, remove_newline=True):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = "utf-8"
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos = set()
for page in PDFPage.get_pages(
f,
pagenos,
maxpages=maxpages,
password=password,
caching=caching,
check_extractable=True,
):
interpreter.process_page(page)
text = retstr.getvalue()
f.close()
device.close()
retstr.close()
# TODO
text = text.replace("\0", "")
if remove_newline:
text = text.replace("\n", " ") # removes newlines
return text
# TODO doesn't work for those autodownload bad boys
def pdf_string_from_url(pdf_url):
r = requests.get(pdf_url)
f = io.BytesIO(r.content)
return modules.get_pdf_from_fp(f)