-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils_pdf.py
44 lines (38 loc) · 1.11 KB
/
utils_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from PyPDF2 import PdfFileReader
import pdfplumber
def get_pdf_info(path):
    """Return the document-information properties of the PDF at *path*.

    The file is opened in binary mode and handed to ``PdfFileReader``.
    When the reader yields a truthy document-info object, the result maps
    ``'author'``, ``'creator'``, ``'producer'``, ``'subject'`` and
    ``'title'`` to the attribute of the same name on that object.
    Otherwise an empty dict is returned.
    """
    # Core properties copied from the document-info object, in this order.
    property_names = ('author', 'creator', 'producer', 'subject', 'title')
    with open(path, 'rb') as stream:
        doc_info = PdfFileReader(stream).getDocumentInfo()
        if not doc_info:
            # No usable metadata: hand back an empty mapping.
            return {}
        return {name: getattr(doc_info, name) for name in property_names}
"""
def get_pdf_text(path):
data = []
with open(path, 'rb') as f:
pdf = PdfFileReader(f)
try:
for i in range(0,pdf.getNumPages()):
pdf_page = pdf.getPage(0)
strs = pdf_page.extractText()
if strs:
data.append(strs)
except Exception as ex:
print("ERROR: {}\n\t{}".format(path, ex))
return data
"""
def get_pdf_text(path):
    """Extract the text of every page of the PDF at *path*.

    The document is opened with ``pdfplumber.open``. The returned list has
    one entry per page, in page order, holding whatever ``extract_text()``
    gives for that page. Entries are not filtered, so a list index
    corresponds to the page's position in the document.
    """
    with pdfplumber.open(path) as document:
        return [page.extract_text() for page in document.pages]