-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathextract_pdf_attachments.py
36 lines (31 loc) · 1.18 KB
/
extract_pdf_attachments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/python3
# very slightly modified version of https://gist.github.com/kevinl95/29a9e18d474eb6e23372074deff2df38 with cmd arguments and by default no output
import PyPDF2
import sys
from pathlib import Path
def getAttachments(reader):
"""
Retrieves the file attachments of the PDF as a dictionary of file names
and the file data as a bytestring.
:return: dictionary of filenames and bytestrings
"""
catalog = reader.trailer["/Root"]
fileNames = catalog["/Names"]["/EmbeddedFiles"]["/Kids"][0].getObject()["/Names"]
attachments = {}
for f in fileNames:
if isinstance(f, str):
name = f
dataIndex = fileNames.index(f) + 1
fDict = fileNames[dataIndex].getObject()
fData = fDict["/EF"]["/F"].getData()
attachments[name] = fData
return attachments
handler = open(sys.argv[1], "rb")
reader = PyPDF2.PdfFileReader(handler)
dictionary = getAttachments(reader)
# print(dictionary)
for fName, fData in dictionary.items():
path = Path.cwd() / ("." + str(Path(fName).resolve()))
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "wb") as outfile:
outfile.write(fData)