-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_extract_images.py
40 lines (35 loc) · 1.23 KB
/
pdf_extract_images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
from PIL import Image
import fitz # pip install PyMuPDF
import io
def _image_output_dir(location, pdffile):
    """Return the folder under /tmp/images that receives one PDF's images."""
    # Drop any drive letter and normalise separators, then discard empty,
    # '.' and '..' components.  Without this an absolute ``location`` would
    # make os.path.join() throw away the '/tmp/images' prefix, and '..'
    # could point outside of it.
    tail = os.path.splitdrive(location)[1]
    if os.altsep:
        tail = tail.replace(os.altsep, os.sep)
    parts = [part for part in tail.split(os.sep) if part not in ("", ".", "..")]
    # splitext removes only the final extension, so dots elsewhere in the
    # file name or in directory names are preserved.
    stem = os.path.splitext(pdffile)[0]
    return os.path.join("/tmp/images", *parts, stem)


def extimg(location, pdffile):
    """Extract every embedded image of one PDF into a folder under /tmp/images.

    The images of ``<location>/<pdffile>`` are written to
    ``/tmp/images/<location>/<pdf name without extension>/`` and named
    ``<page>_<index>.<ext>``: ``page`` is the 1-based page number, ``index``
    the 0-based position of the image in that page's image list and ``ext``
    the extension reported by PyMuPDF for the image.  Every output path is
    printed before the file is written.  The output folder is only created
    when the PDF actually contains at least one image.

    Args:
        location: Directory that contains the PDF.
        pdffile: File name of the PDF inside ``location``.
    """
    out_dir = _image_output_dir(location, pdffile)
    pf = fitz.open(os.path.join(location, pdffile))
    try:
        # Newer PyMuPDF releases expose snake_case method names; fall back to
        # the camelCase spelling this script originally used on old releases.
        extract = getattr(pf, "extract_image", None) or pf.extractImage
        for pi in range(len(pf)):
            page = pf[pi]
            list_images = getattr(page, "get_images", None) or page.getImageList
            iml = list_images()
            if not iml:
                continue
            os.makedirs(out_dir, exist_ok=True)
            for i, entry in enumerate(iml):
                # entry[0] is the image's xref, the handle the extractor takes.
                im = extract(entry[0])
                image_ext = im["ext"]
                fn = os.path.join(out_dir, f"{pi + 1}_{i}.{image_ext}")
                print(fn)
                # Write the extracted bytes untouched: they are already
                # encoded in the format named by ``ext``, so re-encoding
                # them would only risk quality loss or an unsupported format.
                with open(fn, "wb") as w:
                    w.write(im["image"])
    finally:
        # Always release the document, even if extraction fails midway.
        pf.close()
def map_files(path = '.'):
    """Run ``extimg`` on every PDF found in the directory tree below ``path``.

    The tree is traversed once with ``os.walk``.  Directories named ``venv``
    or ``images`` are skipped entirely (neither they nor anything below them
    is visited).  A file counts as a PDF when its name ends in ``.pdf``,
    compared case-insensitively.

    Args:
        path: Root directory of the search; defaults to the current
            working directory.
    """
    skipped_dirs = ("venv", "images")
    for root, dirs, files in os.walk(path):
        # os.walk already recurses on its own.  Pruning ``dirs`` in place is
        # the documented way to keep it out of a subtree, so no explicit
        # recursive call is needed (one would process subtrees repeatedly).
        dirs[:] = [name for name in dirs if name not in skipped_dirs]
        for name in files:
            # Match on the extension only; a substring test would also pick
            # up names such as 'notes.pdf.bak'.
            if name.lower().endswith(".pdf"):
                extimg(root, name)
# Script entry point: scan the tree rooted at the current working directory
# (map_files' default path is '.').
# NOTE(review): this call is not behind an `if __name__ == "__main__":` guard,
# so it also runs whenever this module is imported.
map_files()