-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_extract_imgs.py
40 lines (29 loc) · 1.03 KB
/
pdf_extract_imgs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#pip install PyMuPDF
import fitz # PyMuPDF
import os
# Extract every embedded image from a PDF and save each one to disk as
# page_<page number>_<image number on that page>.<native extension>.

# Path to the PDF file whose embedded images are extracted.
pdf_path = '/home/soyrl/pdf_birds.pdf'
# Directory that receives the extracted image files.
output_dir = '/home/soyrl/pdf_saves_new'

# exist_ok=True creates the directory when missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# The context manager closes the document even if extraction or a file
# write raises part-way through.
with fitz.open(pdf_path) as doc:
    # Pages are numbered from 1 in the output file names.
    for page_num, page in enumerate(doc, start=1):
        # Image numbering restarts at 1 on every page.
        for image_counter, img in enumerate(page.get_images(full=True), start=1):
            xref = img[0]  # first item of each entry is the image's xref
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # extract_image() returns the bytes in the image's stored format
            # (often JPEG), so name the file with the extension PyMuPDF
            # reports instead of always claiming ".png".
            image_ext = base_image["ext"]
            image_filename = f"page_{page_num}_{image_counter}.{image_ext}"
            image_save_path = os.path.join(output_dir, image_filename)

            with open(image_save_path, "wb") as image_file:
                image_file.write(image_bytes)