-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathingest.py
84 lines (67 loc) · 2.67 KB
/
ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv(usecwd=True))
import os
from pathlib import Path
import pypdfium2 as pdfium
from langchain_chroma import Chroma
from langchain_experimental.open_clip import OpenCLIPEmbeddings
def get_images_from_pdf(pdf_path, img_dump_path):
    """
    Render each page of a PDF document and save it as a JPEG file.

    Pages are written as ``img_<page number>.jpg`` inside ``img_dump_path``,
    with page numbers starting at 1.

    :param pdf_path: A string representing the path to the PDF file.
    :param img_dump_path: A string representing the directory to dump images into;
        it is created if it does not exist.
    """
    os.makedirs(img_dump_path, exist_ok=True)
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        for page_number in range(len(pdf)):
            page = pdf.get_page(page_number)
            bitmap = page.render(scale=1, rotation=0, crop=(0, 0, 0, 0))
            pil_image = bitmap.to_pil()
            # os.path.join works whether or not img_dump_path ends with a separator.
            pil_image.save(
                os.path.join(img_dump_path, f"img_{page_number + 1}.jpg"),
                format="JPEG",
            )
    finally:
        # Release the document handle even if rendering or saving fails.
        pdf.close()
def get_images_from_pdf_2(pdf_dir, img_dump_path):
    """
    Render every page of every PDF in a directory and save each page as a JPEG file.

    Files are written as ``<pdf name without extension>_img_<page number>.jpg``
    inside ``img_dump_path``, with page numbers starting at 1. Directory entries
    that are not regular files ending in ``.pdf`` (case-insensitive) are skipped.

    :param pdf_dir: A string representing the path to the directory of PDF files.
    :param img_dump_path: A string representing the directory to dump images into;
        it is created if it does not exist.
    """
    os.makedirs(img_dump_path, exist_ok=True)
    # Sort for a deterministic processing order; os.listdir order is arbitrary.
    for pdf_file in sorted(os.listdir(pdf_dir)):
        stem, extension = os.path.splitext(pdf_file)
        # os.path.join works whether or not pdf_dir ends with a separator.
        pdf_path = os.path.join(pdf_dir, pdf_file)
        if extension.lower() != ".pdf" or not os.path.isfile(pdf_path):
            continue
        pdf = pdfium.PdfDocument(pdf_path)
        try:
            for page_number in range(len(pdf)):
                page = pdf.get_page(page_number)
                bitmap = page.render(scale=1, rotation=0, crop=(0, 0, 0, 0))
                pil_image = bitmap.to_pil()
                pil_image.save(
                    os.path.join(img_dump_path, f"{stem}_img_{page_number + 1}.jpg"),
                    format="JPEG",
                )
        finally:
            # Release the document handle before moving on to the next file.
            pdf.close()
# Render every page of every PDF under ./slides/ to a JPEG under ./images/.
# NOTE: both directories are resolved against the current working directory.
doc_path = "./slides/"
img_dump_path = "./images/"

print("pdf index")
get_images_from_pdf_2(doc_path, img_dump_path)
print("done")

# The Chroma store lives next to this script, independent of the working directory.
vectorstore = Path(__file__).parent / "chroma_db_multi_modal"

# Load embedding function
print("Loading embedding function")
embedding = OpenCLIPEmbeddings(model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k")

# Create chroma
vectorstore_mmembd = Chroma(
    collection_name="multi-modal-rag",
    persist_directory=str(vectorstore),
    embedding_function=embedding,
)

# Get image URIs (sorted so images are added in a stable order)
image_uris = sorted(
    os.path.join(img_dump_path, image_name)
    for image_name in os.listdir(img_dump_path)
    if image_name.endswith(".jpg")
)

# Add images
print("Embedding images")
vectorstore_mmembd.add_images(uris=image_uris)