Skip to content

Commit

Permalink
Added code for bypassing small images
Browse files Browse the repository at this point in the history
  • Loading branch information
Ansh5461 committed May 3, 2024
1 parent 975f7e2 commit 4256fa3
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
20 changes: 20 additions & 0 deletions querent/ingestors/pdfs/pdf_ingestor_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,11 @@ async def extract_img(self, doc, file_path, data, doc_source):
if img: # If image extraction was successful
image_data = img["image"]
image_ext = img["ext"]

image_status, aspect_ratio = await self.analyze_image(image_data)
if image_status != "Image accepted":
continue

ocr_text = await self.get_ocr_from_image(image=img["image"])

# Adjust page_num for 0-indexed access and check if it's within range
Expand All @@ -180,6 +185,21 @@ async def extract_img(self, doc, file_path, data, doc_source):
doc_source=doc_source,
)

async def analyze_image(self, image_bytes):
    """Decide whether an extracted image is large enough to be processed.

    Args:
        image_bytes: Encoded image data, read through an in-memory buffer
            by ``Image.open`` (presumably Pillow's ``Image`` -- confirm
            against the file's imports).

    Returns:
        A ``(status, aspect_ratio)`` tuple. ``status`` is either
        ``"Image accepted"`` or ``"Image too small"``; ``aspect_ratio`` is
        ``width / height``, or ``0.0`` for an image with no area.
    """
    min_dimension = 100

    # Only the header-level size is needed, so release the image object as
    # soon as it has been read instead of leaving it open.
    with Image.open(io.BytesIO(image_bytes)) as image:
        width, height = image.size

    # A zero-area image has nothing to analyse and would otherwise make the
    # aspect-ratio division below raise ZeroDivisionError.
    if width <= 0 or height <= 0:
        return "Image too small", 0.0

    aspect_ratio = width / height

    # Rejected only when BOTH sides are under the threshold: a long, thin
    # image with a single side >= min_dimension is still accepted.
    if width < min_dimension and height < min_dimension:
        return "Image too small", aspect_ratio
    return "Image accepted", aspect_ratio

async def get_ocr_from_image(self, image):
"""Implement this to return ocr text of the image"""
try:
Expand Down
2 changes: 1 addition & 1 deletion tests/workflows/openai_ingested_images_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
# # host="localhost",
# # port="5432")
# # # ml_conn = MilvusDBConnection()
# directories = [ "/home/ansh/pyg-trail/testing-ocr"]
# directories = [ "/home/ansh/pyg-trail/testing-aspect"]
# collectors = [
# FSCollectorFactory().resolve(
# Uri("file://" + str(Path(directory).resolve())),
Expand Down

0 comments on commit 4256fa3

Please sign in to comment.