From 4256fa3edafc4d3bf8d286338154a7a4eb067c5f Mon Sep 17 00:00:00 2001
From: Ansh5461 <anshjoshi0607@gmail.com>
Date: Fri, 3 May 2024 23:39:55 +0530
Subject: [PATCH] Added code for bypassing small images

---
 querent/ingestors/pdfs/pdf_ingestor_v1.py     | 20 +++++++++++++++++++
 .../workflows/openai_ingested_images_test.py  |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/querent/ingestors/pdfs/pdf_ingestor_v1.py b/querent/ingestors/pdfs/pdf_ingestor_v1.py
index 32c84ce9..958785de 100644
--- a/querent/ingestors/pdfs/pdf_ingestor_v1.py
+++ b/querent/ingestors/pdfs/pdf_ingestor_v1.py
@@ -160,6 +160,11 @@ async def extract_img(self, doc, file_path, data, doc_source):
             if img:  # If image extraction was successful
                 image_data = img["image"]
                 image_ext = img["ext"]
+
+                image_status, aspect_ratio = await self.analyze_image(image_data)
+                if image_status != "Image accepted":
+                    continue
+
                 ocr_text = await self.get_ocr_from_image(image=img["image"])
 
                 # Adjust page_num for 0-indexed access and check if it's within range
@@ -180,6 +185,21 @@ async def extract_img(self, doc, file_path, data, doc_source):
                     doc_source=doc_source,
                 )
 
+    async def analyze_image(self, image_bytes, min_dimension=100):
+        """Report whether an encoded image is large enough to keep.
+
+        Returns ``(status, aspect_ratio)``: status is "Image too small" only
+        when BOTH width and height are under ``min_dimension`` pixels, else
+        "Image accepted"; aspect_ratio is width / height.
+        """
+        # Context manager closes the opened image once its size is read.
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            width, height = image.size
+        aspect_ratio = width / height
+        if width < min_dimension and height < min_dimension:
+            return "Image too small", aspect_ratio
+        return "Image accepted", aspect_ratio
+
     async def get_ocr_from_image(self, image):
         """Implement this to return ocr text of the image"""
         try:
diff --git a/tests/workflows/openai_ingested_images_test.py b/tests/workflows/openai_ingested_images_test.py
index 7304fa03..c264d2dd 100644
--- a/tests/workflows/openai_ingested_images_test.py
+++ b/tests/workflows/openai_ingested_images_test.py
@@ -32,7 +32,7 @@
 #     #         host="localhost", 
 #     #         port="5432")
 #     # # ml_conn = MilvusDBConnection()
-#     directories = [ "/home/ansh/pyg-trail/testing-ocr"]
+#     directories = [ "/home/ansh/pyg-trail/testing-aspect"]
 #     collectors = [
 #         FSCollectorFactory().resolve(
 #             Uri("file://" + str(Path(directory).resolve())),