Added code for bypassing small images

Querent-ai · May 3, 2024 · 4256fa3 · 4256fa3
1 parent 975f7e2
commit 4256fa3
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 1 deletion.
diff --git a/querent/ingestors/pdfs/pdf_ingestor_v1.py b/querent/ingestors/pdfs/pdf_ingestor_v1.py
@@ -160,6 +160,11 @@ async def extract_img(self, doc, file_path, data, doc_source):
             if img:  # If image extraction was successful
                 image_data = img["image"]
                 image_ext = img["ext"]
+
+                image_status, aspect_ratio = await self.analyze_image(image_data)
+                if image_status != "Image accepted":
+                    continue
+
                 ocr_text = await self.get_ocr_from_image(image=img["image"])
 
                 # Adjust page_num for 0-indexed access and check if it's within range
@@ -180,6 +185,21 @@ async def extract_img(self, doc, file_path, data, doc_source):
                     doc_source=doc_source,
                 )
 
+    async def analyze_image(self, image_bytes, min_dimension=100):
+        """Classify an extracted image by its pixel dimensions.
+
+        Args:
+            image_bytes: Encoded image data; wrapped in ``io.BytesIO`` and
+                handed to ``Image.open``.
+            min_dimension: Pixel threshold. An image is rejected only when
+                BOTH its width and its height are below this value, so a
+                long thin image with one large side is still accepted.
+
+        Returns:
+            A ``(status, aspect_ratio)`` tuple. ``status`` is either
+            ``"Image too small"`` or ``"Image accepted"``; ``aspect_ratio``
+            is ``width / height`` (``0.0`` if the reported height is 0).
+
+        Raises:
+            Whatever ``Image.open`` raises for data it cannot decode; no
+            exception is caught here, so it propagates to the caller.
+        """
+        # Only the size is needed, so the image is closed as soon as it has
+        # been read instead of being left open until garbage collection.
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            width, height = image.size
+
+        # Guard the division so a degenerate zero-height size cannot raise.
+        aspect_ratio = width / height if height else 0.0
+
+        if width < min_dimension and height < min_dimension:
+            return "Image too small", aspect_ratio
+        return "Image accepted", aspect_ratio
+
     async def get_ocr_from_image(self, image):
         """Implement this to return ocr text of the image"""
         try:

diff --git a/tests/workflows/openai_ingested_images_test.py b/tests/workflows/openai_ingested_images_test.py
@@ -32,7 +32,7 @@
 #     #         host="localhost", 
 #     #         port="5432")
 #     # # ml_conn = MilvusDBConnection()
-#     directories = [ "/home/ansh/pyg-trail/testing-ocr"]
+#     directories = [ "/home/ansh/pyg-trail/testing-aspect"]
 #     collectors = [
 #         FSCollectorFactory().resolve(
 #             Uri("file://" + str(Path(directory).resolve())),