async def analyze_image(self, image_bytes, min_dimension=100):
    """Decide whether an extracted image is worth passing on to OCR.

    Args:
        image_bytes: Encoded image data (bytes-like) to inspect.
        min_dimension: Pixel threshold. An image is rejected as too small
            only when BOTH its width and its height are below this value.

    Returns:
        A ``(status, aspect_ratio)`` tuple. ``status`` is exactly
        ``"Image accepted"`` when the image passes; any other string is a
        rejection reason. ``aspect_ratio`` is ``width / height``, or ``0.0``
        when it cannot be computed (unreadable or zero-sized image).
    """
    try:
        # Only the header is needed for the size; the context manager
        # releases the decoder as soon as it has been read.
        with Image.open(io.BytesIO(image_bytes)) as image:
            width, height = image.size
    except (OSError, ValueError):
        # Corrupt or unsupported image data: report a rejection so the
        # caller skips this image instead of aborting the whole document.
        return "Image unreadable", 0.0

    # Guard against degenerate images so the ratio below cannot divide by zero.
    if width <= 0 or height <= 0:
        return "Image too small", 0.0

    aspect_ratio = width / height

    # NOTE(review): an image is rejected only when both sides are small, so a
    # long thin strip (e.g. 1000x5) is still accepted -- confirm this is the
    # intended rule.
    if width < min_dimension and height < min_dimension:
        return "Image too small", aspect_ratio
    return "Image accepted", aspect_ratio