Skip to content

Commit

Permalink
fix assert part
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastianMindee committed Sep 4, 2024
1 parent 2f77fe3 commit b1e9ba0
Showing 1 changed file with 16 additions and 11 deletions.
27 changes: 16 additions & 11 deletions mindee/parsing/common/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class Document(Generic[TypePrediction, TypePage]):
"""Result of the base inference"""
id: str
"""Id of the document as sent back by the server"""
extras: Optional[Extras]
extras: Extras
"""Potential Extras fields sent back along the prediction"""
ocr: Optional[Ocr]
"""Potential raw text results read by the OCR (limited feature)"""
Expand Down Expand Up @@ -59,20 +59,25 @@ def __str__(self) -> str:
f"{self.inference}"
)

def _inject_full_text_ocr(self, raw_prediction: StringDict):
if len(raw_prediction["inference"]["pages"]) < 1:
return
if "extras" not in raw_prediction["inference"]["pages"][0]:
return
if "full_text_ocr" not in raw_prediction["inference"]["pages"][0]["extras"]:
def _inject_full_text_ocr(self, raw_prediction: StringDict) -> None:
pages = raw_prediction.get("inference", {}).get("pages", [])

if (
not pages
or "extras" not in pages[0]
or "full_text_ocr" not in pages[0]["extras"]
):
return
artificial_text_obj = {"content": ""}
artificial_text_obj["content"] += "\n".join(

full_text_content = "\n".join(
page["extras"]["full_text_ocr"]["content"]
for page in raw_prediction["inference"]["pages"]
for page in pages
if "extras" in page and "full_text_ocr" in page["extras"]
)

artificial_text_obj = {"content": full_text_content}

if not hasattr(self, "extras"):
self.extras = Extras({"full_text_ocr": artificial_text_obj})
else:
assert self.extras is not None
self.extras.add_artificial_extra({"full_text_ocr": artificial_text_obj})

0 comments on commit b1e9ba0

Please sign in to comment.