update doc & update test

mindee · Oct 20, 2023 · 2b96ca7 · 2b96ca7
1 parent 75231a4
commit 2b96ca7
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 11 deletions.
diff --git a/docs/extras/guide/custom_v1.md b/docs/extras/guide/custom_v1.md
@@ -85,6 +85,53 @@ print(str(result.document.inference.prediction.fields["my-field"]))
 print(str(result.document.inference.prediction.classifications["my-classification"]))
 ```
 
+
+# 🧪 Custom Line Items
+
+> **⚠️ Warning**: Custom Line Items are an **experimental** feature, results may vary.
+
+
+Though not supported directly in the API, sometimes you might need to reconstitute line items by hand.
+The library provides a tool for this very purpose:
+
+## columns_to_line_items()
+The **columns_to_line_items()** function can be called from the document and page level prediction objects.
+
+It takes the following arguments:
+
+* **anchor_names** (`List[str]`): a list of the names of possible anchor (field) candidates for the horizontal placement of a line. If all provided anchors are invalid, the `CustomLineV1` won't be built.
+* **field_names** (`List[str]`): a list of fields to retrieve the values from
+* **height_tolerance** (`float`): Optional, the height tolerance used to build the line. It helps when the height of a line can vary unexpectedly.
+
+Example use:
+
+```python
+# document-level
+response.document.inference.prediction.columns_to_line_items(
+  anchor_names,
+  field_names,
+  0.011 # optional, defaults to 0.01
+)
+
+# page-level
+response.document.pages[0].prediction.columns_to_line_items(
+    anchor_names,
+    field_names,
+    0.011 # optional, defaults to 0.01
+)
+```
+
+It returns a list of [CustomLineV1](#customlinev1) objects.
+
+## CustomLineV1
+
+`CustomLineV1` represents a line as it has been read from column fields. It has the following attributes:
+
+* **row_number** (`int`): Number of a given line. Starts at 1.
+* **fields** (`Dict[str, ListFieldValueV1]`): The fields associated with the line, indexed by their column name.
+* **bbox** (`BBox`): Simple bounding box of the current line representing the 4 minimum & maximum coordinates as `float` values.
+
+
 # Questions?
 
 [Join our Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-1jv6nawjq-FDgFcF2T5CmMmRpl9LLptw)
diff --git a/mindee/product/custom/custom_v1_page.py b/mindee/product/custom/custom_v1_page.py
@@ -1,7 +1,8 @@
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 from mindee.parsing.common import Prediction, StringDict, clean_out_string
 from mindee.parsing.custom import ListFieldV1
+from mindee.parsing.custom.line_items import CustomLineV1, get_line_items
 
 
 class CustomV1Page(Prediction):
@@ -20,6 +21,26 @@ def __init__(self, raw_prediction: StringDict, page_id: Optional[int]) -> None:
         for field_name, field_contents in raw_prediction.items():
             self.fields[field_name] = ListFieldV1(field_contents, page_id=page_id)
 
+    def columns_to_line_items(
+        self,
+        anchor_names: List[str],
+        field_names: List[str],
+        height_tolerance: float = 0.01,
+    ) -> List[CustomLineV1]:
+        """
+        Order this page's column fields into line items.
+
+        Delegates to ``get_line_items``, passing it ``self.fields``.
+
+        :param anchor_names: list of possible anchor fields.
+        :param field_names: list of all column fields.
+        :param height_tolerance: height tolerance to apply to lines.
+        :return: the list of line items built by ``get_line_items``.
+        """
+        return get_line_items(
+            anchor_names,
+            field_names,
+            self.fields,
+            height_tolerance,
+        )
+
     def __str__(self) -> str:
         out_str = ""
         for field_name, field_value in self.fields.items():

diff --git a/tests/product/custom/test_custom_v1_line_items.py b/tests/product/custom/test_custom_v1_line_items.py
@@ -1,7 +1,22 @@
 import json
 
 from mindee.parsing.common.document import Document
+from mindee.parsing.common.page import Page
 from mindee.product.custom.custom_v1 import CustomV1
+from mindee.product.custom.custom_v1_page import CustomV1Page
+
+
+def do_tests(line_items):
+    """
+    Check reconstructed line items against the values expected by this test.
+
+    Asserts that there are exactly three lines and, for each of them, the
+    content of the ``beneficiary_name`` and ``beneficiary_birth_date``
+    fields as well as the 1-based ``row_number``.
+
+    :param line_items: list returned by ``columns_to_line_items()``.
+    """
+    assert len(line_items) == 3
+    assert line_items[0].fields["beneficiary_name"].content == "JAMES BOND 007"
+    assert line_items[0].fields["beneficiary_birth_date"].content == "1970-11-11"
+    assert line_items[0].row_number == 1
+    assert line_items[1].fields["beneficiary_name"].content == "HARRY POTTER"
+    assert line_items[1].fields["beneficiary_birth_date"].content == "2010-07-18"
+    assert line_items[1].row_number == 2
+    assert line_items[2].fields["beneficiary_name"].content == "DRAGO MALFOY"
+    assert line_items[2].fields["beneficiary_birth_date"].content == "2015-07-05"
+    assert line_items[2].row_number == 3
 
 
 def test_single_table_01():
@@ -10,6 +25,7 @@ def test_single_table_01():
     )
     json_data = json.load(open(json_data_path, "r"))
     doc = Document(CustomV1, json_data["document"]).inference.prediction
+    page = Page(CustomV1Page, json_data["document"]["inference"]["pages"][0])
     anchors = ["beneficiary_name"]
     columns = [
         "beneficiary_birth_date",
@@ -18,13 +34,6 @@ def test_single_table_01():
         "beneficiary_rank",
     ]
     line_items = doc.columns_to_line_items(anchors, columns, 0.011)
-    assert len(line_items) == 3
-    assert line_items[0].fields["beneficiary_name"].content == "JAMES BOND 007"
-    assert line_items[0].fields["beneficiary_birth_date"].content == "1970-11-11"
-    assert line_items[0].row_number == 1
-    assert line_items[1].fields["beneficiary_name"].content == "HARRY POTTER"
-    assert line_items[1].fields["beneficiary_birth_date"].content == "2010-07-18"
-    assert line_items[1].row_number == 2
-    assert line_items[2].fields["beneficiary_name"].content == "DRAGO MALFOY"
-    assert line_items[2].fields["beneficiary_birth_date"].content == "2015-07-05"
-    assert line_items[2].row_number == 3
+    do_tests(line_items)
+    line_items_page = page.prediction.columns_to_line_items(anchors, columns, 0.011)
+    do_tests(line_items_page)