Skip to content

Commit

Permalink
update doc & update test
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastianMindee committed Oct 20, 2023
1 parent 75231a4 commit 2b96ca7
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 11 deletions.
47 changes: 47 additions & 0 deletions docs/extras/guide/custom_v1.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,53 @@ print(str(result.document.inference.prediction.fields["my-field"]))
print(str(result.document.inference.prediction.classifications["my-classification"]))
```


# 🧪 Custom Line Items

> **⚠️ Warning**: Custom Line Items are an **experimental** feature, results may vary.

Though not supported directly in the API, sometimes you might need to reconstitute line items by hand.
The library provides a tool for this very purpose:

## columns_to_line_items()
The **columns_to_line_items()** function can be called from the document and page level prediction objects.

It takes the following arguments:

* **anchor_names** (`List[str]`): a list of the names of possible anchor (field) candidates for the horizontal placement of a line. If all provided anchors are invalid, the `LineItemV1` won't be built.
* **field_names** (`List[str]`): a list of fields to retrieve the values from
* **height_tolerance** (`float`): Optional (defaults to `0.01`), the height tolerance used to build the line. It helps when the height of a line can vary unexpectedly.

Example use:

```python
# document-level
response.document.inference.prediction.columns_to_line_items(
anchor_names,
field_names,
0.011 # optional, defaults to 0.01
)

# page-level
response.document.pages[0].prediction.columns_to_line_items(
anchor_names,
field_names,
0.011 # optional, defaults to 0.01
)
```

It returns a list of [CustomLineV1](#customlinev1) objects.

## CustomLineV1

`CustomLineV1` represents a line as it has been read from column fields. It has the following attributes:

* **row_number** (`int`): Number of a given line. Starts at 1.
* **fields** (`Dict[str, ListFieldValueV1]`): Dictionary of the fields associated with the line, indexed by their column name.
* **bbox** (`BBox`): Simple bounding box of the current line representing the 4 minimum & maximum coordinates as `float` values.


# Questions?

[Join our Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-1jv6nawjq-FDgFcF2T5CmMmRpl9LLptw)
23 changes: 22 additions & 1 deletion mindee/product/custom/custom_v1_page.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typing import Dict, Optional
from typing import Dict, List, Optional

from mindee.parsing.common import Prediction, StringDict, clean_out_string
from mindee.parsing.custom import ListFieldV1
from mindee.parsing.custom.line_items import CustomLineV1, get_line_items


class CustomV1Page(Prediction):
Expand All @@ -20,6 +21,26 @@ def __init__(self, raw_prediction: StringDict, page_id: Optional[int]) -> None:
for field_name, field_contents in raw_prediction.items():
self.fields[field_name] = ListFieldV1(field_contents, page_id=page_id)

def columns_to_line_items(
    self,
    anchor_names: List[str],
    field_names: List[str],
    height_tolerance: float = 0.01,
) -> List[CustomLineV1]:
    """
    Arrange this prediction's column fields into line items.

    Delegates to ``get_line_items``, handing it ``self.fields`` together
    with the arguments received.

    :param anchor_names: names of the candidate anchor fields.
    :param field_names: names of all the column fields.
    :param height_tolerance: height tolerance to apply to lines.
    :return: the line items built by ``get_line_items``.
    """
    line_items = get_line_items(
        anchor_names, field_names, self.fields, height_tolerance
    )
    return line_items

def __str__(self) -> str:
out_str = ""
for field_name, field_value in self.fields.items():
Expand Down
29 changes: 19 additions & 10 deletions tests/product/custom/test_custom_v1_line_items.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
import json

from mindee.parsing.common.document import Document
from mindee.parsing.common.page import Page
from mindee.product.custom.custom_v1 import CustomV1
from mindee.product.custom.custom_v1_page import CustomV1Page


def do_tests(line_items):
    """
    Assert that ``line_items`` holds the three expected beneficiary rows.

    Each row is checked, in order, for its name, its birth date and its
    1-based ``row_number``.

    :param line_items: sequence of line items to verify.
    """
    expected_rows = (
        ("JAMES BOND 007", "1970-11-11"),
        ("HARRY POTTER", "2010-07-18"),
        ("DRAGO MALFOY", "2015-07-05"),
    )
    assert len(line_items) == 3
    for position, (name, birth_date) in enumerate(expected_rows):
        line = line_items[position]
        assert line.fields["beneficiary_name"].content == name
        assert line.fields["beneficiary_birth_date"].content == birth_date
        # Row numbers start at 1, list positions at 0.
        assert line.row_number == position + 1


def test_single_table_01():
Expand All @@ -10,6 +25,7 @@ def test_single_table_01():
)
json_data = json.load(open(json_data_path, "r"))
doc = Document(CustomV1, json_data["document"]).inference.prediction
page = Page(CustomV1Page, json_data["document"]["inference"]["pages"][0])
anchors = ["beneficiary_name"]
columns = [
"beneficiary_birth_date",
Expand All @@ -18,13 +34,6 @@ def test_single_table_01():
"beneficiary_rank",
]
line_items = doc.columns_to_line_items(anchors, columns, 0.011)
assert len(line_items) == 3
assert line_items[0].fields["beneficiary_name"].content == "JAMES BOND 007"
assert line_items[0].fields["beneficiary_birth_date"].content == "1970-11-11"
assert line_items[0].row_number == 1
assert line_items[1].fields["beneficiary_name"].content == "HARRY POTTER"
assert line_items[1].fields["beneficiary_birth_date"].content == "2010-07-18"
assert line_items[1].row_number == 2
assert line_items[2].fields["beneficiary_name"].content == "DRAGO MALFOY"
assert line_items[2].fields["beneficiary_birth_date"].content == "2015-07-05"
assert line_items[2].row_number == 3
do_tests(line_items)
line_items_page = page.prediction.columns_to_line_items(anchors, columns, 0.011)
do_tests(line_items_page)

0 comments on commit 2b96ca7

Please sign in to comment.