Skip to content

Commit

Permalink
Clean up too small regions
Browse files Browse the repository at this point in the history
  • Loading branch information
MMaas3 committed Dec 14, 2023
1 parent f62d95f commit 90e69fe
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 15 deletions.
17 changes: 13 additions & 4 deletions page_xml/output_pageXML.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __init__(
cfg: Optional[CfgNode] = None,
whitelist: Optional[Iterable[str]] = None,
rectangle_regions: Optional[list[str]] = [],
min_region_contour: int = 10
) -> None:
"""
Class for the generation of the pageXML from class predictions on images
Expand All @@ -68,10 +69,19 @@ def __init__(
regions (Optional[list[str]], optional): list of regions to extract from pageXML. Defaults to None.
merge_regions (Optional[list[str]], optional): list of region to merge into one. Defaults to None.
region_type (Optional[list[str]], optional): list of strings that map Page XML Region to a class defined in
'regions'.
'regions'. Defaults to None.
cfg (Optional[CfgNode]): contains the configuration that is used for provenance in the pageXML.
Defaults to None.
whitelist (Optional[Iterable[str]]): names of the configuration fields to be used in the pageXML.
Defaults to None.
rectangle_regions (Optional[list[str]]): the regions that have to be described by the minimal rectangle
that fits them. Defaults to an empty list.
min_region_contour (int): minimum contour area (as measured by cv2.contourArea) a region must have to be
considered a valid region; smaller regions are skipped. Defaults to 10.
"""
super().__init__(mode, line_width, regions, merge_regions, region_type)

self.min_region_contour = min_region_contour
self.rectangle_regions = rectangle_regions
self.logger = logging.getLogger(get_logger_name())

Expand Down Expand Up @@ -182,9 +192,8 @@ def generate_single_page(
# --- remove small objects
if cnt.shape[0] < 4:
continue
# TODO what size
# if cv2.contourArea(cnt) < size:
# continue
if cv2.contourArea(cnt) < self.min_region_contour:
continue

region_id += 1

Expand Down
61 changes: 50 additions & 11 deletions test/test_output_pageXML.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_one_region_type(self):
array = np.array([background, image])
tensor = torch.from_numpy(array)

xml.generate_single_page(tensor, Path("/tmp/test.png"), 20, 20)
xml.generate_single_page(tensor, Path("/tmp/test.png"), 100, 100)

page_path = path.join(output, "page", "test.xml")
self.assertTrue(path.exists(page_path), "Page file does not exist")
Expand All @@ -42,7 +42,7 @@ def test_one_region_type(self):
namespaces = {"page": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
coords_elements = page.findall("./page:Page/page:ImageRegion/page:Coords", namespaces=namespaces)
self.assertEqual(1, len(coords_elements))
self.assertEqual("10,0 10,18 18,18 18,0", coords_elements[0].attrib.get("points"))
self.assertEqual("50,0 50,90 90,90 90,0", coords_elements[0].attrib.get("points"))

def test_multiple_region_types(self):
output = tempfile.mktemp("_laypa_test")
Expand All @@ -63,7 +63,7 @@ def test_multiple_region_types(self):
array = np.array([background, image, text])
tensor = torch.from_numpy(array)

xml.generate_single_page(tensor, Path("/tmp/test.png"), 20, 20)
xml.generate_single_page(tensor, Path("/tmp/test.png"), 100, 100)

page_path = path.join(output, "page", "test.xml")
self.assertTrue(path.exists(page_path), "Page file does not exist")
Expand All @@ -72,11 +72,11 @@ def test_multiple_region_types(self):
namespaces = {"page": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
image_coords_elements = page.findall("./page:Page/page:ImageRegion/page:Coords", namespaces=namespaces)
self.assertEqual(1, len(image_coords_elements))
self.assertEqual("4,0 4,18 8,18 8,0", image_coords_elements[0].attrib.get("points"))
self.assertEqual("20,0 20,90 40,90 40,0", image_coords_elements[0].attrib.get("points"))

text_coords_elements = page.findall("./page:Page/page:TextRegion/page:Coords", namespaces=namespaces)
self.assertEqual(1, len(text_coords_elements))
self.assertEqual("10,0 10,18 18,18 18,0", text_coords_elements[0].attrib.get("points"))
self.assertEqual("50,0 50,90 90,90 90,0", text_coords_elements[0].attrib.get("points"))

def test_region_not_square(self):
output = tempfile.mktemp("_laypa_test")
Expand Down Expand Up @@ -105,15 +105,15 @@ def test_region_not_square(self):
array = np.array([background, image])
tensor = torch.from_numpy(array)

xml.generate_single_page(tensor, Path("/tmp/test.png"), 20, 20)
xml.generate_single_page(tensor, Path("/tmp/test.png"), 100, 100)

page_path = path.join(output, "page", "test.xml")
self.assertTrue(path.exists(page_path), "Page file does not exist")
page = ET.parse(page_path)
namespaces = {"page": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
image_coords_elements = page.findall("./page:Page/page:ImageRegion/page:Coords", namespaces=namespaces)
self.assertEqual(1, len(image_coords_elements))
self.assertEqual("8,4 4,8 8,12 10,12 14,8 10,4", image_coords_elements[0].attrib.get("points"))
self.assertEqual("40,20 20,40 40,60 50,60 70,40 50,20", image_coords_elements[0].attrib.get("points"))

def test_rectangle_region_does_cotains_4_points(self):
output = tempfile.mktemp("_laypa_test")
Expand Down Expand Up @@ -143,7 +143,7 @@ def test_rectangle_region_does_cotains_4_points(self):
array = np.array([background, image])
tensor = torch.from_numpy(array)

xml.generate_single_page(tensor, Path("/tmp/test.png"), 20, 20)
xml.generate_single_page(tensor, Path("/tmp/test.png"), 100, 100)

page_path = path.join(output, "page", "test.xml")
self.assertTrue(path.exists(page_path), "Page file does not exist")
Expand Down Expand Up @@ -182,7 +182,7 @@ def test_rectangle_region_does_create_floating_point_coords(self):
array = np.array([background, image])
tensor = torch.from_numpy(array)

xml.generate_single_page(tensor, Path("/tmp/test.png"), 20, 20)
xml.generate_single_page(tensor, Path("/tmp/test.png"), 100, 100)

page_path = path.join(output, "page", "test.xml")
self.assertTrue(path.exists(page_path), "Page file does not exist")
Expand Down Expand Up @@ -242,7 +242,7 @@ def test_only_rectangle_region_one_type(self):
array = np.array([background, image, text])
tensor = torch.from_numpy(array)

xml.generate_single_page(tensor, Path("/tmp/test.png"), 20, 20)
xml.generate_single_page(tensor, Path("/tmp/test.png"), 100, 100)

page_path = path.join(output, "page", "test.xml")
self.assertTrue(path.exists(page_path), "Page file does not exist")
Expand Down Expand Up @@ -287,7 +287,7 @@ def test_merge_overlapping_squares(self):
array = np.array([background, image])
tensor = torch.from_numpy(array)

xml.generate_single_page(tensor, Path("/tmp/test.png"), 10, 10)
xml.generate_single_page(tensor, Path("/tmp/test.png"), 100, 100)

page_path = path.join(output, "page", "test.xml")
self.assertTrue(path.exists(page_path), "Page file does not exist")
Expand All @@ -297,5 +297,44 @@ def test_merge_overlapping_squares(self):
self.assertEqual(1, len(image_coords_elements), "more than 1 image is found")


def test_ignores_too_small_regions(self):
    """A region smaller than ``min_region_contour`` must not be written to the pageXML.

    The prediction contains a single 2x2 "Photo" blob in the top-left corner of a
    10x10 map. The writer is configured with ``min_region_contour=10``, and the page
    is generated with 20, 20 as the size arguments, so the blob is expected to fall
    below the threshold and produce no ``ImageRegion`` element.
    """
    output = tempfile.mktemp("_laypa_test")
    xml = OutputPageXML(
        "region",
        output,
        5,
        ["Photo"],
        [],
        ["ImageRegion:Photo"],
        cfg=None,
        whitelist=[],
        rectangle_regions=["Photo"],
        min_region_contour=10,
    )

    # Background channel: 1 everywhere except a 2x2 hole in the top-left corner.
    background = np.ones((10, 10), dtype=int)
    background[:2, :2] = 0

    # Photo channel: the complement of the background, i.e. only the 2x2 hole.
    image = (background != 1) * 1
    tensor = torch.from_numpy(np.array([background, image]))

    xml.generate_single_page(tensor, Path("/tmp/test.png"), 20, 20)

    page_path = path.join(output, "page", "test.xml")
    self.assertTrue(path.exists(page_path), "Page file does not exist")

    namespaces = {"page": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
    page = ET.parse(page_path)
    image_region_elements = page.findall("./page:Page/page:ImageRegion", namespaces=namespaces)

    # The page file is still written, but it must contain no image regions.
    self.assertEqual(0, len(image_region_elements))


# Run the unittest test runner when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 90e69fe

Please sign in to comment.