Merge pull request #24 from bricksdont/dgs_signer_id

AmitMY · web-flow · commit 363403d67333 · 2022-11-11T11:28:12.000+01:00
feature(dgs_corpus): util to extract signer ids
diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@
 setup(
     name="sign-language-datasets",
     packages=packages,
-    version="0.1.4",
+    version="0.1.5",
     description="TFDS Datasets for sign language",
     author="Amit Moryossef",
     author_email="amitmoryossef@gmail.com",
diff --git a/sign_language_datasets/datasets/dgs_corpus/dgs_utils.py b/sign_language_datasets/datasets/dgs_corpus/dgs_utils.py
@@ -1,5 +1,8 @@
 import pympi
 
+from lxml import etree
+from typing import Dict, List
+
 
 def get_elan_sentences(elan_path: str):
 
@@ -71,3 +74,88 @@ def get_elan_sentences(elan_path: str):
             )
 
             yield sentence
+
+
+def get_child_elements(root: etree.ElementTree,
+                       element_name: str,
+                       attributes_to_extract: List[str]) -> Dict[str, Dict[str, str]]:
+    """
+    Collect selected attributes of every "/ilex-data/<element_name>" element, keyed by element id.
+    :param root: parsed XML tree that is queried with XPath
+    :param element_name: tag name appended to the XPath prefix "/ilex-data/"
+    :param attributes_to_extract: attribute names to read from each matched element
+    :return: dict mapping each element's "id" attribute to a dict of attribute name -> value
+    """
+
+    elements = root.xpath("/ilex-data/" + element_name)  # type: List[etree.Element]
+
+    by_id = {}
+
+    for element in elements:
+        id_ = element.get("id")
+        by_id[id_] = {}
+        for attribute_name in attributes_to_extract:
+            value = element.get(attribute_name)
+            by_id[id_][attribute_name] = value
+
+    return by_id
+
+
+def get_signer_ids_from_ilex(ilex_path: str) -> Dict[str, List[str]]:
+    """
+    Map each camera perspective (lowercased first character of its code) to the names of its visible informants.
+    File structure:
+
+    <ilex-data source="meinedgs.de" version="1.1" database_version="51">
+        <camera_perspective id="1" code="A1" english="Frontal view on informant A"
+            localised="Frontalansicht Informant A" visible_persons="{1}"/>
+        <camera_perspective id="2" code="B1" english="Frontal view on informant B"
+            localised="Frontalansicht Informant B" visible_persons="{2}"/>
+        <camera_perspective id="3" code="C" english="Total on all three persons"
+            localised="Totale auf alle drei Personen" visible_persons="{2,3,1}"/>
+        <movie_track id="3" movie="1" camera_perspective="3" path="./1177918_1c.mp4"
+            track_length="00:09:25:04"/>
+        <movie_track id="1" movie="1" camera_perspective="1" path="./1177918_1a1.mp4"
+            track_length="00:09:25:04"/>
+        <movie_track id="2" movie="1" camera_perspective="2" path="./1177918_1b1.mp4"
+            track_length="00:09:25:04"/>
+        <informant id="1" sex="1" name="SH-12" short_name="SH-12"/>
+        <informant id="2" sex="1" name="SH-13" short_name="SH-13"/>
+        <informant id="3" sex="2" name="sh-mod-1" short_name="sh-mod-1"/>
+        <participation id="1" movie="1" role="1" informant="1"/>
+        <participation id="2" movie="1" role="1" informant="2"/>
+        <participation id="3" movie="1" role="2" informant="3"/>
+        <!--...-->
+    </ilex-data>
+
+    :param ilex_path: path of the ilex XML file, passed directly to etree.parse
+    :return: for the example above: {"a": ["SH-12"], "b": ["SH-13"], "c": ["SH-13", "sh-mod-1", "SH-12"]}
+    """
+
+    root = etree.parse(ilex_path)
+
+    informant_dict = get_child_elements(root=root,
+                                        element_name="informant",
+                                        attributes_to_extract=["name"])
+
+    camera_perspective_dict = get_child_elements(root=root,
+                                                 element_name="camera_perspective",
+                                                 attributes_to_extract=["visible_persons", "code"])
+
+    signer_identities_by_perspective = {}  # type: Dict[str, List[str]]
+
+    for camera_perspective in camera_perspective_dict.values():
+
+        # extract A, B or C without trailing numbers
+
+        clean_code = camera_perspective["code"][0].lower()
+
+        # remove enclosing "{" and "}" for list of informant ids
+        # (plain string slicing: assumes the attribute is present and formatted like "{2,3,1}")
+        ids_of_visible_persons = camera_perspective["visible_persons"][1:-1].split(",")
+
+        names_of_visible_persons = [informant_dict[id_]["name"] for id_ in ids_of_visible_persons]
+        # perspectives whose codes share a first character overwrite each other here
+        signer_identities_by_perspective[clean_code] = names_of_visible_persons
+
+    return signer_identities_by_perspective