|
1 | 1 | import pympi |
2 | 2 |
|
| 3 | +from lxml import etree |
| 4 | +from typing import Dict, List |
| 5 | + |
3 | 6 |
|
4 | 7 | def get_elan_sentences(elan_path: str): |
5 | 8 |
|
@@ -71,3 +74,88 @@ def get_elan_sentences(elan_path: str): |
71 | 74 | ) |
72 | 75 |
|
73 | 76 | yield sentence |
| 77 | + |
| 78 | + |
def get_child_elements(root: "etree._ElementTree",
                       element_name: str,
                       attributes_to_extract: List[str]) -> Dict[str, Dict[str, str]]:
    """
    Collect selected attributes of every `element_name` element that is a
    direct child of the `ilex-data` document root, keyed by the element's
    `id` attribute.

    :param root: Parsed XML tree offering an `xpath` method (as returned by
                 `etree.parse`).
    :param element_name: Tag name of the child elements to select,
                         e.g. "informant".
    :param attributes_to_extract: Names of the attributes to copy from each
                                  selected element.
    :return: Dictionary mapping each element id to a dictionary of
             attribute name -> attribute value. An attribute that is absent
             on an element maps to None. Elements without an `id` attribute
             are skipped. If several elements share an id, the last one wins.
    """

    elements = root.xpath("/ilex-data/" + element_name)  # type: List[etree._Element]

    by_id = {}  # type: Dict[str, Dict[str, str]]

    for element in elements:
        id_ = element.get("id")

        if id_ is None:
            # Without an id the element cannot be looked up later; storing it
            # under the key None would also make all such elements collide.
            continue

        by_id[id_] = {attribute_name: element.get(attribute_name)
                      for attribute_name in attributes_to_extract}

    return by_id
| 102 | + |
| 103 | + |
def get_signer_ids_from_ilex(ilex_path: str) -> Dict[str, List[str]]:
    """
    Read an iLex XML export and return, for each camera perspective, the
    names of the informants visible in that perspective.

    File structure:

    <ilex-data source="meinedgs.de" version="1.1" database_version="51">
        <camera_perspective id="1" code="A1" english="Frontal view on informant A"
                            localised="Frontalansicht Informant A" visible_persons="{1}"/>
        <camera_perspective id="2" code="B1" english="Frontal view on informant B"
                            localised="Frontalansicht Informant B" visible_persons="{2}"/>
        <camera_perspective id="3" code="C" english="Total on all three persons"
                            localised="Totale auf alle drei Personen" visible_persons="{2,3,1}"/>
        <movie_track id="3" movie="1" camera_perspective="3" path="./1177918_1c.mp4"
                     track_length="00:09:25:04"/>
        <movie_track id="1" movie="1" camera_perspective="1" path="./1177918_1a1.mp4"
                     track_length="00:09:25:04"/>
        <movie_track id="2" movie="1" camera_perspective="2" path="./1177918_1b1.mp4"
                     track_length="00:09:25:04"/>
        <informant id="1" sex="1" name="SH-12" short_name="SH-12"/>
        <informant id="2" sex="1" name="SH-13" short_name="SH-13"/>
        <informant id="3" sex="2" name="sh-mod-1" short_name="sh-mod-1"/>
        <participation id="1" movie="1" role="1" informant="1"/>
        <participation id="2" movie="1" role="1" informant="2"/>
        <participation id="3" movie="1" role="2" informant="3"/>
        <!--...-->
    </ilex-data>

    :param ilex_path: Path to the iLex XML file.
    :return: Dictionary mapping the lowercased first character of each camera
             perspective code (e.g. "a" for "A1") to the list of informant
             names visible in that perspective, in the order they are listed
             in `visible_persons`. If several perspectives share the same
             first character, the one processed last wins.
    :raises KeyError: If `visible_persons` refers to an informant id that has
                      no matching `informant` element.
    """

    root = etree.parse(ilex_path)

    informant_dict = get_child_elements(root=root,
                                        element_name="informant",
                                        attributes_to_extract=["name"])

    camera_perspective_dict = get_child_elements(root=root,
                                                 element_name="camera_perspective",
                                                 attributes_to_extract=["visible_persons", "code"])

    signer_identities_by_perspective = {}  # type: Dict[str, List[str]]

    for camera_perspective in camera_perspective_dict.values():

        code = camera_perspective["code"]

        if not code:
            # no code attribute (or an empty one): no key can be derived
            continue

        # extract A, B or C without trailing numbers

        clean_code = code[0].lower()

        # remove enclosing "{" and "}" for list of informant ids; a missing
        # attribute is treated like an empty set

        raw_ids = (camera_perspective["visible_persons"] or "").strip().strip("{}")

        # "".split(",") yields [""], so drop empty tokens (empty set "{}") and
        # tolerate whitespace around ids ("{2, 3, 1}")

        ids_of_visible_persons = [id_.strip() for id_ in raw_ids.split(",") if id_.strip()]

        names_of_visible_persons = [informant_dict[id_]["name"] for id_ in ids_of_visible_persons]

        signer_identities_by_perspective[clean_code] = names_of_visible_persons

    return signer_identities_by_perspective
0 commit comments