aphp · pidoux7 · May 15, 2024 · Jul 3, 2024 · Jul 3, 2024 · Jul 3, 2024
diff --git a/.gitignore b/.gitignore
@@ -53,6 +53,7 @@ _build/
 *.tar.gz
 *.tsv
 *.ann
+!text.ann
 
 # Editors
 .idea

diff --git a/changelog.md b/changelog.md
@@ -6,6 +6,14 @@
 
 - `eds.tables` accepts a minimum_table_size (default 2) argument to reduce pollution
 - `RuleBasedQualifier` now expose a `process` method that only returns qualified entities and token without actually tagging them, defering this task to the `__call__` method.
+- Relations are stored on spans as a list of dicts: `doc.spans["<label>"][i]._.rel = [{'type': 'rel_type', 'target': <span>}, ...]`
+- Relation support in the BRAT connector (`brat2docs` and `docs2brat` in `edsnlp.connectors.brat`), compatible with `edsnlp.data.read_*` and `edsnlp.data.write_*` (modified files: `edsnlp.data.converters`, `edsnlp.data.standoff`)
+- Rule-based relation model using proximity and/or sentence boundaries in `edsnlp.pipes.misc.relations`, registered as `eds.relations`
+- MkDocs documentation for relations in `docs/pipes/misc/relations.md` and `docs/pipes/misc/index.md`
+- Tests for relations in `tests.pipelines.misc.test_relations` and resources in `ressources.relations`
+- `data.set_processing(...)` now exposes an `autocast` parameter to disable or tweak the automatic casting of tensors
+  during processing. Autocasting should result in a slight speedup, but may lead to numerical instability.
+- Use `torch.inference_mode` to disable view tracking and version counter bumps during inference.
 
 ### Fixed
 

diff --git a/docs/pipes/misc/index.md b/docs/pipes/misc/index.md
@@ -16,5 +16,6 @@ For instance, the date detection and normalisation pipeline falls in this catego
 | `eds.sections`           | Section detection                           |
 | `eds.reason`             | Rule-based hospitalisation reason detection |
 | `eds.tables`             | Tables detection                            |
+| `eds.relations`          | Relations extraction                        |
 
 <!-- --8<-- [end:components] -->
diff --git a/docs/pipes/misc/relations.md b/docs/pipes/misc/relations.md
@@ -0,0 +1,8 @@
+# Relations {: #edsnlp.pipes.misc.relations.factory.create_component }
+
+::: edsnlp.pipes.misc.relations.factory.create_component
+    options:
+        heading_level: 2
+        show_bases: false
+        show_source: false
+        only_class_level: true
diff --git a/edsnlp/data/converters.py b/edsnlp/data/converters.py
@@ -244,12 +244,19 @@ def __call__(self, obj):
             if not Span.has_extension(dst):
                 Span.set_extension(dst, default=None)
 
+        ############## Modifications for relations ###############
+        dict_entities = {}  ## dict for entity storage
         for ent in obj.get("entities") or ():
+            begin = min(f["begin"] for f in ent["fragments"])  # start of the entity
+            end = max(f["end"] for f in ent["fragments"])  # end of the entity
+            dict_entities[ent["entity_id"]] = (
+                ent["label"] + ";" + str(begin) + ";" + str(end)
+            )
             fragments = (
                 [
                     {
-                        "begin": min(f["begin"] for f in ent["fragments"]),
-                        "end": max(f["end"] for f in ent["fragments"]),
+                        "begin": begin,
+                        "end": end,
                     }
                 ]
                 if not self.split_fragments
@@ -267,6 +274,11 @@ def __call__(self, obj):
                     if isinstance(ent["attributes"], list)
                     else ent["attributes"]
                 )
+                attributes = (
+                    {a["label"]: a["value"] for a in ent["attributes"]}
+                    if isinstance(ent["attributes"], list)
+                    else ent["attributes"]
+                )
                 if self.notes_as_span_attribute and ent["notes"]:
                     ent["attributes"][self.notes_as_span_attribute] = "|".join(
                         note["value"] for note in ent["notes"]
@@ -302,6 +314,67 @@ def __call__(self, obj):
                 if span._.get(attr) is None:
                     span._.set(attr, value)
 
+        ############## Modifications for relations ###############
+        # add relations in spans
+        if not Span.has_extension("rel"):
+            Span.set_extension("rel", default=[])
+
+        for rel in obj.get("relations") or ():  # iterates relations
+            for label in doc.spans:  # iterates source labels
+                for i, spa in enumerate(doc.spans[label]):  # iterates source spans
+                    bo = False
+
+                    # relations
+                    if dict_entities[rel["from_entity_id"]].split(";") == [
+                        label,
+                        str(spa.start_char),
+                        str(spa.end_char),
+                    ]:  # sif source entity is the same as the span
+                        for label2 in doc.spans:  # iiterates target labels
+                            for j, spa2 in enumerate(
+                                doc.spans[label2]
+                            ):  # iterates target label
+                                if dict_entities[rel["to_entity_id"]].split(";") == [
+                                    label2,
+                                    str(spa2.start_char),
+                                    str(spa2.end_char),
+                                ]:  # if target entity is the same as the span
+                                    relation = {
+                                        "type": rel["relation_label"],
+                                        "target": doc.spans[label2][j],
+                                    }  # create the relation
+                                    doc.spans[label][i]._.rel.append(
+                                        relation
+                                    )  # add the relation to the span
+                                    bo = True
+                                    break
+                            if bo:
+                                break
+                    bo = False
+
+                    # inverse relations
+                    if dict_entities[rel["to_entity_id"]].split(";") == [
+                        label,
+                        str(spa.start_char),
+                        str(spa.end_char),
+                    ]:
+                        for label2 in doc.spans:
+                            for j, spa2 in enumerate(doc.spans[label2]):
+                                if dict_entities[rel["from_entity_id"]].split(";") == [
+                                    label2,
+                                    str(spa2.start_char),
+                                    str(spa2.end_char),
+                                ]:
+                                    relation = {
+                                        "type": "inv_" + rel["relation_label"],
+                                        "target": doc.spans[label2][j],
+                                    }
+                                    doc.spans[label][i]._.rel.append(relation)
+                                    bo = True
+                                    break
+                            if bo:
+                                break
+
         return doc
 
 
@@ -346,12 +419,9 @@ def __init__(
 
     def __call__(self, doc):
         spans = get_spans(doc, self.span_getter)
-        obj = {
-            FILENAME: doc._.note_id,
-            "doc_id": doc._.note_id,
-            "text": doc.text,
-            "entities": [
-                {
+        entities = []
+        for i, ent in enumerate(sorted(dict.fromkeys(spans))):
+            entity = {
                     "entity_id": i,
                     "fragments": [
                         {
@@ -366,9 +436,61 @@ def __call__(self, doc):
                     },
                     "label": ent.label_,
                 }
-                for i, ent in enumerate(sorted(dict.fromkeys(spans)))
-            ],
+            if ent._.has("note") and ent._.note is not None:
+                entity["note"] = ent._.note
+            entities.append(entity)
+
+        # mapping between entities and their `entity_id`
+        entity_map = {
+            (
+                ent["fragments"][0]["begin"],
+                ent["fragments"][0]["end"],
+                ent["label"],
+            ): ent["entity_id"]
+            for ent in entities
+        }
+
+        # doesn't include 'inv_' relations
+        relations = []
+        relation_idx = 1
+        for span_label, span_list in doc.spans.items():
+            for spa in span_list:
+                if spa._.has("rel") and len(spa._.rel) > 0:
+                    source_entity_id = entity_map.get(
+                        (spa.start_char, spa.end_char, spa.label_)
+                    )
+                    for rel in spa._.rel:
+                        if not rel["type"].startswith("inv_"):
+                            target_entity_id = entity_map.get(
+                                (
+                                    rel["target"].start_char,
+                                    rel["target"].end_char,
+                                    rel["target"].label_,
+                                )
+                            )
+                            if (
+                                source_entity_id is not None
+                                and target_entity_id is not None
+                            ):
+                                relations.append(
+                                    {
+                                        "rel_id": relation_idx,
+                                        "from_entity_id": source_entity_id,
+                                        "relation_type": rel["type"],
+                                        "to_entity_id": target_entity_id,
+                                    }
+                                )
+                                relation_idx += 1
+
+        # final object
+        obj = {
+            FILENAME: doc._.note_id,
+            "doc_id": doc._.note_id,
+            "text": doc.text,
+            "entities": entities,
+            "relations": relations,
         }
+
         return obj
 
 

diff --git a/edsnlp/data/standoff.py b/edsnlp/data/standoff.py
@@ -220,14 +220,15 @@ def dump_standoff_file(
     if parent_dir and not fs.exists(parent_dir):
         fs.makedirs(parent_dir, exist_ok=True)
     if not fs.exists(txt_filename) or overwrite_txt:
-        with fs.open(txt_filename, "w") as f:
+        with fs.open(txt_filename, "w", encoding="utf-8") as f:
             f.write(doc["text"])
 
     ann_filename = txt_filename.replace(".txt", ".ann")
     attribute_idx = 1
+    note_idx = 1
     entities_ids = defaultdict(lambda: "T" + str(len(entities_ids) + 1))
     if not fs.exists(ann_filename) or overwrite_ann:
-        with fs.open(ann_filename, "w") as f:
+        with fs.open(ann_filename, "w", encoding="utf-8") as f:
             if "entities" in doc:
                 for entity in doc["entities"]:
                     spans = []
@@ -264,20 +265,30 @@ def dump_standoff_file(
                                     file=f,
                                 )
                                 attribute_idx += 1
-
-                    # fmt: off
-                    # if "relations" in doc:
-                    #     for i, relation in enumerate(doc["relations"]):
-                    #         entity_from = entities_ids[relation["from_entity_id"]]
-                    #         entity_to = entities_ids[relation["to_entity_id"]]
-                    #         print(
-                    #             "R{}\t{} Arg1:{} Arg2:{}\t".format(
-                    #                 i + 1, str(relation["label"]), entity_from,
-                    #                 entity_to
-                    #             ),
-                    #             file=f,
-                    #         )
-                    # fmt: on
+                    if "note" in entity:
+                        print(
+                            "#{}\tAnnotatorNotes {}\t{}".format(
+                                note_idx,
+                                brat_entity_id,
+                                (" " + str(entity["note"])),
+                            ),
+                            file=f,
+                        )
+                        note_idx += 1
+            # Write the relation annotations
+            relation_idx = 1
+            if "relations" in doc:
+                for relation in doc["relations"]:
+                    print(
+                        "R{}\t{} Arg1:{} Arg2:{}".format(
+                            relation_idx,
+                            relation["relation_type"],
+                            entities_ids[relation["from_entity_id"]],
+                            entities_ids[relation["to_entity_id"]],
+                        ),
+                        file=f,
+                    )
+                    relation_idx += 1
 
 
 class StandoffReader(BaseReader):

diff --git a/edsnlp/pipes/__init__.py b/edsnlp/pipes/__init__.py
@@ -22,6 +22,7 @@
     from .misc.dates.factory import create_component as dates
     from .misc.quantities.factory import create_component as quantities
     from .misc.reason.factory import create_component as reason
+    from .misc.relations.factory import create_component as relations
     from .misc.sections.factory import create_component as sections
     from .misc.tables.factory import create_component as tables
     from .ner.adicap.factory import create_component as adicap

diff --git a/edsnlp/pipes/misc/relations/__init__.py b/edsnlp/pipes/misc/relations/__init__.py
@@ -0,0 +1 @@
+from .relations import RelationsMatcher
diff --git a/edsnlp/pipes/misc/relations/factory.py b/edsnlp/pipes/misc/relations/factory.py
@@ -0,0 +1,17 @@
+from edsnlp.core import registry
+
+from .relations import RelationsMatcher
+
+DEFAULT_CONFIG = dict(
+    scheme=None,
+    use_sentences=False,
+    clean_rel=False,
+    proximity_method="right",
+    max_dist=45,
+)
+
+create_component = registry.factory.register(
+    "eds.relations",
+    assigns=["doc.spans"],
+    deprecated=["relations"],
+)(RelationsMatcher)
diff --git a/edsnlp/pipes/misc/relations/patterns.py b/edsnlp/pipes/misc/relations/patterns.py
@@ -0,0 +1,17 @@
+scheme = [
+    {
+        "source": [{"label": "Chemical_and_drugs", "attr": {"Tech": [None]}}],
+        "target": [
+            {
+                "label": "Temporal",
+                "attr": {"AttTemp": [None, "Duration", "Date", "Frequency"]},
+            },
+            {
+                "label": "Chemical_and_drugs",
+                "attr": {"Tech": ["dosage", "route", "strength", "form"]},
+            },
+        ],
+        "type": "Depend",
+        "inv_type": "inv_Depend",
+    },
+]
-Original file line number
+Diff line change
@@ Expand Up / @@ -53,6 +53,7 @@ _build/ @@
     *.tar.gz
     *.tsv
     *.ann
+    !text.ann
     # Editors
     .idea
@@ Expand Down @@