3 changes: 3 additions & 0 deletions .gitignore
@@ -73,3 +73,6 @@ docs/reference
docs/changelog.md
docs/contributing.md
.vercel

# Work development
dev/*
9 changes: 9 additions & 0 deletions changelog_unreleased.md
@@ -0,0 +1,9 @@
### Added
- New `eds.external_information_qualifier` pipeline component to qualify spans in a document based on external information and a defined distance to these contextual/external elements, as in distant supervision.
- New `eds.contextual_qualifier` pipeline component to qualify spans based on contextual information.
- Add the `edsnlp_blank_nlp` fixture for the tests.

### Fixed
- Correct the contributing documentation: remove the `$ pre-commit run --all-files` recommendation.
- Fix the `Obj Class` in the doc template `class.html`.
- Fix the `get_pipe_meta` function.
11 changes: 1 addition & 10 deletions contributing.md
@@ -43,16 +43,7 @@ $ pre-commit install

The pre-commit hooks defined in the [configuration](https://github.com/aphp/edsnlp/blob/master/.pre-commit-config.yaml) will automatically run when you commit your changes, letting you know if something went wrong.

The hooks only run on staged changes. To force-run it on all files, run:

<div class="termy">

```console
$ pre-commit run --all-files
---> 100%
color:green All good !
```

The hooks only run on staged changes.
</div>

## Proposing a merge request
163 changes: 163 additions & 0 deletions docs/advanced-tutorials/distant_annotation.md
@@ -0,0 +1,163 @@
# External Information & Context qualifiers

This tutorial shows how to qualify spans or entities with two pipes: the `ContextualQualifier` and the `ExternalInformationQualifier`.

### Import dependencies
```python
import datetime

import pandas as pd

import edsnlp
from edsnlp.pipes.qualifiers.contextual.contextual import (
    ClassPatternsContext,
    ContextualQualifier,
)
from edsnlp.pipes.qualifiers.external_information.external_information import (
    ExternalInformation,
    ExternalInformationQualifier,
)
from edsnlp.utils.collections import get_deep_attr
```

### Data
Let's start by creating a toy example.
```python
# Create context dates
# The elements under this attribute should be a list of dicts with the keys "value" and "class"
context_dates = [
    {
        "value": datetime.datetime(2024, 2, 15),
        "class": "Magnetic resonance imaging (procedure)",
    },
    {"value": datetime.datetime(2024, 2, 17), "class": "Biopsy (procedure)"},
    {"value": datetime.datetime(2024, 2, 17), "class": "Colonoscopy (procedure)"},
]

# Text
text = """
RCP du 18/12/2024 : DUPONT Jean

Homme de 68 ans adressé en consultation d’oncologie pour prise en charge d’une tumeur du colon.
Antécédents : HTA, diabète de type 2, dyslipidémie, tabagisme actif (30 PA), alcoolisme chronique (60 g/jour).

Examen clinique : patient en bon état général, poids 80 kg, taille 1m75.


HISTOIRE DE LA MALADIE :
Lors du PET-CT (14/02/2024), des dépôts pathologiques ont été observés qui coïncidaient avec les résultats du scanner.
Le 15/02/2024, une IRM a été réalisée pour évaluer l’extension de la tumeur.
Une colonoscopie a été réalisée le 17/02/2024 avec une biopsie d'adénopathie sous-carinale.
Une deuxième a été biopsié le 18/02/2024. Les résultats de la biopsie ont confirmé un adénocarcinome du colon.
Il a été opéré le 20/02/2024. L’examen anatomopathologique de la pièce opératoire a confirmé un adénocarcinome du colon stade IV avec métastases hépatiques et pulmonaires.
Trois mois après la fin du traitement de chimiothérapie (avril 2024), le patient a signalé une aggravation progressive des symptômes.

CONCLUSION : Adénocarcinome du colon stade IV avec métastases hépatiques et pulmonaires.
"""


# Create a toy dataframe
df = pd.DataFrame.from_records(
    [
        {
            "person_id": 1,
            "note_id": 1,
            "note_text": text,
            "context_dates": context_dates,
        }
    ]
)
df
```

### Define the nlp pipeline
```python
import edsnlp.pipes as eds

nlp = edsnlp.blank("eds")

nlp.add_pipe(eds.sentences())
nlp.add_pipe(eds.normalizer())
nlp.add_pipe(eds.dates())


nlp.add_pipe(
    ContextualQualifier(
        span_getter="dates",
        patterns={
            "lf1": {
                "Magnetic resonance imaging (procedure)": ClassPatternsContext(
                    terms={"irm": ["IRM", "imagerie par résonance magnétique"]},
                    regex=None,
                    context_words=0,
                    context_sents=1,
                    attr="TEXT",
                )
            },
            "lf2": {
                "Biopsy (procedure)": {
                    "regex": {"biopsy": ["biopsie", "biopsié"]},
                    "context_words": (10, 10),
                    "context_sents": 0,
                    "attr": "TEXT",
                }
            },
            "lf3": {
                "Surgical procedure (procedure)": {
                    "regex": {"chirurgie": ["chirurgie", "exerese", "opere"]},
                    "context_words": 0,
                    "context_sents": (2, 2),
                    "attr": "NORM",
                },
            },
        },
    )
)

nlp.add_pipe(
    ExternalInformationQualifier(
        nlp=nlp,
        span_getter="dates",
        external_information={
            "lf4": ExternalInformation(
                doc_attr="_.context_dates",
                span_attribute="_.date.to_datetime()",
                threshold=datetime.timedelta(days=0),
            )
        },
    )
)
```
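
Each labeling function above checks, for every date span, whether one of its patterns occurs within a window of `context_words` tokens and/or `context_sents` sentences around the span. As a rough mental model, the word-window variant behaves like the sketch below (illustrative only, with a hypothetical helper name; the actual `ContextualQualifier` implementation differs):

```python
import re


def toy_contextual_qualify(span, class_regexes, context_words=(10, 10)):
    """Illustrative sketch: return the first class whose regexes match
    within `context_words` tokens before/after the span, else None."""
    doc = span.doc
    left, right = context_words
    window = doc[max(span.start - left, 0) : min(span.end + right, len(doc))]
    for class_name, regexes in class_regexes.items():
        if any(re.search(rx, window.text, re.IGNORECASE) for rx in regexes):
            return class_name
    return None
```

With `lf2`, for instance, a date is tagged `Biopsy (procedure)` as soon as "biopsie" or "biopsié" appears within 10 tokens on either side of it.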

### Apply the pipeline to texts
```python
doc_iterator = edsnlp.data.from_pandas(
    df, converter="omop", doc_attributes=["context_dates"]
)

docs = list(nlp.pipe(doc_iterator))
```

### Let's inspect the results
```python
doc = docs[0]
dates = doc.spans["dates"]

for date in dates:
    for attr in ["lf1", "lf2", "lf3", "lf4"]:
        value = get_deep_attr(date, "_." + attr)

        if value:
            print(date.start, date.end, date, attr, value)
```

```python
# Out : 120 125 15/02/2024 lf1 Magnetic resonance imaging (procedure)
# Out : 120 125 15/02/2024 lf4 ['Magnetic resonance imaging (procedure)']
# Out : 147 152 17/02/2024 lf2 Biopsy (procedure)
# Out : 147 152 17/02/2024 lf4 ['Biopsy (procedure)', 'Colonoscopy (procedure)']
# Out : 168 173 18/02/2024 lf2 Biopsy (procedure)
# Out : 192 197 20/02/2024 lf3 Surgical procedure (procedure)
```
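
The `lf4` values show what the `ExternalInformationQualifier` does: each date span is compared with the document-level `context_dates`, and every external item whose value lies within `threshold` of the span's value contributes its class. A minimal sketch of this matching (illustrative only, with a hypothetical helper name) could be:

```python
import datetime


def toy_external_qualify(span_value, context_items, threshold):
    """Illustrative sketch: collect the classes of all external items
    whose value is within `threshold` of the span's own value."""
    return [
        item["class"]
        for item in context_items
        if abs(span_value - item["value"]) <= threshold
    ]


# With threshold=timedelta(days=0), the 17/02/2024 span matches both
# context items dated 2024-02-17, hence the two classes printed above.
toy_external_qualify(
    datetime.datetime(2024, 2, 17), context_dates, datetime.timedelta(days=0)
)
# ['Biopsy (procedure)', 'Colonoscopy (procedure)']
```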
8 changes: 8 additions & 0 deletions docs/pipes/qualifiers/contextual.md
@@ -0,0 +1,8 @@
# Contextual {: #edsnlp.pipes.qualifiers.contextual.factory.create_component }

::: edsnlp.pipes.qualifiers.contextual.factory.create_component
    options:
        heading_level: 2
        show_bases: true
        show_source: true
        only_class_level: true
8 changes: 8 additions & 0 deletions docs/pipes/qualifiers/external_information.md
@@ -0,0 +1,8 @@
# External Information {: #edsnlp.pipes.qualifiers.external_information.factory.create_component }

::: edsnlp.pipes.qualifiers.external_information.factory.create_component
    options:
        heading_level: 2
        show_bases: true
        show_source: true
        only_class_level: true
23 changes: 19 additions & 4 deletions docs/tutorials/training-ner.md
@@ -115,7 +115,7 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li

# 🎛️ OPTIMIZER
optimizer:
"@core": optimizer
"@core": optimizer !draft # (2)!
optim: adamw
groups:
# Assign parameters starting with transformer (ie the parameters of the transformer component)
@@ -133,7 +133,6 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
"warmup_rate": 0.1
"start_value": 3e-4
"max_value": 3e-4
module: ${ nlp }
total_steps: ${ train.max_steps }

# 📚 DATA
@@ -216,6 +215,14 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
1. Why do we use `'@core': pipeline` here ? Because we need the reference used in `optimizer.module = ${ nlp }` to be the actual Pipeline and not its keyword arguments : when confit sees `'@core': pipeline`, it will instantiate the `Pipeline` class with the arguments provided in the dict.

In fact, you could also use `'@core': eds.pipeline` in every config when you define a pipeline, but sometimes it's more convenient to let Confit infer the type of the nlp argument based on the function when it's type hinted. Not specifying `'@core': pipeline` is also more aligned with `spacy`'s pipeline config API. However, in general, explicit is better than implicit, so feel free to explicitly write `'@core': eds.pipeline` when you define a pipeline.
1. What does "draft" mean here ? We'll let the train function pass the nlp object
to the optimizer after it has been `post_init`'ed : `post_init` is the operation that
looks at some data, finds how many labels the model must learn, and updates the model weights
to have as many heads as there are labels observed in the train data. This function will be
called by `train`, so the optimizer should be defined *after*, when the model parameter
tensors are final. To do that, instead of instantiating the optimizer right now, we create
a "Draft", which will be instantiated inside the `train` function, once all the required
parameters are set.

To train the model, you can use the following command:

@@ -277,9 +284,8 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li

# 🎛️ OPTIMIZER
max_steps = 2000
optimizer = ScheduledOptimizer(
optimizer = ScheduledOptimizer.draft( # (1)!
optim=torch.optim.Adam,
module=nlp,
total_steps=max_steps,
groups=[
{
@@ -333,6 +339,15 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
)
```

1. Wait, what does "draft" mean here ? We'll let the train function pass the nlp object
to the optimizer after it has been `post_init`'ed : `post_init` is the operation that
looks at some data, finds how many labels the model must learn, and updates the model weights
to have as many heads as there are labels observed in the train data. This function will be
called by `train`, so the optimizer should be defined *after*, when the model parameter
tensors are final. To do that, instead of instantiating the optimizer right now, we create
a "Draft", which will be instantiated inside the `train` function, once all the required
parameters are set (see the sketch just below).
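
The draft mechanism boils down to deferred instantiation. A simplified sketch (not confit's actual implementation) could look like this:

```{ .python .no-check }
class Draft:
    """Simplified sketch: remember the class and the known kwargs now,
    and build the real object later, once the missing ones are known."""

    def __init__(self, cls, **kwargs):
        self.cls = cls
        self.kwargs = kwargs

    def instantiate(self, **late_kwargs):
        # `train` calls this once `module=nlp` exists and has been post_init'ed
        return self.cls(**self.kwargs, **late_kwargs)
```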

or use the config file:

```{ .python .no-check }
30 changes: 16 additions & 14 deletions docs/tutorials/training-span-classifier.md
@@ -184,13 +184,14 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
```

1. Put entities extracted by `eds.dates` in `doc.ents`, instead of `doc.spans['dates']`.
2. Wait, what's does "draft" mean here ? The rationale is this: we don't want to
instantiate the optimizer now, because the nlp object hasn't been `post_init`'ed
yet : `post_init` is the operation that looks at some data, finds how many labels the model must learn,
and updates the model weights to have as many heads as there are labels. This function will
be called by `train`, so the optimizer should be defined *after*, when the model parameter tensors are
final. To do that, instead of instantiating the optimizer, we create a "Draft", which will be
instantiated inside the `train` function, once all the required parameters are set.
2. What does "draft" mean here ? We'll let the train function pass the nlp object
to the optimizer after it has been `post_init`'ed : `post_init` is the operation that
looks at some data, finds how many labels the model must learn, and updates the model weights
to have as many heads as there are labels observed in the train data. This function will be
called by `train`, so the optimizer should be defined *after*, when the model parameter
tensors are final. To do that, instead of instantiating the optimizer right now, we create
a "Draft", which will be instantiated inside the `train` function, once all the required
parameters are set.

And train the model:

@@ -309,13 +310,14 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
```

1. Put entities extracted by `eds.dates` in `doc.ents`, instead of `doc.spans['dates']`.
2. Wait, what's does "draft" mean here ? The rationale is this: we don't want to
instantiate the optimizer now, because the nlp object hasn't been `post_init`'ed
yet : `post_init` is the operation that looks at some data, finds how many label the model must learn,
and updates the model weights to have as many heads as there are labels. This function will
be called by `train`, so the optimizer should be defined *after*, when the model parameter tensors are
final. To do that, instead of instantiating the optimizer, we create a "Draft", which will be
instantiated inside the `train` function, once all the required parameters are set.
2. What does "draft" mean here ? We'll let the train function pass the nlp object
to the optimizer after it has been `post_init`'ed : `post_init` is the operation that
looks at some data, finds how many labels the model must learn, and updates the model weights
to have as many heads as there are labels observed in the train data. This function will be
called by `train`, so the optimizer should be defined *after*, when the model parameter
tensors are final. To do that, instead of instantiating the optimizer right now, we create
a "Draft", which will be instantiated inside the `train` function, once all the required
parameters are set.


!!! note "Upstream annotations at training vs inference time"
2 changes: 1 addition & 1 deletion edsnlp/core/pipeline.py
@@ -338,7 +338,7 @@ def get_pipe_meta(self, name: str) -> FactoryMeta:
Dict[str, Any]
"""
pipe = self.get_pipe(name)
return PIPE_META.get(pipe, {})
return PIPE_META.get(pipe, FactoryMeta([], [], False, {}))

def make_doc(self, text: str) -> Doc:
"""
2 changes: 2 additions & 0 deletions edsnlp/metrics/span_attribute.py
@@ -121,6 +121,8 @@ def span_attribute_metric(
continue
getter_key = attr if attr.startswith("_.") else f"_.{attr}"
value = BINDING_GETTERS[getter_key](span)
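# if the value is a dict (e.g. label -> score), keep the highest-scoring label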
if isinstance(value, dict):
value = max(value, key=value.get)
if (value or include_falsy) and default_values[attr] != value:
labels[micro_key][1].add((eg_idx, beg, end, attr, value))
labels[attr][1].add((eg_idx, beg, end, attr, value))
2 changes: 2 additions & 0 deletions edsnlp/pipes/__init__.py
@@ -74,6 +74,8 @@
from .qualifiers.negation.factory import create_component as negation
from .qualifiers.reported_speech.factory import create_component as reported_speech
from .qualifiers.reported_speech.factory import create_component as rspeech
from .qualifiers.contextual.factory import create_component as contextual_qualifier
from .qualifiers.external_information.factory import create_component as external_information_qualifier
from .trainable.ner_crf.factory import create_component as ner_crf
from .trainable.biaffine_dep_parser.factory import create_component as biaffine_dep_parser
from .trainable.extractive_qa.factory import create_component as extractive_qa
Empty file.