CentreForDigitalHumanities · BeritJanssen · Nov 21, 2024 · Oct 3, 2024 · Oct 17, 2024 · Oct 17, 2024
diff --git a/backend/addcorpus/constants.py b/backend/addcorpus/constants.py
@@ -26,6 +26,7 @@ class MappingType(Enum):
     FLOAT = 'float'
     BOOLEAN = 'boolean'
     GEO_POINT = 'geo_point'
+    ANNOTATED_TEXT = 'annotated_text'
 
 
 class VisualizationType(Enum):

diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py
@@ -98,3 +98,7 @@ def bool_mapping():
 
 def geo_mapping():
     return {'type': 'geo_point'}
+
+
+def annotated_text_mapping():  # Elasticsearch mapping for fields of type annotated_text (same string as MappingType.ANNOTATED_TEXT)
+    return {'type': 'annotated_text'}  # NOTE(review): presumably requires the mapper-annotated-text plugin on the cluster — confirm
diff --git a/backend/addcorpus/migrations/0023_alter_corpusdocumentationpage_type_alter_field_name.py b/backend/addcorpus/migrations/0023_alter_corpusdocumentationpage_type_alter_field_name.py
@@ -14,11 +14,29 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='corpusdocumentationpage',
             name='type',
-            field=models.CharField(choices=[('general', 'General information'), ('citation', 'Citation'), ('license', 'License'), ('terms_of_service', 'Terms of service'), ('wordmodels', 'Word models')], default='general', help_text='the type of documentation', max_length=16),
+            field=models.CharField(
+                choices=[
+                    ('general', 'General information'),
+                    ('citation', 'Citation'),
+                    ('license', 'License'),
+                    ('terms_of_service', 'Terms of service'),
+                    ('wordmodels', 'Word models'),
+                ],
+                default='general',
+                help_text='the type of documentation',
+                max_length=16,
+            ),
         ),
         migrations.AlterField(
             model_name='field',
             name='name',
-            field=models.SlugField(help_text='internal name for the field', max_length=126, validators=[addcorpus.validation.creation.validate_name_is_not_a_route_parameter, addcorpus.validation.creation.validate_name_has_no_ner_suffix]),
+            field=models.SlugField(
+                help_text='internal name for the field',
+                max_length=126,
+                validators=[
+                    addcorpus.validation.creation.validate_name_is_not_a_route_parameter,
+                    addcorpus.validation.creation.validate_ner_slug,
+                ],
+            ),
         ),
     ]
diff --git a/backend/addcorpus/migrations/0024_remove_field_name_ner_validator.py b/backend/addcorpus/migrations/0024_remove_field_name_ner_validator.py
@@ -0,0 +1,19 @@
+# Generated by Django 4.2.15 on 2024-11-06 08:33
+
+import addcorpus.validation.creation
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # re-declares Field.name with only the route-parameter validator
+
+    dependencies = [
+        ('addcorpus', '0023_alter_corpusdocumentationpage_type_alter_field_name'),
+    ]
+
+    operations = [
+        migrations.AlterField(  # NOTE(review): models.py in this change set declares `name` as a CharField with an extra character validator; presumably a later migration covers that — confirm
+            model_name='field',
+            name='name',
+            field=models.SlugField(help_text='internal name for the field', max_length=126, validators=[addcorpus.validation.creation.validate_name_is_not_a_route_parameter]),
+        ),
+    ]
diff --git a/backend/addcorpus/models.py b/backend/addcorpus/models.py
@@ -9,12 +9,19 @@
 
 from addcorpus.constants import CATEGORIES, MappingType, VisualizationType
 from addcorpus.validation.creation import (
-    validate_es_mapping, validate_field_language, validate_implication, validate_language_code,
+    validate_es_mapping,
+    validate_field_language,
+    validate_implication,
+    validate_language_code,
     validate_mimetype,
-    validate_name_is_not_a_route_parameter, validate_name_has_no_ner_suffix,
-    validate_search_filter, validate_search_filter_with_mapping,
+    validate_field_name_permissible_characters,
+    validate_name_is_not_a_route_parameter,
+    validate_ner_slug,
+    validate_search_filter,
+    validate_search_filter_with_mapping,
     validate_searchable_field_has_full_text_search,
-    validate_sort_configuration, validate_visualizations_with_mapping,
+    validate_sort_configuration,
+    validate_visualizations_with_mapping,
     validate_source_data_directory,
 )
 from addcorpus.validation.indexing import (validate_essential_fields,
@@ -272,7 +279,7 @@ def has_named_entities(self):
         try:
             # we check if any fields exist for filtering named entities
             ner_exists = client.search(
-                index=self.es_index, query={"exists": {"field": "ner:*"}}, size=0
+                index=self.es_index, query={"exists": {"field": "*:ner-kw"}}, size=0
             )
             if total_hits(ner_exists):
                 return True
@@ -311,10 +318,12 @@ def has_named_entities(self):
 
 
 class Field(models.Model):
-    name = models.SlugField(
+    name = models.CharField(
         max_length=MAX_LENGTH_NAME,
-        validators=[validate_name_is_not_a_route_parameter,
-                    validate_name_has_no_ner_suffix],
+        validators=[
+            validate_name_is_not_a_route_parameter,
+            validate_field_name_permissible_characters,
+        ],
         help_text='internal name for the field',
     )
     corpus_configuration = models.ForeignKey(
@@ -426,6 +435,7 @@ def __str__(self) -> str:
 
     def clean(self):
         validate_searchable_field_has_full_text_search(self.es_mapping, self.searchable)
+        validate_ner_slug(self.es_mapping, self.name)
 
         if self.search_filter:
             validate_search_filter_with_mapping(self.es_mapping, self.search_filter)

diff --git a/backend/addcorpus/tests/test_validators.py b/backend/addcorpus/tests/test_validators.py
@@ -1,6 +1,12 @@
 import pytest
 from addcorpus.models import Field
-from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping, main_content_mapping, date_mapping
+from addcorpus.es_mappings import (
+    annotated_text_mapping,
+    date_mapping,
+    int_mapping,
+    text_mapping,
+    keyword_mapping,
+)
 from addcorpus.validation.creation import *
 
 def test_validate_mimetype():
@@ -9,6 +15,26 @@ def test_validate_mimetype():
     with pytest.raises(ValidationError):
         validate_mimetype('nonsense')
 
+
+def test_validate_field_name_permissible_characters():
+    validate_field_name_permissible_characters("valid:slug")  # colons are permitted characters
+    with pytest.raises(ValidationError):
+        validate_field_name_permissible_characters("some invalid slug!")  # spaces and "!" are not permitted
+
+
+def test_validate_ner_slug():
+    with pytest.raises(ValidationError):
+        validate_ner_slug({}, "some:slug")  # colon present but no reserved suffix
+    with pytest.raises(ValidationError):
+        validate_ner_slug({}, "some:ner_inslug")  # `:ner` must be the suffix, not merely appear in the name
+    with pytest.raises(ValidationError):
+        validate_ner_slug(keyword_mapping(), "slug:ner")  # `:ner` requires an annotated_text mapping
+    validate_ner_slug(annotated_text_mapping(), "slug:ner")  # valid: `:ner` with annotated_text
+    with pytest.raises(ValidationError):
+        validate_ner_slug(date_mapping(), "slug:ner-kw")  # `:ner-kw` requires a keyword mapping
+    validate_ner_slug(keyword_mapping(), "slug:ner-kw")  # valid: `:ner-kw` with keyword
+
+
 def test_validate_es_mapping():
     validate_es_mapping({'type': 'text'})
 

diff --git a/backend/addcorpus/validation/creation.py b/backend/addcorpus/validation/creation.py
@@ -4,28 +4,37 @@
 
 import mimetypes
 import os
+import re
 import warnings
 
-from addcorpus.constants import (FORBIDDEN_FIELD_NAMES, MappingType,
-                                 VisualizationType)
-from addcorpus.python_corpora.filters import \
-    VALID_MAPPINGS as VALID_SEARCH_FILTER_MAPPINGS
 from django.core.exceptions import ValidationError
-from addcorpus.es_mappings import primary_mapping_type
 from langcodes import tag_is_valid
 
+from addcorpus.constants import (FORBIDDEN_FIELD_NAMES, MappingType,
+                                 VisualizationType)
+from addcorpus.python_corpora.filters import (
+    VALID_MAPPINGS as VALID_SEARCH_FILTER_MAPPINGS,
+)
+from addcorpus.es_mappings import primary_mapping_type
 
 
 def supports_full_text_search(es_mapping):
-    is_text = primary_mapping_type(es_mapping) == MappingType.TEXT.value
     has_text_multifield = 'text' in es_mapping.get('fields', {})
-    return is_text or has_text_multifield
+    return _is_text(es_mapping) or has_text_multifield  # text-like primary type, or a 'text' multifield
+
+
+def _is_text(es_mapping):  # True when the primary mapping type is text or annotated_text
+    return primary_mapping_type(es_mapping) in [
+        MappingType.TEXT.value,
+        MappingType.ANNOTATED_TEXT.value,
+    ]
+
 
 def is_geo_field(es_mapping):
     return primary_mapping_type(es_mapping) == MappingType.GEO_POINT.value
 
 def supports_aggregation(es_mapping):
-    return primary_mapping_type(es_mapping) != MappingType.TEXT.value
+    return not _is_text(es_mapping)  # annotated_text is excluded alongside text
 
 def validate_language_code(value):
     '''
@@ -123,12 +132,40 @@ def validate_name_is_not_a_route_parameter(value):
         )
 
 
-def validate_name_has_no_ner_suffix(value):
-    if value.endswith(':ner'):
+def validate_field_name_permissible_characters(slug: str):
+    """
+    Raise ValidationError if the name contains anything other than letters, digits, underscores, hyphens or colons.
+    """
+    slug_re = re.compile(r"^[\w:-]+\Z")  # \Z rather than $: `$` would also accept a trailing newline
+    if not slug_re.match(slug):
         raise ValidationError(
-            f'{value} cannot be used as a field name: the suffix `:ner` is reserved for annotated_text fields'
+            f"{slug} is not valid: it should consist of no other characters than letters, numbers, underscores, hyphens or colons"
         )
 
+
+def validate_ner_slug(es_mapping: dict, name: str):
+    """
+    Validate field names that contain a colon (names without one always pass). Raises ValidationError unless the name
+    - ends with `:ner` and the primary mapping type is annotated_text, or
+    - ends with `:ner-kw` and the primary mapping type is keyword
+    """
+    if ":" in name:
+        if name.endswith(":ner"):
+            if primary_mapping_type(es_mapping) != MappingType.ANNOTATED_TEXT.value:
+                raise ValidationError(
+                    f"{name} cannot be used as a field name: the suffix `:ner` is reserved for annotated_text fields"
+                )
+        elif name.endswith(":ner-kw"):
+            if primary_mapping_type(es_mapping) != MappingType.KEYWORD.value:
+                raise ValidationError(
+                    f"{name} cannot be used as a field name: the suffix `:ner-kw` is reserved for Named Entity keyword fields"
+                )
+        else:
+            raise ValidationError(
+                f"{name} cannot be used as a field name: colons are reserved for special (named entity related) fields"
+            )
+
+
 def mapping_can_be_searched(es_mapping):
     '''
     Verify if a mapping is appropriate for searching

diff --git a/backend/corpora/parliament/conftest.py b/backend/corpora/parliament/conftest.py
@@ -272,6 +272,7 @@ def parliament_corpora_settings(settings):
                 "debate_id": "ParlaMint-NL_2017-01-31-tweedekamer-23",
                 "topic": 'Rapport "Welvaart in kaart"',
                 "speech": "Ik heet de minister van Economische Zaken van harte welkom.",
+                "speech:ner": "Ik heet de minister van [Economische Zaken](ORG) van harte welkom.",
                 "id": "ParlaMint-NL_2017-01-31-tweedekamer-23.u1",
                 "speaker": "Khadija Arib",
                 "speaker_id": "#KhadijaArib",
@@ -283,9 +284,13 @@ def parliament_corpora_settings(settings):
                 "page": None,
                 "url": None,
                 "sequence": 1,
+                "location:ner-kw": [],
+                "miscellaneous:ner-kw": [],
+                "organization:ner-kw": ["Economische Zaken"],
+                "person:ner-kw": [],
             }
         ],
-        "n_documents": 98,
+        "n_documents": 2,
         "start": datetime(2015, 1, 1),
     },
     {

diff --git a/backend/corpora/parliament/ireland.py b/backend/corpora/parliament/ireland.py
@@ -14,15 +14,10 @@
 from corpora.parliament.parliament import Parliament
 import corpora.parliament.utils.field_defaults as field_defaults
 import corpora.utils.formatting as formatting
+from corpora.utils.filter_sources import in_date_range
 import corpora.parliament.utils.parlamint as parlamint
 
 
-def in_date_range(corpus, start, end):
-    start_date = start or corpus.min_date
-    end_date = end or corpus.max_date
-
-    return start_date <= corpus.max_date and end_date >= corpus.min_date
-
 def format_mininster_role(position, department):
     '''Format 1919-2013 minister positions analogous to the 2014-2020 positions'''