Skip to content

Add pipeline to add NER annotations from ParlaMint to ES index #1681

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/addcorpus/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class MappingType(Enum):
FLOAT = 'float'
BOOLEAN = 'boolean'
GEO_POINT = 'geo_point'
ANNOTATED_TEXT = 'annotated_text'


class VisualizationType(Enum):
Expand Down
4 changes: 4 additions & 0 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,7 @@ def bool_mapping():

def geo_mapping():
    """Return the Elasticsearch mapping definition for a `geo_point` field."""
    mapping = dict(type='geo_point')
    return mapping


def annotated_text_mapping():
    """Return the Elasticsearch mapping definition for an `annotated_text` field."""
    mapping = dict(type='annotated_text')
    return mapping
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,29 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='corpusdocumentationpage',
name='type',
field=models.CharField(choices=[('general', 'General information'), ('citation', 'Citation'), ('license', 'License'), ('terms_of_service', 'Terms of service'), ('wordmodels', 'Word models')], default='general', help_text='the type of documentation', max_length=16),
field=models.CharField(
choices=[
('general', 'General information'),
('citation', 'Citation'),
('license', 'License'),
('terms_of_service', 'Terms of service'),
('wordmodels', 'Word models'),
],
default='general',
help_text='the type of documentation',
max_length=16,
),
),
migrations.AlterField(
model_name='field',
name='name',
field=models.SlugField(help_text='internal name for the field', max_length=126, validators=[addcorpus.validation.creation.validate_name_is_not_a_route_parameter, addcorpus.validation.creation.validate_name_has_no_ner_suffix]),
field=models.SlugField(
help_text='internal name for the field',
max_length=126,
validators=[
addcorpus.validation.creation.validate_name_is_not_a_route_parameter,
addcorpus.validation.creation.validate_ner_slug,
],
),
),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 4.2.15 on 2024-11-06 08:33

import addcorpus.validation.creation
from django.db import migrations, models


class Migration(migrations.Migration):
    """
    Redeclare the `name` column of the `field` model as a slug field whose
    only validator is `validate_name_is_not_a_route_parameter`.
    """

    # NOTE(review): the model change shown alongside this migration declares
    # `Field.name` as a CharField with an additional character validator;
    # confirm that a later migration brings the schema state in line.

    dependencies = [
        ('addcorpus', '0023_alter_corpusdocumentationpage_type_alter_field_name'),
    ]

    operations = [
        migrations.AlterField(
            model_name='field',
            name='name',
            field=models.SlugField(
                help_text='internal name for the field',
                max_length=126,
                validators=[
                    addcorpus.validation.creation.validate_name_is_not_a_route_parameter,
                ],
            ),
        ),
    ]
26 changes: 18 additions & 8 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,19 @@

from addcorpus.constants import CATEGORIES, MappingType, VisualizationType
from addcorpus.validation.creation import (
validate_es_mapping, validate_field_language, validate_implication, validate_language_code,
validate_es_mapping,
validate_field_language,
validate_implication,
validate_language_code,
validate_mimetype,
validate_name_is_not_a_route_parameter, validate_name_has_no_ner_suffix,
validate_search_filter, validate_search_filter_with_mapping,
validate_field_name_permissible_characters,
validate_name_is_not_a_route_parameter,
validate_ner_slug,
validate_search_filter,
validate_search_filter_with_mapping,
validate_searchable_field_has_full_text_search,
validate_sort_configuration, validate_visualizations_with_mapping,
validate_sort_configuration,
validate_visualizations_with_mapping,
validate_source_data_directory,
)
from addcorpus.validation.indexing import (validate_essential_fields,
Expand Down Expand Up @@ -272,7 +279,7 @@ def has_named_entities(self):
try:
# we check if any fields exist for filtering named entities
ner_exists = client.search(
index=self.es_index, query={"exists": {"field": "ner:*"}}, size=0
index=self.es_index, query={"exists": {"field": "*:ner-kw"}}, size=0
)
if total_hits(ner_exists):
return True
Expand Down Expand Up @@ -311,10 +318,12 @@ def has_named_entities(self):


class Field(models.Model):
name = models.SlugField(
name = models.CharField(
max_length=MAX_LENGTH_NAME,
validators=[validate_name_is_not_a_route_parameter,
validate_name_has_no_ner_suffix],
validators=[
validate_name_is_not_a_route_parameter,
validate_field_name_permissible_characters,
],
help_text='internal name for the field',
)
corpus_configuration = models.ForeignKey(
Expand Down Expand Up @@ -426,6 +435,7 @@ def __str__(self) -> str:

def clean(self):
validate_searchable_field_has_full_text_search(self.es_mapping, self.searchable)
validate_ner_slug(self.es_mapping, self.name)

if self.search_filter:
validate_search_filter_with_mapping(self.es_mapping, self.search_filter)
Expand Down
28 changes: 27 additions & 1 deletion backend/addcorpus/tests/test_validators.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import pytest
from addcorpus.models import Field
from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping, main_content_mapping, date_mapping
from addcorpus.es_mappings import (
annotated_text_mapping,
date_mapping,
int_mapping,
text_mapping,
keyword_mapping,
)
from addcorpus.validation.creation import *

def test_validate_mimetype():
Expand All @@ -9,6 +15,26 @@ def test_validate_mimetype():
with pytest.raises(ValidationError):
validate_mimetype('nonsense')


def test_validate_field_name_permissible_characters():
    """A name with a colon is accepted; one with spaces and punctuation is rejected."""
    accepted_name = "valid:slug"
    rejected_name = "some invalid slug!"

    # must not raise
    validate_field_name_permissible_characters(accepted_name)

    with pytest.raises(ValidationError):
        validate_field_name_permissible_characters(rejected_name)


def test_validate_ner_slug():
    """
    Names containing a colon are only valid as `:ner` on annotated_text
    mappings or as `:ner-kw` on keyword mappings.
    """
    rejected_cases = [
        ({}, "some:slug"),
        ({}, "some:ner_inslug"),
        (keyword_mapping(), "slug:ner"),
        (date_mapping(), "slug:ner-kw"),
    ]
    accepted_cases = [
        (annotated_text_mapping(), "slug:ner"),
        (keyword_mapping(), "slug:ner-kw"),
    ]

    for mapping, field_name in rejected_cases:
        with pytest.raises(ValidationError):
            validate_ner_slug(mapping, field_name)

    # none of these may raise
    for mapping, field_name in accepted_cases:
        validate_ner_slug(mapping, field_name)


def test_validate_es_mapping():
validate_es_mapping({'type': 'text'})

Expand Down
59 changes: 48 additions & 11 deletions backend/addcorpus/validation/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,37 @@

import mimetypes
import os
import re
import warnings

from addcorpus.constants import (FORBIDDEN_FIELD_NAMES, MappingType,
VisualizationType)
from addcorpus.python_corpora.filters import \
VALID_MAPPINGS as VALID_SEARCH_FILTER_MAPPINGS
from django.core.exceptions import ValidationError
from addcorpus.es_mappings import primary_mapping_type
from langcodes import tag_is_valid

from addcorpus.constants import (FORBIDDEN_FIELD_NAMES, MappingType,
VisualizationType)
from addcorpus.python_corpora.filters import (
VALID_MAPPINGS as VALID_SEARCH_FILTER_MAPPINGS,
)
from addcorpus.es_mappings import primary_mapping_type


def supports_full_text_search(es_mapping):
    """
    Tell whether a field with this mapping can be searched as full text.

    That is the case when the mapping declares a `text` multifield, or when
    `_is_text` accepts its primary mapping type (so `annotated_text` fields
    count as well as plain `text` fields).
    """
    # the multifield lookup is a plain dict access, so try it first
    has_text_multifield = 'text' in es_mapping.get('fields', {})
    return has_text_multifield or _is_text(es_mapping)


def _is_text(es_mapping):
    """Tell whether the primary mapping type is `text` or `annotated_text`."""
    text_types = [MappingType.TEXT.value, MappingType.ANNOTATED_TEXT.value]
    mapping_type = primary_mapping_type(es_mapping)
    return mapping_type in text_types


def is_geo_field(es_mapping):
    """Tell whether the primary mapping type is `geo_point`."""
    mapping_type = primary_mapping_type(es_mapping)
    return mapping_type == MappingType.GEO_POINT.value

def supports_aggregation(es_mapping):
    """
    Tell whether a field with this mapping can be aggregated on.

    Every mapping is considered aggregatable except those `_is_text`
    recognises, i.e. both `text` and `annotated_text` are excluded.
    """
    return not _is_text(es_mapping)

def validate_language_code(value):
'''
Expand Down Expand Up @@ -123,12 +132,40 @@ def validate_name_is_not_a_route_parameter(value):
)


def validate_name_has_no_ner_suffix(value):
if value.endswith(':ner'):
def validate_field_name_permissible_characters(slug: str) -> None:
    """
    Reject names which contain characters other than colons, hyphens,
    underscores or alphanumerics.

    The whole name has to match, so an empty name or a name that ends in a
    line break is rejected too.

    Raises:
        ValidationError: if `slug` contains any other character or is empty.
    """
    # fullmatch instead of match + `$`: `$` would tolerate a trailing "\n"
    if not re.fullmatch(r"[\w:-]+", slug):
        raise ValidationError(
            f"{slug} is not valid: it should consist of no other characters than letters, numbers, underscores, hyphens or colons"
        )


def validate_ner_slug(es_mapping: dict, name: str):
    """
    Enforce the naming rules for field names that contain a colon.

    Names without a colon are always accepted. A name with a colon raises a
    ValidationError unless it meets one of the following requirements:
    - ends with `:ner` suffix and is an annotated_text field
    - ends with `:ner-kw` suffix and is a keyword field
    """
    if ":" not in name:
        return

    if name.endswith(":ner"):
        required_type = MappingType.ANNOTATED_TEXT.value
        message = f"{name} cannot be used as a field name: the suffix `:ner` is reserved for annotated_text fields"
    elif name.endswith(":ner-kw"):
        required_type = MappingType.KEYWORD.value
        message = f"{name} cannot be used as a field name: the suffix `:ner-kw` is reserved for Named Entity keyword fields"
    else:
        # any other use of a colon is refused regardless of the mapping
        raise ValidationError(
            f"{name} cannot be used as a field name: colons are reserved for special (named entity related) fields"
        )

    if primary_mapping_type(es_mapping) != required_type:
        raise ValidationError(message)


def mapping_can_be_searched(es_mapping):
'''
Verify if a mapping is appropriate for searching
Expand Down
7 changes: 6 additions & 1 deletion backend/corpora/parliament/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def parliament_corpora_settings(settings):
"debate_id": "ParlaMint-NL_2017-01-31-tweedekamer-23",
"topic": 'Rapport "Welvaart in kaart"',
"speech": "Ik heet de minister van Economische Zaken van harte welkom.",
"speech:ner": "Ik heet de minister van [Economische Zaken](ORG) van harte welkom.",
"id": "ParlaMint-NL_2017-01-31-tweedekamer-23.u1",
"speaker": "Khadija Arib",
"speaker_id": "#KhadijaArib",
Expand All @@ -283,9 +284,13 @@ def parliament_corpora_settings(settings):
"page": None,
"url": None,
"sequence": 1,
"location:ner-kw": [],
"miscellaneous:ner-kw": [],
"organization:ner-kw": ["Economische Zaken"],
"person:ner-kw": [],
}
],
"n_documents": 98,
"n_documents": 2,
"start": datetime(2015, 1, 1),
},
{
Expand Down
7 changes: 1 addition & 6 deletions backend/corpora/parliament/ireland.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,10 @@
from corpora.parliament.parliament import Parliament
import corpora.parliament.utils.field_defaults as field_defaults
import corpora.utils.formatting as formatting
from corpora.utils.filter_sources import in_date_range
import corpora.parliament.utils.parlamint as parlamint


def in_date_range(corpus, start, end):
    """
    Tell whether the period from `start` to `end` overlaps the date range of
    `corpus` (`corpus.min_date` up to and including `corpus.max_date`).

    A falsy `start` falls back to `corpus.min_date`, a falsy `end` to
    `corpus.max_date`.
    """
    lower_bound = start if start else corpus.min_date
    upper_bound = end if end else corpus.max_date

    return lower_bound <= corpus.max_date and upper_bound >= corpus.min_date

def format_mininster_role(position, department):
'''Format 1919-2013 minister positions analogous to the 2014-2020 positions'''

Expand Down
Loading