Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Always add term vector to main content field and fix mappings in some parliamentary corpora #1677

Merged
merged 7 commits into from
Oct 17, 2024
14 changes: 5 additions & 9 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
def primary_mapping_type(es_mapping: Dict) -> str:
    '''
    Return the value stored under the `'type'` key of an Elasticsearch mapping.

    Parameters:
    - `es_mapping`: a mapping dictionary, e.g. `{'type': 'text', ...}`.

    Returns the value of `es_mapping['type']`. If the mapping has no `'type'`
    key, `None` is returned (note that the `str` annotation does not express
    this case).
    '''
    # dict.get already falls back to None when the key is missing
    return es_mapping.get('type')

def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):

def main_content_mapping(
token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None
):
'''
Mapping for the main content field. Options:

Expand All @@ -14,14 +17,7 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an
- `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
'''

mapping = {
'type': 'text'
}

if updated_highlighting:
mapping.update({
'term_vector': 'with_positions_offsets' # include char positions on _source (in addition to the multifields) for highlighting
})
mapping = {"type": "text", "term_vector": "with_positions_offsets"}

if any([token_counts, stopword_analysis, stemming_analysis]):
multifields = {}
Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/finland.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def sources(self, start, end):
speaker_birth_year = field_defaults.speaker_birth_year()
speaker_birth_year.extractor = person_attribute_extractor('birth_year')

speech = field_defaults.speech()
speech = field_defaults.speech(language="fi")
speech.extractor = XML(transform = clean_value)

speech_id = field_defaults.speech_id()
Expand Down
13 changes: 2 additions & 11 deletions backend/corpora/parliament/ireland.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from addcorpus.python_corpora.corpus import CorpusDefinition, CSVCorpusDefinition, XMLCorpusDefinition
from addcorpus.python_corpora.extract import Constant, CSV, XML, Metadata, Combined, Backup
from addcorpus.es_mappings import main_content_mapping
from corpora.parliament.parliament import Parliament
import corpora.parliament.utils.field_defaults as field_defaults
import corpora.utils.formatting as formatting
Expand Down Expand Up @@ -149,7 +150,6 @@ def sources(self, start, end):
source_archive = field_defaults.source_archive()
source_archive.extractor = Constant('1919-2013')


fields = [
date,
country,
Expand Down Expand Up @@ -495,17 +495,8 @@ def source2dicts(self, source):
speaker_id = field_defaults.speaker_id()
speaker_constituency = field_defaults.speaker_constituency()

speech = field_defaults.speech()
# no language-specific analysers since the corpus is mixed-language
speech.es_mapping = {
"type" : "text",
"fields": {
"length": {
"type": "token_count",
"analyzer": "standard"
}
}
}
speech = field_defaults.speech()

speech_id = field_defaults.speech_id()
topic = field_defaults.topic()
Expand Down
1 change: 0 additions & 1 deletion backend/corpora/parliament/utils/field_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ def speech(language=None):
stopword_analysis=has_language,
stemming_analysis=has_language,
language=language,
updated_highlighting=True
),
results_overview=True,
search_field_core=True,
Expand Down