diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py index 921870a65..f76021ef1 100644 --- a/backend/addcorpus/es_mappings.py +++ b/backend/addcorpus/es_mappings.py @@ -4,7 +4,10 @@ def primary_mapping_type(es_mapping: Dict) -> str: return es_mapping.get('type', None) -def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True): + +def main_content_mapping( + token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None +): ''' Mapping for the main content field. Options: @@ -14,14 +17,7 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an - `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'. ''' - mapping = { - 'type': 'text' - } - - if updated_highlighting: - mapping.update({ - 'term_vector': 'with_positions_offsets' # include char positions on _source (in addition to the multifields) for highlighting - }) + mapping = {"type": "text", "term_vector": "with_positions_offsets"} if any([token_counts, stopword_analysis, stemming_analysis]): multifields = {} diff --git a/backend/corpora/parliament/finland.py b/backend/corpora/parliament/finland.py index 8be053707..0d99aed0d 100644 --- a/backend/corpora/parliament/finland.py +++ b/backend/corpora/parliament/finland.py @@ -108,7 +108,7 @@ def sources(self, start, end): speaker_birth_year = field_defaults.speaker_birth_year() speaker_birth_year.extractor = person_attribute_extractor('birth_year') - speech = field_defaults.speech() + speech = field_defaults.speech(language="fi") speech.extractor = XML(transform = clean_value) speech_id = field_defaults.speech_id() diff --git a/backend/corpora/parliament/ireland.py b/backend/corpora/parliament/ireland.py index 3c06238a4..a76a47fd3 100644 --- a/backend/corpora/parliament/ireland.py +++ b/backend/corpora/parliament/ireland.py @@ -10,6 +10,7 @@ from addcorpus.python_corpora.corpus import CorpusDefinition, CSVCorpusDefinition, XMLCorpusDefinition from addcorpus.python_corpora.extract import Constant, CSV, XML, Metadata, Combined, Backup +from addcorpus.es_mappings import main_content_mapping from corpora.parliament.parliament import Parliament import corpora.parliament.utils.field_defaults as field_defaults import corpora.utils.formatting as formatting @@ -149,7 +150,6 @@ def sources(self, start, end): source_archive = field_defaults.source_archive() source_archive.extractor = Constant('1919-2013') - fields = [ date, country, @@ -495,17 +495,8 @@ def source2dicts(self, source): speaker_id = field_defaults.speaker_id() speaker_constituency = field_defaults.speaker_constituency() - speech = field_defaults.speech() # no language-specific analysers since the corpus is mixed-language - speech.es_mapping = { - "type" : "text", - "fields": { - "length": { - "type": "token_count", - "analyzer": "standard" - } - } - } + speech = field_defaults.speech() speech_id = field_defaults.speech_id() topic = field_defaults.topic() diff --git a/backend/corpora/parliament/utils/field_defaults.py b/backend/corpora/parliament/utils/field_defaults.py index 35dc4c651..9ee54d2ef 100644 --- a/backend/corpora/parliament/utils/field_defaults.py +++ b/backend/corpora/parliament/utils/field_defaults.py @@ -289,7 +289,6 @@ def speech(language=None): stopword_analysis=has_language, stemming_analysis=has_language, language=language, - updated_highlighting=True ), results_overview=True, search_field_core=True,