From da9b04006564415844e2bd13ec7edd57b8013fdc Mon Sep 17 00:00:00 2001 From: "Jens W. Klein" Date: Fri, 1 Dec 2023 16:02:54 +0100 Subject: [PATCH] Move vocabulary and section handling to preprocessing and Get OPENSEARCH from c.e.ingest module --- CHANGES.rst | 5 ++- src/collective/elastic/ingest/__init__.py | 4 -- src/collective/elastic/ingest/client.py | 2 +- .../ingest/{ingest/__init__.py => ingest.py} | 27 +++++------ .../elastic/ingest/ingest/section.py | 14 ------ .../elastic/ingest/ingest/vocabularyfields.py | 15 ------- src/collective/elastic/ingest/mapping.py | 2 +- .../elastic/ingest/preprocessing.py | 45 +++++++++++++++++++ .../elastic/ingest/preprocessings.json | 4 ++ 9 files changed, 65 insertions(+), 53 deletions(-) rename src/collective/elastic/ingest/{ingest/__init__.py => ingest.py} (85%) delete mode 100644 src/collective/elastic/ingest/ingest/section.py delete mode 100644 src/collective/elastic/ingest/ingest/vocabularyfields.py diff --git a/CHANGES.rst b/CHANGES.rst index 77aadef..4014e4f 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,7 +6,10 @@ Changelog - Fix: Plone schema caching logic was broken. [jensens] - Fix: Remove unnecessary ``batching`` on preprocessing. [jensens] -- Fix: Add missigit sng ``zope.schema._field.Set`` to examples mapping. [jensens] +- Fix: Add missing ``zope.schema._field.Set`` to examples mapping. [jensens] +- Enhancement: Move vocabulary and section handling to preprocessing. + Turn ``ingest`` package into module. [jensens] +- Fix: Get OPENSEARCH from c.e.ingest module and do not dup here. 
[jensens] 2.0.0rc4 (2023-11-28) --------------------- diff --git a/src/collective/elastic/ingest/__init__.py b/src/collective/elastic/ingest/__init__.py index cf5b831..e69de29 100644 --- a/src/collective/elastic/ingest/__init__.py +++ b/src/collective/elastic/ingest/__init__.py @@ -1,4 +0,0 @@ -import os - - -OPENSEARCH = os.environ.get("INDEX_OPENSEARCH") == "1" diff --git a/src/collective/elastic/ingest/client.py b/src/collective/elastic/ingest/client.py index b0624d4..74a5915 100644 --- a/src/collective/elastic/ingest/client.py +++ b/src/collective/elastic/ingest/client.py @@ -1,5 +1,5 @@ -from . import OPENSEARCH from .logging import logger +from collective.elastic.ingest import OPENSEARCH import os import threading diff --git a/src/collective/elastic/ingest/ingest/__init__.py b/src/collective/elastic/ingest/ingest.py similarity index 85% rename from src/collective/elastic/ingest/ingest/__init__.py rename to src/collective/elastic/ingest/ingest.py index 6fff56b..bbf3c33 100644 --- a/src/collective/elastic/ingest/ingest/__init__.py +++ b/src/collective/elastic/ingest/ingest.py @@ -1,14 +1,12 @@ -from .. 
import OPENSEARCH -from ..client import get_client -from ..logging import logger -from ..mapping import create_or_update_mapping -from ..mapping import EXPANSION_FIELDS -from ..mapping import get_field_map -from ..mapping import iterate_schema -from ..postprocessing import postprocess -from ..preprocessing import preprocess -from .section import enrichWithSection -from .vocabularyfields import stripVocabularyTermTitles +from .client import get_client +from .logging import logger +from .mapping import create_or_update_mapping +from .mapping import EXPANSION_FIELDS +from .mapping import get_field_map +from .mapping import iterate_schema +from .postprocessing import postprocess +from .preprocessing import preprocess +from collective.elastic.ingest import OPENSEARCH from pprint import pformat import time @@ -84,13 +82,8 @@ def ingest(content, full_schema, index_name): then postprocess and finally index the content. """ - logger.debug(f"Process content: {pformat(content)}") start = time.time() - - # special preprocessing logic for section and vocabulary fields - # TODO: refactor as special preprocessing - enrichWithSection(content) - stripVocabularyTermTitles(content) + logger.debug(f"Process content: {pformat(content)}") # generic preprocessing accrording to rule in preprocessings.json preprocess(content, full_schema) diff --git a/src/collective/elastic/ingest/ingest/section.py b/src/collective/elastic/ingest/ingest/section.py deleted file mode 100644 index 3511a08..0000000 --- a/src/collective/elastic/ingest/ingest/section.py +++ /dev/null @@ -1,14 +0,0 @@ -import os - - -def enrichWithSection(content): - base = "/".join( - [ - str(os.environ.get("PLONE_SERVICE")), - str(os.environ.get("PLONE_PATH")), - ] - ).strip("/") - content_url = content["@id"] - path = content_url.replace(base, "") - content["section"] = path.split("/")[1] if len(path.split("/")) > 1 else "__root__" - return content diff --git a/src/collective/elastic/ingest/ingest/vocabularyfields.py 
b/src/collective/elastic/ingest/ingest/vocabularyfields.py deleted file mode 100644 index 4d596bc..0000000 --- a/src/collective/elastic/ingest/ingest/vocabularyfields.py +++ /dev/null @@ -1,15 +0,0 @@ -def stripVocabularyTermTitles(content): - """If field with vocabulary: Convert field value to token or list of tokens.""" - for fieldname in content.keys(): - if type(content[fieldname]) is dict: - if sorted(list(content[fieldname].keys())) == ["title", "token"]: - content[fieldname] = content[fieldname]["token"] - - if type(content[fieldname]) is list: - if ( - len(content[fieldname]) > 0 - and type(content[fieldname][0]) is dict - and sorted(list(content[fieldname][0].keys())) == ["title", "token"] - ): - content[fieldname] = [el["token"] for el in content[fieldname]] - return content diff --git a/src/collective/elastic/ingest/mapping.py b/src/collective/elastic/ingest/mapping.py index 685872a..623402d 100644 --- a/src/collective/elastic/ingest/mapping.py +++ b/src/collective/elastic/ingest/mapping.py @@ -1,7 +1,7 @@ -from . 
import OPENSEARCH from .analysis import get_analysis from .client import get_client from .logging import logger +from collective.elastic.ingest import OPENSEARCH from copy import deepcopy import json diff --git a/src/collective/elastic/ingest/preprocessing.py b/src/collective/elastic/ingest/preprocessing.py index 1fa7df5..41495ec 100644 --- a/src/collective/elastic/ingest/preprocessing.py +++ b/src/collective/elastic/ingest/preprocessing.py @@ -156,6 +156,51 @@ def action_empty_removal(content, full_schema, key): ACTION_FUNCTIONS["remove_empty"] = action_empty_removal +def action_strip_vocabulary_term_titles(content, full_schema): + """If field with vocabulary: Convert field value to token or list of tokens.""" + for fieldname, field in content.items(): + if isinstance(field, dict) and set(field.keys()) == {"title", "token"}: + content[fieldname] = field["token"] + elif ( + isinstance(field, list) + and len(field) > 0 + and isinstance(field[0], dict) + and set(field[0].keys()) == {"title", "token"} + ): + content[fieldname] = [el["token"] for el in field] + + +ACTION_FUNCTIONS["strip_vocabulary_term_titles"] = action_strip_vocabulary_term_titles + + +def action_enrich_with_section(content, fullschema): + """Add section to content. + + TODO: make this take the site root into account and remove this step (see below) + + At the moment this is not very generic. + It guesses the section based on the path element after the root. + + In fact, this element can be something else, e.g. a language root folder defining a subsite. + Same is possible with Lineage based subsites and so on. + + A solution would be to add an expansion in c.e.plone to add the site and section to the content and use this information. + Then this step can be deprecated and later on removed and a rewrite + additional schema would be enough. 
+ """ + base = "/".join( + [ + str(os.environ.get("PLONE_SERVICE")), + str(os.environ.get("PLONE_PATH")), + ] + ).strip("/") + content_url = content["@id"] + path = content_url.replace(base, "") + content["section"] = path.split("/")[1] if len(path.split("/")) > 1 else "__root__" + + +ACTION_FUNCTIONS["enrich_with_section"] = action_enrich_with_section + + def preprocess(content, full_schema): """run full preprocessing pipeline on content and schema""" for ppcfg in PREPROCESSOR_CONFIGS: diff --git a/src/collective/elastic/ingest/preprocessings.json b/src/collective/elastic/ingest/preprocessings.json index 2639e75..3ad7c4b 100644 --- a/src/collective/elastic/ingest/preprocessings.json +++ b/src/collective/elastic/ingest/preprocessings.json @@ -3,6 +3,10 @@ "comment": "Remove all empty fields.", "action": "remove_empty" }, + { + "comment": "Transform vocabularies to it token only form.", + "action": "strip_vocabulary_term_titles" + }, { "comment": "ProxyIndex needs this information, essential rewrite, do not remove", "action": "rewrite",