Move vocabulary and section handling to preprocessing and Get OPENSEARCH from c.e.ingest module
collective · Dec 1, 2023 · da9b040 · da9b040
1 parent 8c5f916
commit da9b040
Show file tree

Hide file tree

Showing 9 changed files with 65 additions and 53 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -6,7 +6,10 @@ Changelog
 
 - Fix: Plone schema caching logic was broken. [jensens]
 - Fix: Remove unnecessary ``batching`` on preprocessing. [jensens]
-- Fix: Add missigit sng ``zope.schema._field.Set`` to examples mapping. [jensens]
+- Fix: Add missing ``zope.schema._field.Set`` to examples mapping. [jensens]
+- Enhancement: Move vocabulary and section handling to preprocessing.
+  Turn ``ingest`` package into module. [jensens]
+- Fix: Get OPENSEARCH from c.e.ingest module and do not dup here. [jensens]
 
 2.0.0rc4 (2023-11-28)
 ---------------------

diff --git a/src/collective/elastic/ingest/__init__.py b/src/collective/elastic/ingest/__init__.py
@@ -1,4 +0,0 @@
-import os
-
-
-OPENSEARCH = os.environ.get("INDEX_OPENSEARCH") == "1"

diff --git a/src/collective/elastic/ingest/client.py b/src/collective/elastic/ingest/client.py
@@ -1,5 +1,5 @@
-from . import OPENSEARCH
 from .logging import logger
+from collective.elastic.ingest import OPENSEARCH
 
 import os
 import threading

diff --git a/src/collective/elastic/ingest/ingest/__init__.py b/src/collective/elastic/ingest/ingest.py
@@ -1,14 +1,12 @@
-from .. import OPENSEARCH
-from ..client import get_client
-from ..logging import logger
-from ..mapping import create_or_update_mapping
-from ..mapping import EXPANSION_FIELDS
-from ..mapping import get_field_map
-from ..mapping import iterate_schema
-from ..postprocessing import postprocess
-from ..preprocessing import preprocess
-from .section import enrichWithSection
-from .vocabularyfields import stripVocabularyTermTitles
+from .client import get_client
+from .logging import logger
+from .mapping import create_or_update_mapping
+from .mapping import EXPANSION_FIELDS
+from .mapping import get_field_map
+from .mapping import iterate_schema
+from .postprocessing import postprocess
+from .preprocessing import preprocess
+from collective.elastic.ingest import OPENSEARCH
 from pprint import pformat
 
 import time
@@ -84,13 +82,8 @@ def ingest(content, full_schema, index_name):
     then postprocess and finally index the content.
     """
 
-    logger.debug(f"Process content: {pformat(content)}")
     start = time.time()
-
-    # special preprocessing logic for section and vocabulary fields
-    # TODO: refactor as special preprocessing
-    enrichWithSection(content)
-    stripVocabularyTermTitles(content)
+    logger.debug(f"Process content: {pformat(content)}")
 
     # generic preprocessing according to the rules in preprocessings.json
     preprocess(content, full_schema)

diff --git a/src/collective/elastic/ingest/ingest/section.py b/src/collective/elastic/ingest/ingest/section.py
diff --git a/src/collective/elastic/ingest/ingest/vocabularyfields.py b/src/collective/elastic/ingest/ingest/vocabularyfields.py
diff --git a/src/collective/elastic/ingest/mapping.py b/src/collective/elastic/ingest/mapping.py
@@ -1,7 +1,7 @@
-from . import OPENSEARCH
 from .analysis import get_analysis
 from .client import get_client
 from .logging import logger
+from collective.elastic.ingest import OPENSEARCH
 from copy import deepcopy
 
 import json

diff --git a/src/collective/elastic/ingest/preprocessing.py b/src/collective/elastic/ingest/preprocessing.py
@@ -156,6 +156,51 @@ def action_empty_removal(content, full_schema, key):
 ACTION_FUNCTIONS["remove_empty"] = action_empty_removal
 
 
+def action_strip_vocabulary_term_titles(content, full_schema, key=None):
+    """Replace vocabulary term dicts in ``content`` by their bare tokens.
+
+    A value counts as a vocabulary term when it is a dict whose keys are
+    exactly ``title`` and ``token``; such a value is replaced by its token.
+    A non-empty list whose first element is such a dict is replaced by the
+    list of the tokens of all its elements. Every other value is left as is.
+
+    ``content`` is modified in place and nothing is returned.
+    ``full_schema`` is not used.
+
+    NOTE(review): ``key`` is optional and unused. It is accepted so this
+    action can be called with the same three arguments as
+    ``action_empty_removal`` - confirm against the dispatcher.
+    """
+    term_keys = {"title", "token"}
+    # Only values of existing keys are reassigned, so the dict keeps its size
+    # while it is being iterated.
+    for fieldname, field in content.items():
+        if isinstance(field, dict) and set(field) == term_keys:
+            content[fieldname] = field["token"]
+        elif (
+            isinstance(field, list)
+            and field
+            and isinstance(field[0], dict)
+            # Inspect the first element: the list itself has no ``keys()``.
+            and set(field[0]) == term_keys
+        ):
+            content[fieldname] = [el["token"] for el in field]
+
+
+ACTION_FUNCTIONS["strip_vocabulary_term_titles"] = action_strip_vocabulary_term_titles
+
+
+def action_enrich_with_section(content, fullschema, key=None):
+    """Add a ``section`` entry to ``content``, derived from its ``@id`` URL.
+
+    The base URL is built from the environment variables ``PLONE_SERVICE``
+    and ``PLONE_PATH`` joined by ``/``. It is removed from ``content["@id"]``
+    and the first path element after it becomes ``content["section"]``.
+    If no such element exists, the section is ``"__root__"``.
+
+    ``content`` is modified in place and nothing is returned.
+    ``fullschema`` is not used.
+
+    NOTE(review): ``key`` is optional and unused. It is accepted so this
+    action can be called with the same three arguments as
+    ``action_empty_removal`` - confirm against the dispatcher.
+
+    TODO: make this take the site root into account and remove this step
+    (see below).
+
+    At the moment this is not very generic: the section is guessed from the
+    path element after the root. That element can be something else, e.g. a
+    language root folder defining a subsite. The same is possible with
+    Lineage based subsites and so on.
+
+    A solution would be an expansion in c.e.plone that adds the site and the
+    section to the content. Then this step can be deprecated and later on
+    removed; a rewrite plus an additional schema would be enough.
+    """
+    base = "/".join(
+        str(os.environ.get(name)) for name in ("PLONE_SERVICE", "PLONE_PATH")
+    ).strip("/")
+    path = content["@id"].replace(base, "")
+    # "/news/item" splits into ["", "news", "item"]: index 1 is the section.
+    segments = path.split("/")
+    content["section"] = segments[1] if len(segments) > 1 else "__root__"
+
+
+ACTION_FUNCTIONS["enrich_with_section"] = action_enrich_with_section
+
+
 def preprocess(content, full_schema):
     """run full preprocessing pipeline on content and schema"""
     for ppcfg in PREPROCESSOR_CONFIGS:

diff --git a/src/collective/elastic/ingest/preprocessings.json b/src/collective/elastic/ingest/preprocessings.json
@@ -3,6 +3,10 @@
     "comment": "Remove all empty fields.",
     "action": "remove_empty"
   },
+  {
+    "comment": "Transform vocabulary values to their token-only form.",
+    "action": "strip_vocabulary_term_titles"
+  },
   {
     "comment": "ProxyIndex needs this information, essential rewrite, do not remove",
     "action": "rewrite",