From da9b04006564415844e2bd13ec7edd57b8013fdc Mon Sep 17 00:00:00 2001
From: "Jens W. Klein" <jk@kleinundpartner.at>
Date: Fri, 1 Dec 2023 16:02:54 +0100
Subject: [PATCH] Move vocabulary and section handling to preprocessing and Get
 OPENSEARCH from c.e.ingest module

---
 CHANGES.rst                                   |  5 ++-
 src/collective/elastic/ingest/__init__.py     |  4 --
 src/collective/elastic/ingest/client.py       |  2 +-
 .../ingest/{ingest/__init__.py => ingest.py}  | 27 +++++------
 .../elastic/ingest/ingest/section.py          | 14 ------
 .../elastic/ingest/ingest/vocabularyfields.py | 15 -------
 src/collective/elastic/ingest/mapping.py      |  2 +-
 .../elastic/ingest/preprocessing.py           | 45 +++++++++++++++++++
 .../elastic/ingest/preprocessings.json        |  4 ++
 9 files changed, 65 insertions(+), 53 deletions(-)
 rename src/collective/elastic/ingest/{ingest/__init__.py => ingest.py} (85%)
 delete mode 100644 src/collective/elastic/ingest/ingest/section.py
 delete mode 100644 src/collective/elastic/ingest/ingest/vocabularyfields.py

diff --git a/CHANGES.rst b/CHANGES.rst
index 77aadef..4014e4f 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -6,7 +6,10 @@ Changelog
 
 - Fix: Plone schema caching logic was broken. [jensens]
 - Fix: Remove unnecessary ``batching`` on preprocessing. [jensens]
-- Fix: Add missigit sng ``zope.schema._field.Set`` to examples mapping. [jensens]
+- Fix: Add missing ``zope.schema._field.Set`` to examples mapping. [jensens]
+- Enhancement: Move vocabulary and section handling to preprocessing.
+  Turn ``ingest`` package into module. [jensens]
+- Fix: Get OPENSEARCH from c.e.ingest module and do not duplicate it here. [jensens]
 
 2.0.0rc4 (2023-11-28)
 ---------------------
diff --git a/src/collective/elastic/ingest/__init__.py b/src/collective/elastic/ingest/__init__.py
index cf5b831..e69de29 100644
--- a/src/collective/elastic/ingest/__init__.py
+++ b/src/collective/elastic/ingest/__init__.py
@@ -1,4 +0,0 @@
-import os
-
-
-OPENSEARCH = os.environ.get("INDEX_OPENSEARCH") == "1"
diff --git a/src/collective/elastic/ingest/client.py b/src/collective/elastic/ingest/client.py
index b0624d4..74a5915 100644
--- a/src/collective/elastic/ingest/client.py
+++ b/src/collective/elastic/ingest/client.py
@@ -1,5 +1,5 @@
-from . import OPENSEARCH
 from .logging import logger
+from collective.elastic.ingest import OPENSEARCH
 
 import os
 import threading
diff --git a/src/collective/elastic/ingest/ingest/__init__.py b/src/collective/elastic/ingest/ingest.py
similarity index 85%
rename from src/collective/elastic/ingest/ingest/__init__.py
rename to src/collective/elastic/ingest/ingest.py
index 6fff56b..bbf3c33 100644
--- a/src/collective/elastic/ingest/ingest/__init__.py
+++ b/src/collective/elastic/ingest/ingest.py
@@ -1,14 +1,12 @@
-from .. import OPENSEARCH
-from ..client import get_client
-from ..logging import logger
-from ..mapping import create_or_update_mapping
-from ..mapping import EXPANSION_FIELDS
-from ..mapping import get_field_map
-from ..mapping import iterate_schema
-from ..postprocessing import postprocess
-from ..preprocessing import preprocess
-from .section import enrichWithSection
-from .vocabularyfields import stripVocabularyTermTitles
+from .client import get_client
+from .logging import logger
+from .mapping import create_or_update_mapping
+from .mapping import EXPANSION_FIELDS
+from .mapping import get_field_map
+from .mapping import iterate_schema
+from .postprocessing import postprocess
+from .preprocessing import preprocess
+from collective.elastic.ingest import OPENSEARCH
 from pprint import pformat
 
 import time
@@ -84,13 +82,8 @@ def ingest(content, full_schema, index_name):
     then postprocess and finally index the content.
     """
 
-    logger.debug(f"Process content: {pformat(content)}")
     start = time.time()
-
-    # special preprocessing logic for section and vocabulary fields
-    # TODO: refactor as special preprocessing
-    enrichWithSection(content)
-    stripVocabularyTermTitles(content)
+    logger.debug(f"Process content: {pformat(content)}")
 
     # generic preprocessing accrording to rule in preprocessings.json
     preprocess(content, full_schema)
diff --git a/src/collective/elastic/ingest/ingest/section.py b/src/collective/elastic/ingest/ingest/section.py
deleted file mode 100644
index 3511a08..0000000
--- a/src/collective/elastic/ingest/ingest/section.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import os
-
-
-def enrichWithSection(content):
-    base = "/".join(
-        [
-            str(os.environ.get("PLONE_SERVICE")),
-            str(os.environ.get("PLONE_PATH")),
-        ]
-    ).strip("/")
-    content_url = content["@id"]
-    path = content_url.replace(base, "")
-    content["section"] = path.split("/")[1] if len(path.split("/")) > 1 else "__root__"
-    return content
diff --git a/src/collective/elastic/ingest/ingest/vocabularyfields.py b/src/collective/elastic/ingest/ingest/vocabularyfields.py
deleted file mode 100644
index 4d596bc..0000000
--- a/src/collective/elastic/ingest/ingest/vocabularyfields.py
+++ /dev/null
@@ -1,15 +0,0 @@
-def stripVocabularyTermTitles(content):
-    """If field with vocabulary: Convert field value to token or list of tokens."""
-    for fieldname in content.keys():
-        if type(content[fieldname]) is dict:
-            if sorted(list(content[fieldname].keys())) == ["title", "token"]:
-                content[fieldname] = content[fieldname]["token"]
-
-        if type(content[fieldname]) is list:
-            if (
-                len(content[fieldname]) > 0
-                and type(content[fieldname][0]) is dict
-                and sorted(list(content[fieldname][0].keys())) == ["title", "token"]
-            ):
-                content[fieldname] = [el["token"] for el in content[fieldname]]
-    return content
diff --git a/src/collective/elastic/ingest/mapping.py b/src/collective/elastic/ingest/mapping.py
index 685872a..623402d 100644
--- a/src/collective/elastic/ingest/mapping.py
+++ b/src/collective/elastic/ingest/mapping.py
@@ -1,7 +1,7 @@
-from . import OPENSEARCH
 from .analysis import get_analysis
 from .client import get_client
 from .logging import logger
+from collective.elastic.ingest import OPENSEARCH
 from copy import deepcopy
 
 import json
diff --git a/src/collective/elastic/ingest/preprocessing.py b/src/collective/elastic/ingest/preprocessing.py
index 1fa7df5..41495ec 100644
--- a/src/collective/elastic/ingest/preprocessing.py
+++ b/src/collective/elastic/ingest/preprocessing.py
@@ -156,6 +156,51 @@ def action_empty_removal(content, full_schema, key):
 ACTION_FUNCTIONS["remove_empty"] = action_empty_removal
 
 
+def action_strip_vocabulary_term_titles(content, full_schema):
+    """Replace vocabulary terms ({"title", "token"} dicts) by their token, in place.
+
+    Lists are converted when their first element is such a term dict."""
+    term_keys = {"title", "token"}
+    for fieldname, field in content.items():
+        if isinstance(field, dict) and set(field) == term_keys:
+            content[fieldname] = field["token"]
+        elif isinstance(field, list) and field and isinstance(field[0], dict):
+            # Inspect the first element: a list has no .keys() (AttributeError).
+            if set(field[0]) == term_keys:
+                content[fieldname] = [el["token"] for el in field]
+
+
+ACTION_FUNCTIONS["strip_vocabulary_term_titles"] = action_strip_vocabulary_term_titles
+
+
+def action_enrich_with_section(content, fullschema):
+    """Set ``content["section"]`` to the first path element below the Plone base URL.
+
+    The base URL is built from the ``PLONE_SERVICE`` and ``PLONE_PATH`` environment
+    variables; if no path element remains, the section is ``__root__``.
+
+    TODO: take the site root into account and remove this step.
+
+    This is a guess and not very generic: the element after the root may as well
+    be a language root folder defining a subsite, a Lineage based subsite and so
+    on. A solution would be an expansion in c.e.plone adding site and section to
+    the content. Then this step can be deprecated and later removed; a rewrite
+    plus additional schema would be enough.
+    """
+    service = str(os.environ.get("PLONE_SERVICE"))
+    plone_path = str(os.environ.get("PLONE_PATH"))
+    base = f"{service}/{plone_path}".strip("/")
+    # Cut the base URL out of the content URL; "/section/..." remains.
+    segments = content["@id"].replace(base, "").split("/")
+    if len(segments) > 1:
+        content["section"] = segments[1]
+    else:
+        content["section"] = "__root__"
+
+
+ACTION_FUNCTIONS["enrich_with_section"] = action_enrich_with_section
+
+
 def preprocess(content, full_schema):
     """run full preprocessing pipeline on content and schema"""
     for ppcfg in PREPROCESSOR_CONFIGS:
diff --git a/src/collective/elastic/ingest/preprocessings.json b/src/collective/elastic/ingest/preprocessings.json
index 2639e75..3ad7c4b 100644
--- a/src/collective/elastic/ingest/preprocessings.json
+++ b/src/collective/elastic/ingest/preprocessings.json
@@ -3,6 +3,10 @@
     "comment": "Remove all empty fields.",
     "action": "remove_empty"
   },
+  {
+    "comment": "Transform vocabularies to their token-only form.",
+    "action": "strip_vocabulary_term_titles"
+  },
   {
     "comment": "ProxyIndex needs this information, essential rewrite, do not remove",
     "action": "rewrite",