Skip to content

Commit

Permalink
Move vocabulary and section handling to preprocessing and Get OPENSEA…
Browse files Browse the repository at this point in the history
…RCH from c.e.ingest module
  • Loading branch information
jensens committed Dec 1, 2023
1 parent 8c5f916 commit da9b040
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 53 deletions.
5 changes: 4 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ Changelog

- Fix: Plone schema caching logic was broken. [jensens]
- Fix: Remove unnecessary ``batching`` on preprocessing. [jensens]
- Fix: Add missigit sng ``zope.schema._field.Set`` to examples mapping. [jensens]
- Fix: Add missing ``zope.schema._field.Set`` to examples mapping. [jensens]
- Enhancement: Move vocabulary and section handling to preprocessing.
Turn ``ingest`` package into module. [jensens]
- Fix: Get OPENSEARCH from c.e.ingest module and do not dup here. [jensens]

2.0.0rc4 (2023-11-28)
---------------------
Expand Down
4 changes: 0 additions & 4 deletions src/collective/elastic/ingest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +0,0 @@
import os


OPENSEARCH = os.environ.get("INDEX_OPENSEARCH") == "1"
2 changes: 1 addition & 1 deletion src/collective/elastic/ingest/client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from . import OPENSEARCH
from .logging import logger
from collective.elastic.ingest import OPENSEARCH

import os
import threading
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
from .. import OPENSEARCH
from ..client import get_client
from ..logging import logger
from ..mapping import create_or_update_mapping
from ..mapping import EXPANSION_FIELDS
from ..mapping import get_field_map
from ..mapping import iterate_schema
from ..postprocessing import postprocess
from ..preprocessing import preprocess
from .section import enrichWithSection
from .vocabularyfields import stripVocabularyTermTitles
from .client import get_client
from .logging import logger
from .mapping import create_or_update_mapping
from .mapping import EXPANSION_FIELDS
from .mapping import get_field_map
from .mapping import iterate_schema
from .postprocessing import postprocess
from .preprocessing import preprocess
from collective.elastic.ingest import OPENSEARCH
from pprint import pformat

import time
Expand Down Expand Up @@ -84,13 +82,8 @@ def ingest(content, full_schema, index_name):
then postprocess and finally index the content.
"""

logger.debug(f"Process content: {pformat(content)}")
start = time.time()

# special preprocessing logic for section and vocabulary fields
# TODO: refactor as special preprocessing
enrichWithSection(content)
stripVocabularyTermTitles(content)
logger.debug(f"Process content: {pformat(content)}")

# generic preprocessing according to the rules in preprocessings.json
preprocess(content, full_schema)
Expand Down
14 changes: 0 additions & 14 deletions src/collective/elastic/ingest/ingest/section.py

This file was deleted.

15 changes: 0 additions & 15 deletions src/collective/elastic/ingest/ingest/vocabularyfields.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/collective/elastic/ingest/mapping.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from . import OPENSEARCH
from .analysis import get_analysis
from .client import get_client
from .logging import logger
from collective.elastic.ingest import OPENSEARCH
from copy import deepcopy

import json
Expand Down
45 changes: 45 additions & 0 deletions src/collective/elastic/ingest/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,51 @@ def action_empty_removal(content, full_schema, key):
ACTION_FUNCTIONS["remove_empty"] = action_empty_removal


def action_strip_vocabulary_term_titles(content, full_schema):
    """Replace vocabulary term dicts in ``content`` with their bare tokens.

    A vocabulary term is a dict whose keys are exactly ``title`` and
    ``token``. A field holding a single term is replaced by that term's
    token; a field holding a non-empty list consisting only of terms is
    replaced by the list of their tokens. Every other field is left as is.

    :param content: mapping of field name to field value, modified in place.
    :param full_schema: not used by this action.
    :return: None
    """
    term_keys = {"title", "token"}

    def is_term(value):
        # A term is a dict with exactly the keys ``title`` and ``token``.
        return isinstance(value, dict) and set(value) == term_keys

    # Only values of already existing keys are rebound below, so the dict
    # never changes size while it is being iterated.
    for fieldname, field in content.items():
        if is_term(field):
            content[fieldname] = field["token"]
        elif isinstance(field, list) and field and all(map(is_term, field)):
            # The key check has to look at the list elements, not at the
            # list itself (a list has no ``keys()``). Checking every element
            # keeps mixed lists untouched instead of failing on a missing
            # ``token`` key.
            content[fieldname] = [term["token"] for term in field]


ACTION_FUNCTIONS["strip_vocabulary_term_titles"] = action_strip_vocabulary_term_titles


def action_enrich_with_section(content, fullschema):
    """Store the section of the content under ``content["section"]``.

    The section is guessed as the first path element that follows the site
    base URL, which is built from the ``PLONE_SERVICE`` and ``PLONE_PATH``
    environment variables. When there is no such element, ``"__root__"`` is
    stored instead.

    TODO: take the site root into account and drop this step.
    The guess is not very generic: the element after the root may be
    something else, e.g. a language root folder defining a subsite, or a
    Lineage based subsite. One solution would be an expansion in c.e.plone
    that adds site and section to the content. This step could then be
    deprecated and later removed; a rewrite plus an additional schema would
    be enough.
    """
    service = str(os.environ.get("PLONE_SERVICE"))
    site_path = str(os.environ.get("PLONE_PATH"))
    site_url = f"{service}/{site_path}".strip("/")

    # Whatever remains of the content URL once the site URL is removed.
    segments = content["@id"].replace(site_url, "").split("/")

    if len(segments) > 1:
        content["section"] = segments[1]
    else:
        content["section"] = "__root__"


ACTION_FUNCTIONS["enrich_with_section"] = action_enrich_with_section


def preprocess(content, full_schema):
"""run full preprocessing pipeline on content and schema"""
for ppcfg in PREPROCESSOR_CONFIGS:
Expand Down
4 changes: 4 additions & 0 deletions src/collective/elastic/ingest/preprocessings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
"comment": "Remove all empty fields.",
"action": "remove_empty"
},
{
"comment": "Transform vocabularies to it token only form.",
"action": "strip_vocabulary_term_titles"
},
{
"comment": "ProxyIndex needs this information, essential rewrite, do not remove",
"action": "rewrite",
Expand Down

0 comments on commit da9b040

Please sign in to comment.