WordPress · dhruvkb · Oct 23, 2023 · Oct 5, 2023 · Oct 5, 2023 · Oct 5, 2023
@@ -350,12 +350,14 @@ def search(
         ("extension", None),
         ("category", None),
         ("categories", "category"),
+        ("source", None),
+        ("license", None),
+        ("license_type", "license"),
+        # Audio-specific filters
         ("length", None),
+        # Image-specific filters
         ("aspect_ratio", None),
         ("size", None),
-        ("source", None),
-        ("license", "license__keyword"),
-        ("license_type", "license__keyword"),
     ]
     for serializer_field, es_field in filters:
         if serializer_field in search_params.data:
@@ -512,9 +514,7 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
 
     # Search the default index for the item itself as it might be sensitive.
     item_search = Search(index=index)
-    # TODO: remove `__keyword` after
-    #  https://github.com/WordPress/openverse/pull/3143 is merged.
-    item_hit = item_search.query(Term(identifier__keyword=uuid)).execute().hits[0]
+    item_hit = item_search.query(Term(identifier=uuid)).execute().hits[0]
 
     # Match related using title.
     title = item_hit.title
@@ -539,9 +539,7 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
     s = Search(index=f"{index}-filtered")
 
     # Exclude the current item and mature content.
-    # TODO: remove `__keyword` after
-    #  https://github.com/WordPress/openverse/pull/3143 is merged.
-    s = s.query(related_query & ~Term(identifier__keyword=uuid) & ~Term(mature=True))
+    s = s.query(related_query & ~Term(identifier=uuid) & ~Term(mature=True))
     # Exclude the dynamically disabled sources.
     s = _exclude_filtered(s)
 
@@ -579,7 +577,7 @@ def get_sources(index):
         aggs = {
             "unique_sources": {
                 "terms": {
-                    "field": "source.keyword",
+                    "field": "source",
                     "size": size,
                     "order": {"_key": "desc"},
                 }

@@ -35,13 +35,7 @@ def build(
             # Use `identifier` rather than the document `id` due to
             # `id` instability between refreshes:
             # https://github.com/WordPress/openverse/issues/2306
-            # `identifier` is mapped as `text` which will match fuzzily.
-            # Use `identifier.keyword` to match _exactly_
-            # cf: https://github.com/WordPress/openverse/issues/2154
-            Q(
-                "terms",
-                **{"identifier.keyword": all_result_identifiers},
-            )
+            Q("terms", identifier=all_result_identifiers)
         )
 
         # The default query size is 10, so we need to slice the query

@@ -97,26 +97,38 @@ def get_instance_attrs(row, schema):
         # cleanup tests in CI: test/unit_tests/test_cleanup.py
         category = row[schema["category"]] if "category" in schema else None
 
+        provider = row[schema["provider"]]
+        authority_boost = Media.get_authority_boost(meta, provider)
+
+        # This matches the order of fields defined in ``es_mapping.py``.
         return {
             "_id": row[schema["id"]],
             "id": row[schema["id"]],
+            "created_on": row[schema["created_on"]],
+            "mature": Media.get_maturity(meta, row[schema["mature"]]),
+            # Keyword fields
             "identifier": row[schema["identifier"]],
+            "license": row[schema["license"]].lower(),
+            "provider": provider,
+            "source": row[schema["source"]],
+            "category": category,
+            # Text-based fields
             "title": row[schema["title"]],
-            "foreign_landing_url": row[schema["foreign_landing_url"]],
             "description": Media.parse_description(meta),
             "creator": row[schema["creator"]],
-            "creator_url": row[schema["creator_url"]],
+            # Rank feature fields
+            "standardized_popularity": popularity,
+            "authority_boost": authority_boost,
+            "max_boost": max(popularity or 1, authority_boost or 1),
+            "min_boost": min(popularity or 1, authority_boost or 1),
+            # Nested fields
+            "tags": Media.parse_detailed_tags(row[schema["tags"]]),
+            # Extra fields, not indexed
             "url": row[schema["url"]],
-            "license": row[schema["license"]].lower(),
+            "foreign_landing_url": row[schema["foreign_landing_url"]],
+            "creator_url": row[schema["creator_url"]],
             "license_version": row[schema["license_version"]],
             "license_url": Media.get_license_url(meta),
-            "provider": row[schema["provider"]],
-            "source": row[schema["source"]],
-            "category": category,
-            "created_on": row[schema["created_on"]],
-            "tags": Media.parse_detailed_tags(row[schema["tags"]]),
-            "mature": Media.get_maturity(meta, row[schema["mature"]]),
-            "standardized_popularity": popularity,
         }
 
     @staticmethod
@@ -187,7 +199,7 @@ def get_popularity(raw):
     @staticmethod
     def parse_detailed_tags(json_tags):
         if not json_tags:
-            return None
+            return []
         parsed_tags = []
         for tag in json_tags:
             if "name" in tag:
@@ -230,28 +242,18 @@ class Index:
     @staticmethod
     def database_row_to_elasticsearch_doc(row, schema):
         extension = Image.get_extension(row[schema["url"]])
-
         height = row[schema["height"]]
         width = row[schema["width"]]
         aspect_ratio = Image.get_aspect_ratio(height, width)
         size = Image.get_size(height, width)
-
-        meta = row[schema["meta_data"]]
-        provider = row[schema["provider"]]
-        authority_boost = Image.get_authority_boost(meta, provider)
-
         attrs = Image.get_instance_attrs(row, schema)
-        attrs["category"] = attrs["category"]
-        popularity = attrs["standardized_popularity"]
 
         return Image(
-            thumbnail=row[schema["thumbnail"]],
             aspect_ratio=aspect_ratio,
             extension=extension,
             size=size,
-            authority_boost=authority_boost,
-            max_boost=max(popularity or 1, authority_boost or 1),
-            min_boost=min(popularity or 1, authority_boost or 1),
+            # Extra fields, not indexed
+            thumbnail=row[schema["thumbnail"]],
             **attrs,
         )
 
@@ -319,27 +321,18 @@ def database_row_to_elasticsearch_doc(row, schema):
         alt_files = row[schema["alt_files"]]
         filetype = row[schema["filetype"]]
         extension = Audio.get_extensions(filetype, alt_files)
-
-        meta = row[schema["meta_data"]]
-        provider = row[schema["provider"]]
-        authority_boost = Audio.get_authority_boost(meta, provider)
-
         attrs = Audio.get_instance_attrs(row, schema)
-        popularity = attrs["standardized_popularity"]
-
         length = Audio.get_length(row[schema["duration"]])
 
         return Audio(
+            length=length,
+            filetype=filetype,
+            extension=extension,
+            # Extra fields, not indexed
             bit_rate=row[schema["bit_rate"]],
             sample_rate=row[schema["sample_rate"]],
             genres=row[schema["genres"]],
             duration=row[schema["duration"]],
-            length=length,
-            filetype=filetype,
-            extension=extension,
-            authority_boost=authority_boost,
-            max_boost=max(popularity or 1, authority_boost or 1),
-            min_boost=min(popularity or 1, authority_boost or 1),
             **attrs,
         )
 

@@ -1,13 +1,22 @@
-def index_settings(table_name):
+from ingestion_server.constants.media_types import AUDIO_TYPE, IMAGE_TYPE, MediaType
+
+
+def index_settings(media_type: MediaType):
     """
     Return the Elasticsearch mapping for a given table in the database.
 
-    :param table_name: The name of the table in the upstream database.
-    :return:
+    :param media_type: the media type whose shard count and mappings are looked up
+    :return: a dict with the index ``settings`` and ``mappings``
     """
+
+    number_of_shards: dict[MediaType, int] = {
+        IMAGE_TYPE: 18,
+        AUDIO_TYPE: 1,
+    }
+
     settings = {
         "index": {
-            "number_of_shards": 18,
+            "number_of_shards": number_of_shards[media_type],
             "number_of_replicas": 0,
             "refresh_interval": "-1",
         },
@@ -51,109 +60,89 @@ def index_settings(table_name):
         },
     }
     common_mappings = {
+        "dynamic": False,  # extra fields are stored in ``_source`` but not indexed
         "properties": {
             "id": {"type": "long"},
+            "created_on": {"type": "date"},
+            "mature": {"type": "boolean"},
+            # Keyword fields
             "identifier": {
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-                "type": "text",
+                # TODO: Remove subfield when API is updated
+                "fields": {"keyword": {"type": "keyword"}},
+                "type": "keyword",
             },
+            "extension": {"type": "keyword"},
+            "license": {
+                # TODO: Remove subfield when API is updated
+                "fields": {"keyword": {"type": "keyword"}},
+                "type": "keyword",
+            },
+            "provider": {"type": "keyword"},
+            "source": {
+                # TODO: Remove subfield when API is updated
+                "fields": {"keyword": {"type": "keyword"}},
+                "type": "keyword",
+            },
+            "filetype": {"type": "keyword"},
+            "category": {"type": "keyword"},
+            # Text-based fields
             "title": {
                 "type": "text",
+                "analyzer": "custom_english",
                 "similarity": "boolean",
                 "fields": {
                     "keyword": {"type": "keyword", "ignore_above": 256},
                     "raw": {"type": "text", "index": True},
                 },
-                "analyzer": "custom_english",
-            },
-            "foreign_landing_url": {
-                "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
-                "type": "text",
             },
             "description": {
-                "fields": {
-                    "keyword": {"type": "keyword", "similarity": "boolean"},
-                    "raw": {"type": "text", "index": True},
-                },
                 "type": "text",
                 "analyzer": "custom_english",
+                "similarity": "boolean",
+                "fields": {"raw": {"type": "text", "index": True}},
             },
             "creator": {
                 "type": "text",
                 "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
             },
-            "url": {
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-                "type": "text",
-            },
-            "extension": {
-                "fields": {"keyword": {"ignore_above": 8, "type": "keyword"}},
-                "type": "text",
-            },
-            "license": {
-                "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
-                "type": "text",
-            },
-            "license_version": {
-                "type": "text",
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-            },
-            "license_url": {
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-                "type": "text",
-            },
-            "provider": {
-                "type": "text",
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-            },
-            "source": {
-                "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
-                "type": "text",
+            # Rank feature fields
+            "standardized_popularity": {"type": "rank_feature"},
+            "authority_boost": {"type": "rank_feature"},
+            "authority_penalty": {
+                "type": "rank_feature",
+                "positive_score_impact": False,
             },
-            "filetype": {"type": "keyword"},
-            "created_on": {"type": "date"},
+            "max_boost": {"type": "rank_feature"},
+            "min_boost": {"type": "rank_feature"},
+            # Nested fields
             "tags": {
                 "properties": {
                     "accuracy": {"type": "float"},
+                    # Text-based fields
                     "name": {
                         "type": "text",
+                        "analyzer": "custom_english",
                         "fields": {
                             "keyword": {"type": "keyword", "ignore_above": 256},
                             "raw": {"type": "text", "index": True},
                         },
-                        "analyzer": "custom_english",
                     },
                 }
             },
-            "mature": {"type": "boolean"},
-            "standardized_popularity": {"type": "rank_feature"},
-            "authority_boost": {"type": "rank_feature"},
-            "authority_penalty": {
-                "type": "rank_feature",
-                "positive_score_impact": False,
-            },
-            "max_boost": {"type": "rank_feature"},
-            "min_boost": {"type": "rank_feature"},
-            "category": {"type": "keyword"},
-        }
+        },
     }
     media_properties = {
         "image": {
-            "aspect_ratio": {
-                "fields": {"keyword": {"type": "keyword"}},
-                "type": "text",
-            },
-            "size": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
+            # Keyword fields
+            "aspect_ratio": {"type": "keyword"},
+            "size": {"type": "keyword"},
         },
         "audio": {
-            "bit_rate": {"type": "integer"},
-            "sample_rate": {"type": "integer"},
-            "genres": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
-            "duration": {"type": "integer"},
+            # Keyword fields
             "length": {"type": "keyword"},
         },
     }
     media_mappings = common_mappings.copy()
-    media_mappings["properties"].update(media_properties[table_name])
+    media_mappings["properties"].update(media_properties[media_type])
     result = {"settings": settings.copy(), "mappings": media_mappings}
     return result