WordPress · dhruvkb · Oct 19, 2023 · Oct 5, 2023 · Oct 5, 2023 · Oct 5, 2023
@@ -97,26 +97,38 @@ def get_instance_attrs(row, schema):
         # cleanup tests in CI: test/unit_tests/test_cleanup.py
         category = row[schema["category"]] if "category" in schema else None
 
+        provider = row[schema["provider"]]
+        authority_boost = Media.get_authority_boost(meta, provider)
+
+        # This matches the order of fields defined in ``es_mapping.py``.
         return {
             "_id": row[schema["id"]],
             "id": row[schema["id"]],
+            "created_on": row[schema["created_on"]],
+            "mature": Media.get_maturity(meta, row[schema["mature"]]),
+            # Keyword fields
             "identifier": row[schema["identifier"]],
+            "license": row[schema["license"]].lower(),
+            "provider": provider,
+            "source": row[schema["source"]],
+            "category": category,
+            # Text-based fields
             "title": row[schema["title"]],
-            "foreign_landing_url": row[schema["foreign_landing_url"]],
             "description": Media.parse_description(meta),
             "creator": row[schema["creator"]],
-            "creator_url": row[schema["creator_url"]],
+            # Rank feature fields
+            "standardized_popularity": popularity,
+            "authority_boost": authority_boost,
+            "max_boost": max(popularity or 1, authority_boost or 1),
+            "min_boost": min(popularity or 1, authority_boost or 1),
+            # Nested fields
+            "tags": Media.parse_detailed_tags(row[schema["tags"]]),
+            # Extra fields, not indexed
             "url": row[schema["url"]],
-            "license": row[schema["license"]].lower(),
+            "foreign_landing_url": row[schema["foreign_landing_url"]],
+            "creator_url": row[schema["creator_url"]],
             "license_version": row[schema["license_version"]],
             "license_url": Media.get_license_url(meta),
-            "provider": row[schema["provider"]],
-            "source": row[schema["source"]],
-            "category": category,
-            "created_on": row[schema["created_on"]],
-            "tags": Media.parse_detailed_tags(row[schema["tags"]]),
-            "mature": Media.get_maturity(meta, row[schema["mature"]]),
-            "standardized_popularity": popularity,
         }
 
     @staticmethod
@@ -230,28 +242,18 @@ class Index:
     @staticmethod
     def database_row_to_elasticsearch_doc(row, schema):
         extension = Image.get_extension(row[schema["url"]])
-
         height = row[schema["height"]]
         width = row[schema["width"]]
         aspect_ratio = Image.get_aspect_ratio(height, width)
         size = Image.get_size(height, width)
-
-        meta = row[schema["meta_data"]]
-        provider = row[schema["provider"]]
-        authority_boost = Image.get_authority_boost(meta, provider)
-
         attrs = Image.get_instance_attrs(row, schema)
-        attrs["category"] = attrs["category"]
-        popularity = attrs["standardized_popularity"]
 
         return Image(
-            thumbnail=row[schema["thumbnail"]],
             aspect_ratio=aspect_ratio,
             extension=extension,
             size=size,
-            authority_boost=authority_boost,
-            max_boost=max(popularity or 1, authority_boost or 1),
-            min_boost=min(popularity or 1, authority_boost or 1),
+            # Extra fields, not indexed
+            thumbnail=row[schema["thumbnail"]],
             **attrs,
         )
 
@@ -319,27 +321,18 @@ def database_row_to_elasticsearch_doc(row, schema):
         alt_files = row[schema["alt_files"]]
         filetype = row[schema["filetype"]]
         extension = Audio.get_extensions(filetype, alt_files)
-
-        meta = row[schema["meta_data"]]
-        provider = row[schema["provider"]]
-        authority_boost = Audio.get_authority_boost(meta, provider)
-
         attrs = Audio.get_instance_attrs(row, schema)
-        popularity = attrs["standardized_popularity"]
-
         length = Audio.get_length(row[schema["duration"]])
 
         return Audio(
+            length=length,
+            filetype=filetype,
+            extension=extension,
+            # Extra fields, not indexed
             bit_rate=row[schema["bit_rate"]],
             sample_rate=row[schema["sample_rate"]],
             genres=row[schema["genres"]],
             duration=row[schema["duration"]],
-            length=length,
-            filetype=filetype,
-            extension=extension,
-            authority_boost=authority_boost,
-            max_boost=max(popularity or 1, authority_boost or 1),
-            min_boost=min(popularity or 1, authority_boost or 1),
             **attrs,
         )
 

@@ -1,13 +1,22 @@
-def index_settings(table_name):
+from ingestion_server.constants.media_types import AUDIO_TYPE, IMAGE_TYPE, MediaType
+
+
+def index_settings(media_type: MediaType):
     """
     Return the Elasticsearch mapping for a given table in the database.
 
-    :param table_name: The name of the table in the upstream database.
-    :return:
+    :param media_type: The media type (image or audio) whose index is being configured.
+    :return: a dict holding the index ``settings`` and ``mappings`` for that media type
     """
+
+    number_of_shards: dict[MediaType, int] = {
+        IMAGE_TYPE: 18,
+        AUDIO_TYPE: 1,
+    }
+
     settings = {
         "index": {
-            "number_of_shards": 18,
+            "number_of_shards": number_of_shards[media_type],
             "number_of_replicas": 0,
             "refresh_interval": "-1",
         },
@@ -51,109 +60,89 @@ def index_settings(table_name):
         },
     }
     common_mappings = {
+        "dynamic": False,  # extra fields are stored in ``_source`` but not indexed
         "properties": {
             "id": {"type": "long"},
+            "created_on": {"type": "date"},
+            "mature": {"type": "boolean"},
+            # Keyword fields
             "identifier": {
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-                "type": "text",
+                # TODO: Remove subfield when API is updated
+                "fields": {"keyword": {"type": "keyword"}},
+                "type": "keyword",
             },
+            "extension": {"type": "keyword"},
+            "license": {
+                # TODO: Remove subfield when API is updated
+                "fields": {"keyword": {"type": "keyword"}},
+                "type": "keyword",
+            },
+            "provider": {"type": "keyword"},
+            "source": {
+                # TODO: Remove subfield when API is updated
+                "fields": {"keyword": {"type": "keyword"}},
+                "type": "keyword",
+            },
+            "filetype": {"type": "keyword"},
+            "category": {"type": "keyword"},
+            # Text-based fields
             "title": {
                 "type": "text",
+                "analyzer": "custom_english",
                 "similarity": "boolean",
                 "fields": {
                     "keyword": {"type": "keyword", "ignore_above": 256},
                     "raw": {"type": "text", "index": True},
                 },
-                "analyzer": "custom_english",
-            },
-            "foreign_landing_url": {
-                "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
-                "type": "text",
             },
             "description": {
-                "fields": {
-                    "keyword": {"type": "keyword", "similarity": "boolean"},
-                    "raw": {"type": "text", "index": True},
-                },
                 "type": "text",
                 "analyzer": "custom_english",
+                "similarity": "boolean",
+                "fields": {"raw": {"type": "text", "index": True}},
             },
             "creator": {
                 "type": "text",
                 "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
             },
-            "url": {
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-                "type": "text",
-            },
-            "extension": {
-                "fields": {"keyword": {"ignore_above": 8, "type": "keyword"}},
-                "type": "text",
-            },
-            "license": {
-                "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
-                "type": "text",
-            },
-            "license_version": {
-                "type": "text",
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-            },
-            "license_url": {
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-                "type": "text",
-            },
-            "provider": {
-                "type": "text",
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-            },
-            "source": {
-                "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
-                "type": "text",
+            # Rank feature fields
+            "standardized_popularity": {"type": "rank_feature"},
+            "authority_boost": {"type": "rank_feature"},
+            "authority_penalty": {
+                "type": "rank_feature",
+                "positive_score_impact": False,
             },
-            "filetype": {"type": "keyword"},
-            "created_on": {"type": "date"},
+            "max_boost": {"type": "rank_feature"},
+            "min_boost": {"type": "rank_feature"},
+            # Nested fields
             "tags": {
                 "properties": {
                     "accuracy": {"type": "float"},
+                    # Text-based fields
                     "name": {
                         "type": "text",
+                        "analyzer": "custom_english",
                         "fields": {
                             "keyword": {"type": "keyword", "ignore_above": 256},
                             "raw": {"type": "text", "index": True},
                         },
-                        "analyzer": "custom_english",
                     },
                 }
             },
-            "mature": {"type": "boolean"},
-            "standardized_popularity": {"type": "rank_feature"},
-            "authority_boost": {"type": "rank_feature"},
-            "authority_penalty": {
-                "type": "rank_feature",
-                "positive_score_impact": False,
-            },
-            "max_boost": {"type": "rank_feature"},
-            "min_boost": {"type": "rank_feature"},
-            "category": {"type": "keyword"},
-        }
+        },
     }
     media_properties = {
         "image": {
-            "aspect_ratio": {
-                "fields": {"keyword": {"type": "keyword"}},
-                "type": "text",
-            },
-            "size": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
+            # Keyword fields
+            "aspect_ratio": {"type": "keyword"},
+            "size": {"type": "keyword"},
         },
         "audio": {
-            "bit_rate": {"type": "integer"},
-            "sample_rate": {"type": "integer"},
-            "genres": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
-            "duration": {"type": "integer"},
+            # Keyword fields
             "length": {"type": "keyword"},
         },
     }
     media_mappings = common_mappings.copy()
-    media_mappings["properties"].update(media_properties[table_name])
+    media_mappings["properties"].update(media_properties[media_type])
     result = {"settings": settings.copy(), "mappings": media_mappings}
     return result