From 055ac4cf5027ac9be32114006b03086d16423e90 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Thu, 5 Oct 2023 09:53:32 +0400
Subject: [PATCH 01/15] Allow setting number of shards per media type

---
 .../ingestion_server/es_mapping.py            | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/ingestion_server/ingestion_server/es_mapping.py b/ingestion_server/ingestion_server/es_mapping.py
index b780088a8e7..ec6cfdf0881 100644
--- a/ingestion_server/ingestion_server/es_mapping.py
+++ b/ingestion_server/ingestion_server/es_mapping.py
@@ -1,13 +1,28 @@
-def index_settings(table_name):
+from ingestion_server.constants.media_types import (
+    AUDIO_TYPE,
+    IMAGE_TYPE,
+    MODEL_3D_TYPE,
+    MediaType,
+)
+
+
+def index_settings(media_type: MediaType):
     """
     Return the Elasticsearch mapping for a given table in the database.
 
-    :param table_name: The name of the table in the upstream database.
-    :return:
+    :param media_type: the media type whose index settings are being built
+    :return: a dict with the index ``settings`` and ``mappings`` for ES
     """
+
+    number_of_shards: dict[MediaType, int] = {
+        IMAGE_TYPE: 18,
+        AUDIO_TYPE: 1,
+        MODEL_3D_TYPE: 1,
+    }
+
     settings = {
         "index": {
-            "number_of_shards": 18,
+            "number_of_shards": number_of_shards[media_type],
             "number_of_replicas": 0,
             "refresh_interval": "-1",
         },
@@ -154,6 +169,6 @@ def index_settings(table_name):
         },
     }
     media_mappings = common_mappings.copy()
-    media_mappings["properties"].update(media_properties[table_name])
+    media_mappings["properties"].update(media_properties[media_type])
     result = {"settings": settings.copy(), "mappings": media_mappings}
     return result

From 81912f4fe01c627b6a90c71be10f24d310a5672b Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Thu, 5 Oct 2023 10:25:13 +0400
Subject: [PATCH 02/15] Simplify and organise index properties

---
 .../ingestion_server/es_mapping.py            | 93 +++++++------------
 1 file changed, 34 insertions(+), 59 deletions(-)

diff --git a/ingestion_server/ingestion_server/es_mapping.py b/ingestion_server/ingestion_server/es_mapping.py
index ec6cfdf0881..c0ab485785b 100644
--- a/ingestion_server/ingestion_server/es_mapping.py
+++ b/ingestion_server/ingestion_server/es_mapping.py
@@ -66,105 +66,80 @@ def index_settings(media_type: MediaType):
         },
     }
     common_mappings = {
+        "dynamic": False,  # extra fields are stored in ``_source`` but not indexed
         "properties": {
             "id": {"type": "long"},
-            "identifier": {
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-                "type": "text",
-            },
+            "created_on": {"type": "date"},
+            "mature": {"type": "boolean"},
+            # Keyword fields
+            "identifier": {"type": "keyword"},
+            "extension": {"type": "keyword"},
+            "license": {"type": "keyword"},
+            "provider": {"type": "keyword"},
+            "source": {"type": "keyword"},
+            "filetype": {"type": "keyword"},
+            "category": {"type": "keyword"},
+            # Text-based fields
             "title": {
                 "type": "text",
+                "analyzer": "custom_english",
                 "similarity": "boolean",
                 "fields": {
                     "keyword": {"type": "keyword", "ignore_above": 256},
                     "raw": {"type": "text", "index": True},
                 },
-                "analyzer": "custom_english",
-            },
-            "foreign_landing_url": {
-                "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
-                "type": "text",
             },
             "description": {
+                "type": "text",
+                "analyzer": "custom_english",
+                "similarity": "boolean",
                 "fields": {
-                    "keyword": {"type": "keyword", "similarity": "boolean"},
+                    "keyword": {"type": "keyword", "ignore_above": 256},
                     "raw": {"type": "text", "index": True},
                 },
-                "type": "text",
-                "analyzer": "custom_english",
             },
             "creator": {
                 "type": "text",
                 "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
             },
-            "url": {
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-                "type": "text",
-            },
-            "extension": {
-                "fields": {"keyword": {"ignore_above": 8, "type": "keyword"}},
-                "type": "text",
-            },
-            "license": {
-                "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
-                "type": "text",
-            },
-            "license_version": {
-                "type": "text",
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-            },
-            "license_url": {
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-                "type": "text",
-            },
-            "provider": {
-                "type": "text",
-                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
-            },
-            "source": {
-                "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
-                "type": "text",
+            # Rank feature fields
+            "standardized_popularity": {"type": "rank_feature"},
+            "authority_boost": {"type": "rank_feature"},
+            "authority_penalty": {
+                "type": "rank_feature",
+                "positive_score_impact": False,
             },
-            "filetype": {"type": "keyword"},
-            "created_on": {"type": "date"},
+            "max_boost": {"type": "rank_feature"},
+            "min_boost": {"type": "rank_feature"},
+            # Nested fields
             "tags": {
                 "properties": {
                     "accuracy": {"type": "float"},
+                    # Text-based fields
                     "name": {
                         "type": "text",
+                        "analyzer": "custom_english",
                         "fields": {
                             "keyword": {"type": "keyword", "ignore_above": 256},
                             "raw": {"type": "text", "index": True},
                         },
-                        "analyzer": "custom_english",
                     },
                 }
             },
-            "mature": {"type": "boolean"},
-            "standardized_popularity": {"type": "rank_feature"},
-            "authority_boost": {"type": "rank_feature"},
-            "authority_penalty": {
-                "type": "rank_feature",
-                "positive_score_impact": False,
-            },
-            "max_boost": {"type": "rank_feature"},
-            "min_boost": {"type": "rank_feature"},
-            "category": {"type": "keyword"},
-        }
+        },
     }
     media_properties = {
         "image": {
-            "aspect_ratio": {
-                "fields": {"keyword": {"type": "keyword"}},
-                "type": "text",
-            },
-            "size": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
+            # Keyword fields
+            "aspect_ratio": {"type": "keyword"},
+            "size": {"type": "keyword"},
         },
         "audio": {
             "bit_rate": {"type": "integer"},
             "sample_rate": {"type": "integer"},
-            "genres": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
             "duration": {"type": "integer"},
+            # Keyword fields
+            "genres": {"type": "keyword"},
             "length": {"type": "keyword"},
         },
     }

From df51d1097e5603d824c9e450f764260f3ed7a3af Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Thu, 5 Oct 2023 14:03:58 +0400
Subject: [PATCH 03/15] Remove fields not supported by search request
 serializer

---
 ingestion_server/ingestion_server/es_mapping.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/ingestion_server/ingestion_server/es_mapping.py b/ingestion_server/ingestion_server/es_mapping.py
index c0ab485785b..a2fcb08cd08 100644
--- a/ingestion_server/ingestion_server/es_mapping.py
+++ b/ingestion_server/ingestion_server/es_mapping.py
@@ -135,11 +135,7 @@ def index_settings(media_type: MediaType):
             "size": {"type": "keyword"},
         },
         "audio": {
-            "bit_rate": {"type": "integer"},
-            "sample_rate": {"type": "integer"},
-            "duration": {"type": "integer"},
             # Keyword fields
-            "genres": {"type": "keyword"},
             "length": {"type": "keyword"},
         },
     }

From 2e9da514643499ed3dc4bdec9b6487715e8a2e20 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Thu, 5 Oct 2023 14:05:54 +0400
Subject: [PATCH 04/15] Update field mapping in search controller

---
 api/api/controllers/search_controller.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py
index f0cf402e105..a624afc8299 100644
--- a/api/api/controllers/search_controller.py
+++ b/api/api/controllers/search_controller.py
@@ -349,12 +349,14 @@ def search(
         ("extension", None),
         ("category", None),
         ("categories", "category"),
+        ("source", None),
+        ("license", None),
+        ("license_type", "license"),
+        # Audio-specific filters
         ("length", None),
+        # Image-specific filters
         ("aspect_ratio", None),
         ("size", None),
-        ("source", None),
-        ("license", "license__keyword"),
-        ("license_type", "license__keyword"),
     ]
     for serializer_field, es_field in filters:
         if serializer_field in search_params.data:

From 1d276913f96db981885c845315ce5879516751bc Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Thu, 5 Oct 2023 14:12:19 +0400
Subject: [PATCH 05/15] Remove `.keyword` from source

---
 api/api/controllers/search_controller.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py
index a624afc8299..3efb8c7decf 100644
--- a/api/api/controllers/search_controller.py
+++ b/api/api/controllers/search_controller.py
@@ -561,7 +561,7 @@ def get_sources(index):
         aggs = {
             "unique_sources": {
                 "terms": {
-                    "field": "source.keyword",
+                    "field": "source",
                     "size": size,
                     "order": {"_key": "desc"},
                 }

From 560fa69855df5ec7bb87684178737185b03ab1a4 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Thu, 5 Oct 2023 14:23:22 +0400
Subject: [PATCH 06/15] Remove `.keyword` from identifier

---
 api/api/utils/search_context.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/api/api/utils/search_context.py b/api/api/utils/search_context.py
index d1505c476d8..9beccee6076 100644
--- a/api/api/utils/search_context.py
+++ b/api/api/utils/search_context.py
@@ -35,12 +35,11 @@ def build(
             # Use `identifier` rather than the document `id` due to
             # `id` instability between refreshes:
             # https://github.com/WordPress/openverse/issues/2306
-            # `identifier` is mapped as `text` which will match fuzzily.
-            # Use `identifier.keyword` to match _exactly_
+            # `identifier` is mapped as `keyword` which will match exactly.
             # cf: https://github.com/WordPress/openverse/issues/2154
             Q(
                 "terms",
-                **{"identifier.keyword": all_result_identifiers},
+                **{"identifier": all_result_identifiers},
             )
         )
 

From 4aa9fb197fcaac73870d942417091ab920c68c65 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Thu, 5 Oct 2023 15:31:35 +0400
Subject: [PATCH 07/15] Remove keyword field from description

---
 ingestion_server/ingestion_server/es_mapping.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ingestion_server/ingestion_server/es_mapping.py b/ingestion_server/ingestion_server/es_mapping.py
index a2fcb08cd08..13f25c29541 100644
--- a/ingestion_server/ingestion_server/es_mapping.py
+++ b/ingestion_server/ingestion_server/es_mapping.py
@@ -93,10 +93,7 @@ def index_settings(media_type: MediaType):
                 "type": "text",
                 "analyzer": "custom_english",
                 "similarity": "boolean",
-                "fields": {
-                    "keyword": {"type": "keyword", "ignore_above": 256},
-                    "raw": {"type": "text", "index": True},
-                },
+                "fields": {"raw": {"type": "text", "index": True}},
             },
             "creator": {
                 "type": "text",

From 997324fc78291653d8b2dff31b822c40ec924ab2 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Thu, 5 Oct 2023 15:35:12 +0400
Subject: [PATCH 08/15] Delete unwanted entry from `number_of_shards`

---
 ingestion_server/ingestion_server/es_mapping.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/ingestion_server/ingestion_server/es_mapping.py b/ingestion_server/ingestion_server/es_mapping.py
index 13f25c29541..d7ec3272f7b 100644
--- a/ingestion_server/ingestion_server/es_mapping.py
+++ b/ingestion_server/ingestion_server/es_mapping.py
@@ -1,9 +1,4 @@
-from ingestion_server.constants.media_types import (
-    AUDIO_TYPE,
-    IMAGE_TYPE,
-    MODEL_3D_TYPE,
-    MediaType,
-)
+from ingestion_server.constants.media_types import AUDIO_TYPE, IMAGE_TYPE, MediaType
 
 
 def index_settings(media_type: MediaType):
@@ -17,7 +12,6 @@ def index_settings(media_type: MediaType):
     number_of_shards: dict[MediaType, int] = {
         IMAGE_TYPE: 18,
         AUDIO_TYPE: 1,
-        MODEL_3D_TYPE: 1,
     }
 
     settings = {

From b0c0df97c5e27a6f11b4ec0164ce9e0b30fbc591 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Fri, 6 Oct 2023 12:48:47 +0400
Subject: [PATCH 09/15] Remove extraneous fields

---
 ingestion_server/ingestion_server/elasticsearch_models.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/ingestion_server/ingestion_server/elasticsearch_models.py b/ingestion_server/ingestion_server/elasticsearch_models.py
index 0397830f639..83a0513f06b 100644
--- a/ingestion_server/ingestion_server/elasticsearch_models.py
+++ b/ingestion_server/ingestion_server/elasticsearch_models.py
@@ -245,7 +245,6 @@ def database_row_to_elasticsearch_doc(row, schema):
         popularity = attrs["standardized_popularity"]
 
         return Image(
-            thumbnail=row[schema["thumbnail"]],
             aspect_ratio=aspect_ratio,
             extension=extension,
             size=size,
@@ -330,10 +329,6 @@ def database_row_to_elasticsearch_doc(row, schema):
         length = Audio.get_length(row[schema["duration"]])
 
         return Audio(
-            bit_rate=row[schema["bit_rate"]],
-            sample_rate=row[schema["sample_rate"]],
-            genres=row[schema["genres"]],
-            duration=row[schema["duration"]],
             length=length,
             filetype=filetype,
             extension=extension,

From 020350226db682114ff32ec7a6c914aebd9282a4 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Fri, 6 Oct 2023 12:54:29 +0400
Subject: [PATCH 10/15] Move boosts to parent `Media` class

---
 .../ingestion_server/elasticsearch_models.py  | 29 +++++--------------
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/ingestion_server/ingestion_server/elasticsearch_models.py b/ingestion_server/ingestion_server/elasticsearch_models.py
index 83a0513f06b..16279e5d6a8 100644
--- a/ingestion_server/ingestion_server/elasticsearch_models.py
+++ b/ingestion_server/ingestion_server/elasticsearch_models.py
@@ -97,6 +97,9 @@ def get_instance_attrs(row, schema):
         # cleanup tests in CI: test/unit_tests/test_cleanup.py
         category = row[schema["category"]] if "category" in schema else None
 
+        provider = row[schema["provider"]]
+        authority_boost = Media.get_authority_boost(meta, provider)
+
         return {
             "_id": row[schema["id"]],
             "id": row[schema["id"]],
@@ -110,13 +113,16 @@ def get_instance_attrs(row, schema):
             "license": row[schema["license"]].lower(),
             "license_version": row[schema["license_version"]],
             "license_url": Media.get_license_url(meta),
-            "provider": row[schema["provider"]],
+            "provider": provider,
             "source": row[schema["source"]],
             "category": category,
             "created_on": row[schema["created_on"]],
             "tags": Media.parse_detailed_tags(row[schema["tags"]]),
             "mature": Media.get_maturity(meta, row[schema["mature"]]),
             "standardized_popularity": popularity,
+            "authority_boost": authority_boost,
+            "max_boost": max(popularity or 1, authority_boost or 1),
+            "min_boost": min(popularity or 1, authority_boost or 1),
         }
 
     @staticmethod
@@ -230,27 +236,16 @@ class Index:
     @staticmethod
     def database_row_to_elasticsearch_doc(row, schema):
         extension = Image.get_extension(row[schema["url"]])
-
         height = row[schema["height"]]
         width = row[schema["width"]]
         aspect_ratio = Image.get_aspect_ratio(height, width)
         size = Image.get_size(height, width)
-
-        meta = row[schema["meta_data"]]
-        provider = row[schema["provider"]]
-        authority_boost = Image.get_authority_boost(meta, provider)
-
         attrs = Image.get_instance_attrs(row, schema)
-        attrs["category"] = attrs["category"]
-        popularity = attrs["standardized_popularity"]
 
         return Image(
             aspect_ratio=aspect_ratio,
             extension=extension,
             size=size,
-            authority_boost=authority_boost,
-            max_boost=max(popularity or 1, authority_boost or 1),
-            min_boost=min(popularity or 1, authority_boost or 1),
             **attrs,
         )
 
@@ -318,23 +313,13 @@ def database_row_to_elasticsearch_doc(row, schema):
         alt_files = row[schema["alt_files"]]
         filetype = row[schema["filetype"]]
         extension = Audio.get_extensions(filetype, alt_files)
-
-        meta = row[schema["meta_data"]]
-        provider = row[schema["provider"]]
-        authority_boost = Audio.get_authority_boost(meta, provider)
-
         attrs = Audio.get_instance_attrs(row, schema)
-        popularity = attrs["standardized_popularity"]
-
         length = Audio.get_length(row[schema["duration"]])
 
         return Audio(
             length=length,
             filetype=filetype,
             extension=extension,
-            authority_boost=authority_boost,
-            max_boost=max(popularity or 1, authority_boost or 1),
-            min_boost=min(popularity or 1, authority_boost or 1),
             **attrs,
         )
 

From 3b5d1d2a8202c2c4a574cb1450e3131227bdba63 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Fri, 6 Oct 2023 14:38:38 +0400
Subject: [PATCH 11/15] Remove unused fields and sort as per `es_mapping.py`

---
 .../ingestion_server/elasticsearch_models.py  | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/ingestion_server/ingestion_server/elasticsearch_models.py b/ingestion_server/ingestion_server/elasticsearch_models.py
index 16279e5d6a8..4e3b33e24bc 100644
--- a/ingestion_server/ingestion_server/elasticsearch_models.py
+++ b/ingestion_server/ingestion_server/elasticsearch_models.py
@@ -100,29 +100,31 @@ def get_instance_attrs(row, schema):
         provider = row[schema["provider"]]
         authority_boost = Media.get_authority_boost(meta, provider)
 
+        # This matches the order of fields defined in ``es_mapping.py``.
         return {
             "_id": row[schema["id"]],
             "id": row[schema["id"]],
+            "created_on": row[schema["created_on"]],
+            "mature": Media.get_maturity(meta, row[schema["mature"]]),
+            # Keyword fields
             "identifier": row[schema["identifier"]],
-            "title": row[schema["title"]],
-            "foreign_landing_url": row[schema["foreign_landing_url"]],
-            "description": Media.parse_description(meta),
-            "creator": row[schema["creator"]],
-            "creator_url": row[schema["creator_url"]],
-            "url": row[schema["url"]],
             "license": row[schema["license"]].lower(),
-            "license_version": row[schema["license_version"]],
-            "license_url": Media.get_license_url(meta),
             "provider": provider,
             "source": row[schema["source"]],
             "category": category,
-            "created_on": row[schema["created_on"]],
-            "tags": Media.parse_detailed_tags(row[schema["tags"]]),
-            "mature": Media.get_maturity(meta, row[schema["mature"]]),
+            # Text-based fields
+            "title": row[schema["title"]],
+            "description": Media.parse_description(meta),
+            "creator": row[schema["creator"]],
+            # Rank feature fields
             "standardized_popularity": popularity,
             "authority_boost": authority_boost,
             "max_boost": max(popularity or 1, authority_boost or 1),
             "min_boost": min(popularity or 1, authority_boost or 1),
+            # Nested fields
+            "tags": Media.parse_detailed_tags(row[schema["tags"]]),
+            # Extra fields, not indexed
+            "url": row[schema["url"]],
         }
 
     @staticmethod

From 618604d93873eb712d7fa0b359d5b2cb23696385 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Sat, 7 Oct 2023 09:52:37 +0400
Subject: [PATCH 12/15] Return subfields to avoid API changes

---
 api/api/controllers/search_controller.py       | 10 ++++------
 api/api/utils/search_context.py                |  5 +++--
 .../ingestion_server/es_mapping.py             | 18 +++++++++++++++---
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py
index 3efb8c7decf..f0cf402e105 100644
--- a/api/api/controllers/search_controller.py
+++ b/api/api/controllers/search_controller.py
@@ -349,14 +349,12 @@ def search(
         ("extension", None),
         ("category", None),
         ("categories", "category"),
-        ("source", None),
-        ("license", None),
-        ("license_type", "license"),
-        # Audio-specific filters
         ("length", None),
-        # Image-specific filters
         ("aspect_ratio", None),
         ("size", None),
+        ("source", None),
+        ("license", "license__keyword"),
+        ("license_type", "license__keyword"),
     ]
     for serializer_field, es_field in filters:
         if serializer_field in search_params.data:
@@ -561,7 +559,7 @@ def get_sources(index):
         aggs = {
             "unique_sources": {
                 "terms": {
-                    "field": "source",
+                    "field": "source.keyword",
                     "size": size,
                     "order": {"_key": "desc"},
                 }
diff --git a/api/api/utils/search_context.py b/api/api/utils/search_context.py
index 9beccee6076..d1505c476d8 100644
--- a/api/api/utils/search_context.py
+++ b/api/api/utils/search_context.py
@@ -35,11 +35,12 @@ def build(
             # Use `identifier` rather than the document `id` due to
             # `id` instability between refreshes:
             # https://github.com/WordPress/openverse/issues/2306
-            # `identifier` is mapped as `keyword` which will match exactly.
+            # `identifier` is mapped as `text` which will match fuzzily.
+            # Use `identifier.keyword` to match _exactly_
             # cf: https://github.com/WordPress/openverse/issues/2154
             Q(
                 "terms",
-                **{"identifier": all_result_identifiers},
+                **{"identifier.keyword": all_result_identifiers},
             )
         )
 
diff --git a/ingestion_server/ingestion_server/es_mapping.py b/ingestion_server/ingestion_server/es_mapping.py
index d7ec3272f7b..c70e4c1a4fd 100644
--- a/ingestion_server/ingestion_server/es_mapping.py
+++ b/ingestion_server/ingestion_server/es_mapping.py
@@ -66,11 +66,23 @@ def index_settings(media_type: MediaType):
             "created_on": {"type": "date"},
             "mature": {"type": "boolean"},
             # Keyword fields
-            "identifier": {"type": "keyword"},
+            "identifier": {
+                # TODO: Remove subfield when API is updated
+                "fields": {"keyword": {"type": "keyword"}},
+                "type": "keyword",
+            },
             "extension": {"type": "keyword"},
-            "license": {"type": "keyword"},
+            "license": {
+                # TODO: Remove subfield when API is updated
+                "fields": {"keyword": {"type": "keyword"}},
+                "type": "keyword",
+            },
             "provider": {"type": "keyword"},
-            "source": {"type": "keyword"},
+            "source": {
+                # TODO: Remove subfield when API is updated
+                "fields": {"keyword": {"type": "keyword"}},
+                "type": "keyword",
+            },
             "filetype": {"type": "keyword"},
             "category": {"type": "keyword"},
             # Text-based fields

From c67f3ee749cbf9b7c3b28bca943626585960e77f Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <dhruv_b@live.com>
Date: Mon, 9 Oct 2023 09:33:48 +0400
Subject: [PATCH 13/15] Re-add extra fields that are not indexed

---
 .../ingestion_server/elasticsearch_models.py          | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/ingestion_server/ingestion_server/elasticsearch_models.py b/ingestion_server/ingestion_server/elasticsearch_models.py
index 4e3b33e24bc..32fc8491cd6 100644
--- a/ingestion_server/ingestion_server/elasticsearch_models.py
+++ b/ingestion_server/ingestion_server/elasticsearch_models.py
@@ -125,6 +125,10 @@ def get_instance_attrs(row, schema):
             "tags": Media.parse_detailed_tags(row[schema["tags"]]),
             # Extra fields, not indexed
             "url": row[schema["url"]],
+            "foreign_landing_url": row[schema["foreign_landing_url"]],
+            "creator_url": row[schema["creator_url"]],
+            "license_version": row[schema["license_version"]],
+            "license_url": Media.get_license_url(meta),
         }
 
     @staticmethod
@@ -248,6 +252,8 @@ def database_row_to_elasticsearch_doc(row, schema):
             aspect_ratio=aspect_ratio,
             extension=extension,
             size=size,
+            # Extra fields, not indexed
+            thumbnail=row[schema["thumbnail"]],
             **attrs,
         )
 
@@ -322,6 +328,11 @@ def database_row_to_elasticsearch_doc(row, schema):
             length=length,
             filetype=filetype,
             extension=extension,
+            # Extra fields, not indexed
+            bit_rate=row[schema["bit_rate"]],
+            sample_rate=row[schema["sample_rate"]],
+            genres=row[schema["genres"]],
+            duration=row[schema["duration"]],
             **attrs,
         )
 

From ba3efc359246d3527c0ddae9c1a28c875b059443 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <hi@dhruvkb.dev>
Date: Mon, 9 Oct 2023 18:09:33 +0000
Subject: [PATCH 14/15] Use blank array instead of `None` for tags

---
 ingestion_server/ingestion_server/elasticsearch_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ingestion_server/ingestion_server/elasticsearch_models.py b/ingestion_server/ingestion_server/elasticsearch_models.py
index 32fc8491cd6..c758d847ee6 100644
--- a/ingestion_server/ingestion_server/elasticsearch_models.py
+++ b/ingestion_server/ingestion_server/elasticsearch_models.py
@@ -199,7 +199,7 @@ def get_popularity(raw):
     @staticmethod
     def parse_detailed_tags(json_tags):
         if not json_tags:
-            return None
+            return []
         parsed_tags = []
         for tag in json_tags:
             if "name" in tag:

From d52e73ffa347ed194f93cab0f0f881dac75daf91 Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <hi@dhruvkb.dev>
Date: Fri, 13 Oct 2023 12:56:49 +0000
Subject: [PATCH 15/15] Revert to `None` when no tags

---
 ingestion_server/ingestion_server/elasticsearch_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ingestion_server/ingestion_server/elasticsearch_models.py b/ingestion_server/ingestion_server/elasticsearch_models.py
index c758d847ee6..32fc8491cd6 100644
--- a/ingestion_server/ingestion_server/elasticsearch_models.py
+++ b/ingestion_server/ingestion_server/elasticsearch_models.py
@@ -199,7 +199,7 @@ def get_popularity(raw):
     @staticmethod
     def parse_detailed_tags(json_tags):
         if not json_tags:
-            return []
+            return None
         parsed_tags = []
         for tag in json_tags:
             if "name" in tag: