diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py index a9041d01d03..f87a329280a 100644 --- a/api/api/controllers/search_controller.py +++ b/api/api/controllers/search_controller.py @@ -350,12 +350,14 @@ def search( ("extension", None), ("category", None), ("categories", "category"), + ("source", None), + ("license", None), + ("license_type", "license"), + # Audio-specific filters ("length", None), + # Image-specific filters ("aspect_ratio", None), ("size", None), - ("source", None), - ("license", "license__keyword"), - ("license_type", "license__keyword"), ] for serializer_field, es_field in filters: if serializer_field in search_params.data: @@ -512,9 +514,7 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]: # Search the default index for the item itself as it might be sensitive. item_search = Search(index=index) - # TODO: remove `__keyword` after - # https://github.com/WordPress/openverse/pull/3143 is merged. - item_hit = item_search.query(Term(identifier__keyword=uuid)).execute().hits[0] + item_hit = item_search.query(Term(identifier=uuid)).execute().hits[0] # Match related using title. title = item_hit.title @@ -539,9 +539,7 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]: s = Search(index=f"{index}-filtered") # Exclude the current item and mature content. - # TODO: remove `__keyword` after - # https://github.com/WordPress/openverse/pull/3143 is merged. - s = s.query(related_query & ~Term(identifier__keyword=uuid) & ~Term(mature=True)) + s = s.query(related_query & ~Term(identifier=uuid) & ~Term(mature=True)) # Exclude the dynamically disabled sources. s = _exclude_filtered(s) @@ -579,7 +577,7 @@ def get_sources(index): aggs = { "unique_sources": { "terms": { - "field": "source.keyword", + "field": "source", "size": size, "order": {"_key": "desc"}, } diff --git a/api/api/utils/search_context.py b/api/api/utils/search_context.py index d1505c476d8..d733df801b4 100644 --- a/api/api/utils/search_context.py +++ b/api/api/utils/search_context.py @@ -35,13 +35,7 @@ def build( # Use `identifier` rather than the document `id` due to # `id` instability between refreshes: # https://github.com/WordPress/openverse/issues/2306 - # `identifier` is mapped as `text` which will match fuzzily. - # Use `identifier.keyword` to match _exactly_ - # cf: https://github.com/WordPress/openverse/issues/2154 - Q( - "terms", - **{"identifier.keyword": all_result_identifiers}, - ) + Q("terms", identifier=all_result_identifiers) ) # The default query size is 10, so we need to slice the query diff --git a/ingestion_server/ingestion_server/elasticsearch_models.py b/ingestion_server/ingestion_server/elasticsearch_models.py index 0397830f639..c758d847ee6 100644 --- a/ingestion_server/ingestion_server/elasticsearch_models.py +++ b/ingestion_server/ingestion_server/elasticsearch_models.py @@ -97,26 +97,38 @@ def get_instance_attrs(row, schema): # cleanup tests in CI: test/unit_tests/test_cleanup.py category = row[schema["category"]] if "category" in schema else None + provider = row[schema["provider"]] + authority_boost = Media.get_authority_boost(meta, provider) + + # This matches the order of fields defined in ``es_mapping.py``. return { "_id": row[schema["id"]], "id": row[schema["id"]], + "created_on": row[schema["created_on"]], + "mature": Media.get_maturity(meta, row[schema["mature"]]), + # Keyword fields "identifier": row[schema["identifier"]], + "license": row[schema["license"]].lower(), + "provider": provider, + "source": row[schema["source"]], + "category": category, + # Text-based fields "title": row[schema["title"]], - "foreign_landing_url": row[schema["foreign_landing_url"]], "description": Media.parse_description(meta), "creator": row[schema["creator"]], - "creator_url": row[schema["creator_url"]], + # Rank feature fields + "standardized_popularity": popularity, + "authority_boost": authority_boost, + "max_boost": max(popularity or 1, authority_boost or 1), + "min_boost": min(popularity or 1, authority_boost or 1), + # Nested fields + "tags": Media.parse_detailed_tags(row[schema["tags"]]), + # Extra fields, not indexed "url": row[schema["url"]], - "license": row[schema["license"]].lower(), + "foreign_landing_url": row[schema["foreign_landing_url"]], + "creator_url": row[schema["creator_url"]], "license_version": row[schema["license_version"]], "license_url": Media.get_license_url(meta), - "provider": row[schema["provider"]], - "source": row[schema["source"]], - "category": category, - "created_on": row[schema["created_on"]], - "tags": Media.parse_detailed_tags(row[schema["tags"]]), - "mature": Media.get_maturity(meta, row[schema["mature"]]), - "standardized_popularity": popularity, } @staticmethod @@ -187,7 +199,7 @@ def get_popularity(raw): @staticmethod def parse_detailed_tags(json_tags): if not json_tags: - return None + return [] parsed_tags = [] for tag in json_tags: if "name" in tag: @@ -230,28 +242,18 @@ class Index: @staticmethod def database_row_to_elasticsearch_doc(row, schema): extension = Image.get_extension(row[schema["url"]]) - height = row[schema["height"]] width = row[schema["width"]] aspect_ratio = Image.get_aspect_ratio(height, width) size = Image.get_size(height, width) - - meta = row[schema["meta_data"]] - provider = row[schema["provider"]] - authority_boost = Image.get_authority_boost(meta, provider) - attrs = Image.get_instance_attrs(row, schema) - attrs["category"] = attrs["category"] - popularity = attrs["standardized_popularity"] return Image( - thumbnail=row[schema["thumbnail"]], aspect_ratio=aspect_ratio, extension=extension, size=size, - authority_boost=authority_boost, - max_boost=max(popularity or 1, authority_boost or 1), - min_boost=min(popularity or 1, authority_boost or 1), + # Extra fields, not indexed + thumbnail=row[schema["thumbnail"]], **attrs, ) @@ -319,27 +321,18 @@ def database_row_to_elasticsearch_doc(row, schema): alt_files = row[schema["alt_files"]] filetype = row[schema["filetype"]] extension = Audio.get_extensions(filetype, alt_files) - - meta = row[schema["meta_data"]] - provider = row[schema["provider"]] - authority_boost = Audio.get_authority_boost(meta, provider) - attrs = Audio.get_instance_attrs(row, schema) - popularity = attrs["standardized_popularity"] - length = Audio.get_length(row[schema["duration"]]) return Audio( + length=length, + filetype=filetype, + extension=extension, + # Extra fields, not indexed bit_rate=row[schema["bit_rate"]], sample_rate=row[schema["sample_rate"]], genres=row[schema["genres"]], duration=row[schema["duration"]], - length=length, - filetype=filetype, - extension=extension, - authority_boost=authority_boost, - max_boost=max(popularity or 1, authority_boost or 1), - min_boost=min(popularity or 1, authority_boost or 1), **attrs, ) diff --git a/ingestion_server/ingestion_server/es_mapping.py b/ingestion_server/ingestion_server/es_mapping.py index b780088a8e7..c70e4c1a4fd 100644 --- a/ingestion_server/ingestion_server/es_mapping.py +++ b/ingestion_server/ingestion_server/es_mapping.py @@ -1,13 +1,22 @@ -def index_settings(table_name): +from ingestion_server.constants.media_types import AUDIO_TYPE, IMAGE_TYPE, MediaType + + +def index_settings(media_type: MediaType): """ Return the Elasticsearch mapping for a given table in the database. - :param table_name: The name of the table in the upstream database. - :return: + :param media_type: The name of the table in the upstream database. + :return: the settings for the ES mapping """ + + number_of_shards: dict[MediaType, int] = { + IMAGE_TYPE: 18, + AUDIO_TYPE: 1, + } + settings = { "index": { - "number_of_shards": 18, + "number_of_shards": number_of_shards[media_type], "number_of_replicas": 0, "refresh_interval": "-1", }, @@ -51,109 +60,89 @@ def index_settings(table_name): }, } common_mappings = { + "dynamic": False, # extra fields are stored in ``_source`` but not indexed "properties": { "id": {"type": "long"}, + "created_on": {"type": "date"}, + "mature": {"type": "boolean"}, + # Keyword fields "identifier": { - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - "type": "text", + # TODO: Remove subfield when API is updated + "fields": {"keyword": {"type": "keyword"}}, + "type": "keyword", }, + "extension": {"type": "keyword"}, + "license": { + # TODO: Remove subfield when API is updated + "fields": {"keyword": {"type": "keyword"}}, + "type": "keyword", + }, + "provider": {"type": "keyword"}, + "source": { + # TODO: Remove subfield when API is updated + "fields": {"keyword": {"type": "keyword"}}, + "type": "keyword", + }, + "filetype": {"type": "keyword"}, + "category": {"type": "keyword"}, + # Text-based fields "title": { "type": "text", + "analyzer": "custom_english", "similarity": "boolean", "fields": { "keyword": {"type": "keyword", "ignore_above": 256}, "raw": {"type": "text", "index": True}, }, - "analyzer": "custom_english", - }, - "foreign_landing_url": { - "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}}, - "type": "text", }, "description": { - "fields": { - "keyword": {"type": "keyword", "similarity": "boolean"}, - "raw": {"type": "text", "index": True}, - }, "type": "text", "analyzer": "custom_english", + "similarity": "boolean", + "fields": {"raw": {"type": "text", "index": True}}, }, "creator": { "type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, }, - "url": { - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - "type": "text", - }, - "extension": { - "fields": {"keyword": {"ignore_above": 8, "type": "keyword"}}, - "type": "text", - }, - "license": { - "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}}, - "type": "text", - }, - "license_version": { - "type": "text", - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - }, - "license_url": { - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - "type": "text", - }, - "provider": { - "type": "text", - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - }, - "source": { - "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}}, - "type": "text", + # Rank feature fields + "standardized_popularity": {"type": "rank_feature"}, + "authority_boost": {"type": "rank_feature"}, + "authority_penalty": { + "type": "rank_feature", + "positive_score_impact": False, }, - "filetype": {"type": "keyword"}, - "created_on": {"type": "date"}, + "max_boost": {"type": "rank_feature"}, + "min_boost": {"type": "rank_feature"}, + # Nested fields "tags": { "properties": { "accuracy": {"type": "float"}, + # Text-based fields "name": { "type": "text", + "analyzer": "custom_english", "fields": { "keyword": {"type": "keyword", "ignore_above": 256}, "raw": {"type": "text", "index": True}, }, - "analyzer": "custom_english", }, } }, - "mature": {"type": "boolean"}, - "standardized_popularity": {"type": "rank_feature"}, - "authority_boost": {"type": "rank_feature"}, - "authority_penalty": { - "type": "rank_feature", - "positive_score_impact": False, - }, - "max_boost": {"type": "rank_feature"}, - "min_boost": {"type": "rank_feature"}, - "category": {"type": "keyword"}, - } + }, } media_properties = { "image": { - "aspect_ratio": { - "fields": {"keyword": {"type": "keyword"}}, - "type": "text", - }, - "size": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"}, + # Keyword fields + "aspect_ratio": {"type": "keyword"}, + "size": {"type": "keyword"}, }, "audio": { - "bit_rate": {"type": "integer"}, - "sample_rate": {"type": "integer"}, - "genres": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"}, - "duration": {"type": "integer"}, + # Keyword fields "length": {"type": "keyword"}, }, } media_mappings = common_mappings.copy() - media_mappings["properties"].update(media_properties[table_name]) + media_mappings["properties"].update(media_properties[media_type]) result = {"settings": settings.copy(), "mappings": media_mappings} return result