From be6387b0c5ae17499c559c03afd9442ebfa583ea Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Thu, 5 Oct 2023 21:29:02 +0300 Subject: [PATCH 1/4] Simplify the related search --- api/api/controllers/search_controller.py | 16 +++++----------- api/api/views/media_views.py | 7 ++++--- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py index 921b50400ee..6135deaa480 100644 --- a/api/api/controllers/search_controller.py +++ b/api/api/controllers/search_controller.py @@ -283,7 +283,8 @@ def _exclude_filtered(s: Search): key=filter_cache_key, timeout=FILTER_CACHE_TIMEOUT, value=filtered_providers ) to_exclude = [f["provider_identifier"] for f in filtered_providers] - s = s.exclude("terms", provider=to_exclude) + if to_exclude: + s = s.exclude("terms", provider=to_exclude) return s @@ -495,7 +496,7 @@ def search( return results, page_count, result_count, search_context.asdict() -def related_media(uuid, index, filter_dead): +def related_media(uuid: str, index: str, filter_dead: bool) -> tuple[list[Hit], int]: """Given a UUID, find related search results.""" search_client = Search(index=index) @@ -508,14 +509,12 @@ def related_media(uuid, index, filter_dead): s = search_client s = s.query( MoreLikeThis( - fields=["tags.name", "title", "creator"], + fields=["tags.name", "title"], like={"_index": index, "_id": _id}, min_term_freq=1, max_query_terms=50, ) ) - # Never show mature content in recommendations. - s = s.exclude("term", mature=True) s = _exclude_filtered(s) page_size = 10 page = 1 @@ -526,12 +525,7 @@ def related_media(uuid, index, filter_dead): result_count, _ = _get_result_and_page_count(response, results, page_size, page) - if not results: - results = [] - - result_ids = [result.identifier for result in results] - search_context = SearchContext.build(result_ids, index) - return results, result_count, search_context.asdict() + return results or [], result_count def get_sources(index): diff --git a/api/api/views/media_views.py b/api/api/views/media_views.py index 3ee2414ac73..30319155fe3 100644 --- a/api/api/views/media_views.py +++ b/api/api/views/media_views.py @@ -158,9 +158,10 @@ def stats(self, *_, **__): @action(detail=True) def related(self, request, identifier=None, *_, **__): try: - results, num_results, search_context = search_controller.related_media( + index = f"{self.default_index}-filtered" + results, num_results = search_controller.related_media( uuid=identifier, - index=self.default_index, + index=index, filter_dead=True, ) self.paginator.result_count = num_results @@ -173,7 +174,7 @@ def related(self, request, identifier=None, *_, **__): except IndexError: raise APIException("Could not find items.", 404) - serializer_context = search_context | self.get_serializer_context() + serializer_context = self.get_serializer_context() serializer = self.get_serializer(results, many=True, context=serializer_context) return self.get_paginated_response(serializer.data) From fb03c68c53ebe5c87cfb4d360209525d1aa0d2a0 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Fri, 6 Oct 2023 06:45:26 +0300 Subject: [PATCH 2/4] Re-add mature filter --- api/api/controllers/search_controller.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py index 6135deaa480..c4bd81e312e 100644 --- a/api/api/controllers/search_controller.py +++ b/api/api/controllers/search_controller.py @@ -515,6 +515,9 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> tuple[list[Hit], max_query_terms=50, ) ) + # Prevent the items that users set as `mature` from showing up in + # recommendations. + s = s.exclude("term", mature=True) s = _exclude_filtered(s) page_size = 10 page = 1 From d7a13a2a05a5945d22910f31c88be816ebbefb28 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Sat, 7 Oct 2023 14:41:45 +0300 Subject: [PATCH 3/4] Use simple query for related (#3156) * Match related by title and tags * Use default index for item search * Handle ES hits without tags * Use the first 10 tags for the query --------- Co-authored-by: Dhruv Bhanushali --- api/api/controllers/search_controller.py | 57 +++++++++++++----------- api/api/views/media_views.py | 8 ++-- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py index c4bd81e312e..5c755e82f2c 100644 --- a/api/api/controllers/search_controller.py +++ b/api/api/controllers/search_controller.py @@ -11,7 +11,7 @@ from elasticsearch.exceptions import BadRequestError, NotFoundError from elasticsearch_dsl import Q, Search -from elasticsearch_dsl.query import EMPTY_QUERY, MoreLikeThis, Query +from elasticsearch_dsl.query import EMPTY_QUERY, Match, Query, SimpleQueryString, Term from elasticsearch_dsl.response import Hit, Response import api.models as models @@ -496,39 +496,42 @@ def search( return results, page_count, result_count, search_context.asdict() -def related_media(uuid: str, index: str, filter_dead: bool) -> tuple[list[Hit], int]: - """Given a UUID, find related search results.""" +def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]: + """Given a UUID, find related search results based on title and tags.""" - search_client = Search(index=index) + # Search the default index for the item itself as it might be sensitive. + item_search = Search(index=index) + item_hit = item_search.query("match", identifier=uuid).execute().hits[0] - # Convert UUID to sequential ID. - item = search_client - item = item.query("match", identifier=uuid) - _id = item.execute().hits[0].id + # Match related using title. + title = item_hit.title + title_query = SimpleQueryString(query=title, fields=["title"]) + related_query = title_query - s = search_client - s = s.query( - MoreLikeThis( - fields=["tags.name", "title"], - like={"_index": index, "_id": _id}, - min_term_freq=1, - max_query_terms=50, - ) - ) - # Prevent the items that users set as `mature` from showing up in - # recommendations. - s = s.exclude("term", mature=True) + # Match related using tags, if the item has any. + if tags := getattr(item_hit, "tags", None): + # Only use the first 10 tags + tags = ",".join([tag.name for tag in tags[:10]]) + tags_query = SimpleQueryString(fields=["tags.name"], query=tags) + related_query |= tags_query + + # Search the filtered index for related items. + s = Search(index=f"{index}-filtered") + + # Exclude the current item and mature content. + # TODO: remove `__keyword` after + # https://github.com/WordPress/openverse/pull/3143 is merged. + s = s.query(related_query & ~Match(identifier__keyword=uuid) & ~Term(mature=True)) + # Exclude the dynamically disabled sources. s = _exclude_filtered(s) - page_size = 10 - page = 1 + + page, page_size = 1, 10 start, end = _get_query_slice(s, page_size, page, filter_dead) s = s[start:end] + response = s.execute() results = _post_process_results(s, start, end, page_size, response, filter_dead) - - result_count, _ = _get_result_and_page_count(response, results, page_size, page) - - return results or [], result_count + return results or [] def get_sources(index): @@ -576,7 +579,7 @@ def _get_result_and_page_count( response_obj: Response, results: list[Hit] | None, page_size: int, page: int ) -> tuple[int, int]: """ - Adjust related page count because ES disallows deep pagination of ranked queries. + Adjust page count because ES disallows deep pagination of ranked queries. :param response_obj: The original Elasticsearch response object. :param results: The list of filtered result Hits. diff --git a/api/api/views/media_views.py b/api/api/views/media_views.py index 30319155fe3..96278b7a3b3 100644 --- a/api/api/views/media_views.py +++ b/api/api/views/media_views.py @@ -158,16 +158,16 @@ def stats(self, *_, **__): @action(detail=True) def related(self, request, identifier=None, *_, **__): try: - index = f"{self.default_index}-filtered" - results, num_results = search_controller.related_media( + results = search_controller.related_media( uuid=identifier, - index=index, + index=self.default_index, filter_dead=True, ) - self.paginator.result_count = num_results self.paginator.page_count = 1 # `page_size` refers to the maximum number of related images to return. self.paginator.page_size = 10 + # `result_count` is hard-coded and is equal to the page size. + self.paginator.result_count = 10 except ValueError as e: raise APIException(getattr(e, "message", str(e))) # If there are no hits in the search controller From 07d6b6521c2da6bb4a61493bccf1d248a63e50a3 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Mon, 9 Oct 2023 14:02:35 +0300 Subject: [PATCH 4/4] Add handling of the case when title and tags are absent --- api/api/controllers/search_controller.py | 42 +++++++++++++++++------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py index 5c755e82f2c..a9041d01d03 100644 --- a/api/api/controllers/search_controller.py +++ b/api/api/controllers/search_controller.py @@ -497,23 +497,43 @@ def search( def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]: - """Given a UUID, find related search results based on title and tags.""" + """ + Given a UUID, finds 10 related search results based on title and tags. + + Uses Match query for title or SimpleQueryString for tags. + If the item has no title and no tags, returns items by the same creator. + If the item has no title, no tags or no creator, returns empty list. + + :param uuid: The UUID of the item to find related results for. + :param index: The Elasticsearch index to search (e.g. 'image') + :param filter_dead: Whether dead links should be removed. + :return: List of related results. + """ # Search the default index for the item itself as it might be sensitive. item_search = Search(index=index) - item_hit = item_search.query("match", identifier=uuid).execute().hits[0] + # TODO: remove `__keyword` after + # https://github.com/WordPress/openverse/pull/3143 is merged. + item_hit = item_search.query(Term(identifier__keyword=uuid)).execute().hits[0] # Match related using title. title = item_hit.title - title_query = SimpleQueryString(query=title, fields=["title"]) - related_query = title_query + tags = getattr(item_hit, "tags", None) + creator = item_hit.creator + + if not title and not tags: + if not creator: + return [] + related_query = Term(creator__keyword=creator) + else: + related_query = None if not title else Match(title=title) - # Match related using tags, if the item has any. - if tags := getattr(item_hit, "tags", None): - # Only use the first 10 tags - tags = ",".join([tag.name for tag in tags[:10]]) - tags_query = SimpleQueryString(fields=["tags.name"], query=tags) - related_query |= tags_query + # Match related using tags, if the item has any. + if tags: + # Only use the first 10 tags + tags = " | ".join([tag.name for tag in tags[:10]]) + tags_query = SimpleQueryString(fields=["tags.name"], query=tags) + related_query = related_query | tags_query if related_query else tags_query # Search the filtered index for related items. s = Search(index=f"{index}-filtered") @@ -521,7 +541,7 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]: # Exclude the current item and mature content. # TODO: remove `__keyword` after # https://github.com/WordPress/openverse/pull/3143 is merged. - s = s.query(related_query & ~Match(identifier__keyword=uuid) & ~Term(mature=True)) + s = s.query(related_query & ~Term(identifier__keyword=uuid) & ~Term(mature=True)) # Exclude the dynamically disabled sources. s = _exclude_filtered(s)