From be6387b0c5ae17499c559c03afd9442ebfa583ea Mon Sep 17 00:00:00 2001
From: Olga Bulat <obulat@gmail.com>
Date: Thu, 5 Oct 2023 21:29:02 +0300
Subject: [PATCH 1/4] Simplify the related search

---
 api/api/controllers/search_controller.py | 16 +++++-----------
 api/api/views/media_views.py             |  7 ++++---
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py
index 921b50400ee..6135deaa480 100644
--- a/api/api/controllers/search_controller.py
+++ b/api/api/controllers/search_controller.py
@@ -283,7 +283,8 @@ def _exclude_filtered(s: Search):
             key=filter_cache_key, timeout=FILTER_CACHE_TIMEOUT, value=filtered_providers
         )
     to_exclude = [f["provider_identifier"] for f in filtered_providers]
-    s = s.exclude("terms", provider=to_exclude)
+    if to_exclude:
+        s = s.exclude("terms", provider=to_exclude)
     return s
 
 
@@ -495,7 +496,7 @@ def search(
     return results, page_count, result_count, search_context.asdict()
 
 
-def related_media(uuid, index, filter_dead):
+def related_media(uuid: str, index: str, filter_dead: bool) -> tuple[list[Hit], int]:
     """Given a UUID, find related search results."""
 
     search_client = Search(index=index)
@@ -508,14 +509,12 @@ def related_media(uuid, index, filter_dead):
     s = search_client
     s = s.query(
         MoreLikeThis(
-            fields=["tags.name", "title", "creator"],
+            fields=["tags.name", "title"],
             like={"_index": index, "_id": _id},
             min_term_freq=1,
             max_query_terms=50,
         )
     )
-    # Never show mature content in recommendations.
-    s = s.exclude("term", mature=True)
     s = _exclude_filtered(s)
     page_size = 10
     page = 1
@@ -526,12 +525,7 @@ def related_media(uuid, index, filter_dead):
 
     result_count, _ = _get_result_and_page_count(response, results, page_size, page)
 
-    if not results:
-        results = []
-
-    result_ids = [result.identifier for result in results]
-    search_context = SearchContext.build(result_ids, index)
-    return results, result_count, search_context.asdict()
+    return results or [], result_count
 
 
 def get_sources(index):
diff --git a/api/api/views/media_views.py b/api/api/views/media_views.py
index 3ee2414ac73..30319155fe3 100644
--- a/api/api/views/media_views.py
+++ b/api/api/views/media_views.py
@@ -158,9 +158,10 @@ def stats(self, *_, **__):
     @action(detail=True)
     def related(self, request, identifier=None, *_, **__):
         try:
-            results, num_results, search_context = search_controller.related_media(
+            index = f"{self.default_index}-filtered"
+            results, num_results = search_controller.related_media(
                 uuid=identifier,
-                index=self.default_index,
+                index=index,
                 filter_dead=True,
             )
             self.paginator.result_count = num_results
@@ -173,7 +174,7 @@ def related(self, request, identifier=None, *_, **__):
         except IndexError:
             raise APIException("Could not find items.", 404)
 
-        serializer_context = search_context | self.get_serializer_context()
+        serializer_context = self.get_serializer_context()
 
         serializer = self.get_serializer(results, many=True, context=serializer_context)
         return self.get_paginated_response(serializer.data)

From fb03c68c53ebe5c87cfb4d360209525d1aa0d2a0 Mon Sep 17 00:00:00 2001
From: Olga Bulat <obulat@gmail.com>
Date: Fri, 6 Oct 2023 06:45:26 +0300
Subject: [PATCH 2/4] Re-add mature filter

---
 api/api/controllers/search_controller.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py
index 6135deaa480..c4bd81e312e 100644
--- a/api/api/controllers/search_controller.py
+++ b/api/api/controllers/search_controller.py
@@ -515,6 +515,9 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> tuple[list[Hit],
             max_query_terms=50,
         )
     )
+    # Prevent the items that users set as `mature` from showing up in
+    # recommendations.
+    s = s.exclude("term", mature=True)
     s = _exclude_filtered(s)
     page_size = 10
     page = 1

From d7a13a2a05a5945d22910f31c88be816ebbefb28 Mon Sep 17 00:00:00 2001
From: Olga Bulat <obulat@gmail.com>
Date: Sat, 7 Oct 2023 14:41:45 +0300
Subject: [PATCH 3/4] Use simple query for related (#3156)

* Match related by title and tags

* Use default index for item search

* Handle ES hits without tags

* Use the first 10 tags for the query

---------

Co-authored-by: Dhruv Bhanushali <dhruv_b@live.com>
---
 api/api/controllers/search_controller.py | 57 +++++++++++++-----------
 api/api/views/media_views.py             |  8 ++--
 2 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py
index c4bd81e312e..5c755e82f2c 100644
--- a/api/api/controllers/search_controller.py
+++ b/api/api/controllers/search_controller.py
@@ -11,7 +11,7 @@
 
 from elasticsearch.exceptions import BadRequestError, NotFoundError
 from elasticsearch_dsl import Q, Search
-from elasticsearch_dsl.query import EMPTY_QUERY, MoreLikeThis, Query
+from elasticsearch_dsl.query import EMPTY_QUERY, Match, Query, SimpleQueryString, Term
 from elasticsearch_dsl.response import Hit, Response
 
 import api.models as models
@@ -496,39 +496,42 @@ def search(
     return results, page_count, result_count, search_context.asdict()
 
 
-def related_media(uuid: str, index: str, filter_dead: bool) -> tuple[list[Hit], int]:
-    """Given a UUID, find related search results."""
+def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
+    """Given a UUID, find related search results based on title and tags."""
 
-    search_client = Search(index=index)
+    # Search the default index for the item itself as it might be sensitive.
+    item_search = Search(index=index)
+    item_hit = item_search.query("match", identifier=uuid).execute().hits[0]
 
-    # Convert UUID to sequential ID.
-    item = search_client
-    item = item.query("match", identifier=uuid)
-    _id = item.execute().hits[0].id
+    # Match related using title.
+    title = item_hit.title
+    title_query = SimpleQueryString(query=title, fields=["title"])
+    related_query = title_query
 
-    s = search_client
-    s = s.query(
-        MoreLikeThis(
-            fields=["tags.name", "title"],
-            like={"_index": index, "_id": _id},
-            min_term_freq=1,
-            max_query_terms=50,
-        )
-    )
-    # Prevent the items that users set as `mature` from showing up in
-    # recommendations.
-    s = s.exclude("term", mature=True)
+    # Match related using tags, if the item has any.
+    if tags := getattr(item_hit, "tags", None):
+        # Only use the first 10 tags
+        tags = ",".join([tag.name for tag in tags[:10]])
+        tags_query = SimpleQueryString(fields=["tags.name"], query=tags)
+        related_query |= tags_query
+
+    # Search the filtered index for related items.
+    s = Search(index=f"{index}-filtered")
+
+    # Exclude the current item and mature content.
+    # TODO: remove `__keyword` after
+    #  https://github.com/WordPress/openverse/pull/3143 is merged.
+    s = s.query(related_query & ~Match(identifier__keyword=uuid) & ~Term(mature=True))
+    # Exclude the dynamically disabled sources.
     s = _exclude_filtered(s)
-    page_size = 10
-    page = 1
+
+    page, page_size = 1, 10
     start, end = _get_query_slice(s, page_size, page, filter_dead)
     s = s[start:end]
+
     response = s.execute()
     results = _post_process_results(s, start, end, page_size, response, filter_dead)
-
-    result_count, _ = _get_result_and_page_count(response, results, page_size, page)
-
-    return results or [], result_count
+    return results or []
 
 
 def get_sources(index):
@@ -576,7 +579,7 @@ def _get_result_and_page_count(
     response_obj: Response, results: list[Hit] | None, page_size: int, page: int
 ) -> tuple[int, int]:
     """
-    Adjust related page count because ES disallows deep pagination of ranked queries.
+    Adjust page count because ES disallows deep pagination of ranked queries.
 
     :param response_obj: The original Elasticsearch response object.
     :param results: The list of filtered result Hits.
diff --git a/api/api/views/media_views.py b/api/api/views/media_views.py
index 30319155fe3..96278b7a3b3 100644
--- a/api/api/views/media_views.py
+++ b/api/api/views/media_views.py
@@ -158,16 +158,16 @@ def stats(self, *_, **__):
     @action(detail=True)
     def related(self, request, identifier=None, *_, **__):
         try:
-            index = f"{self.default_index}-filtered"
-            results, num_results = search_controller.related_media(
+            results = search_controller.related_media(
                 uuid=identifier,
-                index=index,
+                index=self.default_index,
                 filter_dead=True,
             )
-            self.paginator.result_count = num_results
             self.paginator.page_count = 1
             # `page_size` refers to the maximum number of related images to return.
             self.paginator.page_size = 10
+            # `result_count` is hard-coded and is equal to the page size.
+            self.paginator.result_count = 10
         except ValueError as e:
             raise APIException(getattr(e, "message", str(e)))
         # If there are no hits in the search controller

From 07d6b6521c2da6bb4a61493bccf1d248a63e50a3 Mon Sep 17 00:00:00 2001
From: Olga Bulat <obulat@gmail.com>
Date: Mon, 9 Oct 2023 14:02:35 +0300
Subject: [PATCH 4/4] Add handling of the case when title and tags are absent

---
 api/api/controllers/search_controller.py | 42 +++++++++++++++++-------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py
index 5c755e82f2c..a9041d01d03 100644
--- a/api/api/controllers/search_controller.py
+++ b/api/api/controllers/search_controller.py
@@ -497,23 +497,43 @@ def search(
 
 
 def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
-    """Given a UUID, find related search results based on title and tags."""
+    """
+    Given a UUID, finds 10 related search results based on title and tags.
+
+    Uses Match query for title or SimpleQueryString for tags.
+    If the item has no title and no tags, returns items by the same creator.
+    If the item has no title, no tags or no creator, returns empty list.
+
+    :param uuid: The UUID of the item to find related results for.
+    :param index: The Elasticsearch index to search (e.g. 'image')
+    :param filter_dead: Whether dead links should be removed.
+    :return: List of related results.
+    """
 
     # Search the default index for the item itself as it might be sensitive.
     item_search = Search(index=index)
-    item_hit = item_search.query("match", identifier=uuid).execute().hits[0]
+    # TODO: remove `__keyword` after
+    #  https://github.com/WordPress/openverse/pull/3143 is merged.
+    item_hit = item_search.query(Term(identifier__keyword=uuid)).execute().hits[0]
 
     # Match related using title.
     title = item_hit.title
-    title_query = SimpleQueryString(query=title, fields=["title"])
-    related_query = title_query
+    tags = getattr(item_hit, "tags", None)
+    creator = item_hit.creator
+
+    if not title and not tags:
+        if not creator:
+            return []
+        related_query = Term(creator__keyword=creator)
+    else:
+        related_query = None if not title else Match(title=title)
 
-    # Match related using tags, if the item has any.
-    if tags := getattr(item_hit, "tags", None):
-        # Only use the first 10 tags
-        tags = ",".join([tag.name for tag in tags[:10]])
-        tags_query = SimpleQueryString(fields=["tags.name"], query=tags)
-        related_query |= tags_query
+        # Match related using tags, if the item has any.
+        if tags:
+            # Only use the first 10 tags
+            tags = " | ".join([tag.name for tag in tags[:10]])
+            tags_query = SimpleQueryString(fields=["tags.name"], query=tags)
+            related_query = related_query | tags_query if related_query else tags_query
 
     # Search the filtered index for related items.
     s = Search(index=f"{index}-filtered")
@@ -521,7 +541,7 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
     # Exclude the current item and mature content.
     # TODO: remove `__keyword` after
     #  https://github.com/WordPress/openverse/pull/3143 is merged.
-    s = s.query(related_query & ~Match(identifier__keyword=uuid) & ~Term(mature=True))
+    s = s.query(related_query & ~Term(identifier__keyword=uuid) & ~Term(mature=True))
     # Exclude the dynamically disabled sources.
     s = _exclude_filtered(s)