Skip to content

Commit d7a13a2

Browse files
obulat and dhruvkb committed
Use simple query for related (#3156)
* Match related by title and tags
* Use default index for item search
* Handle ES hits without tags
* Use the first 10 tags for the query

---------

Co-authored-by: Dhruv Bhanushali <dhruv_b@live.com>
1 parent fb03c68 commit d7a13a2

File tree

2 files changed

+34
-31
lines changed

2 files changed

+34
-31
lines changed

api/api/controllers/search_controller.py

Lines changed: 30 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from elasticsearch.exceptions import BadRequestError, NotFoundError
1313
from elasticsearch_dsl import Q, Search
14-
from elasticsearch_dsl.query import EMPTY_QUERY, MoreLikeThis, Query
14+
from elasticsearch_dsl.query import EMPTY_QUERY, Match, Query, SimpleQueryString, Term
1515
from elasticsearch_dsl.response import Hit, Response
1616

1717
import api.models as models
@@ -496,39 +496,42 @@ def search(
496496
return results, page_count, result_count, search_context.asdict()
497497

498498

499-
def related_media(uuid: str, index: str, filter_dead: bool) -> tuple[list[Hit], int]:
500-
"""Given a UUID, find related search results."""
499+
def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
500+
"""Given a UUID, find related search results based on title and tags."""
501501

502-
search_client = Search(index=index)
502+
# Search the default index for the item itself as it might be sensitive.
503+
item_search = Search(index=index)
504+
item_hit = item_search.query("match", identifier=uuid).execute().hits[0]
503505

504-
# Convert UUID to sequential ID.
505-
item = search_client
506-
item = item.query("match", identifier=uuid)
507-
_id = item.execute().hits[0].id
506+
# Match related using title.
507+
title = item_hit.title
508+
title_query = SimpleQueryString(query=title, fields=["title"])
509+
related_query = title_query
508510

509-
s = search_client
510-
s = s.query(
511-
MoreLikeThis(
512-
fields=["tags.name", "title"],
513-
like={"_index": index, "_id": _id},
514-
min_term_freq=1,
515-
max_query_terms=50,
516-
)
517-
)
518-
# Prevent the items that users set as `mature` from showing up in
519-
# recommendations.
520-
s = s.exclude("term", mature=True)
511+
# Match related using tags, if the item has any.
512+
if tags := getattr(item_hit, "tags", None):
513+
# Only use the first 10 tags
514+
tags = ",".join([tag.name for tag in tags[:10]])
515+
tags_query = SimpleQueryString(fields=["tags.name"], query=tags)
516+
related_query |= tags_query
517+
518+
# Search the filtered index for related items.
519+
s = Search(index=f"{index}-filtered")
520+
521+
# Exclude the current item and mature content.
522+
# TODO: remove `__keyword` after
523+
# https://github.com/WordPress/openverse/pull/3143 is merged.
524+
s = s.query(related_query & ~Match(identifier__keyword=uuid) & ~Term(mature=True))
525+
# Exclude the dynamically disabled sources.
521526
s = _exclude_filtered(s)
522-
page_size = 10
523-
page = 1
527+
528+
page, page_size = 1, 10
524529
start, end = _get_query_slice(s, page_size, page, filter_dead)
525530
s = s[start:end]
531+
526532
response = s.execute()
527533
results = _post_process_results(s, start, end, page_size, response, filter_dead)
528-
529-
result_count, _ = _get_result_and_page_count(response, results, page_size, page)
530-
531-
return results or [], result_count
534+
return results or []
532535

533536

534537
def get_sources(index):
@@ -576,7 +579,7 @@ def _get_result_and_page_count(
576579
response_obj: Response, results: list[Hit] | None, page_size: int, page: int
577580
) -> tuple[int, int]:
578581
"""
579-
Adjust related page count because ES disallows deep pagination of ranked queries.
582+
Adjust page count because ES disallows deep pagination of ranked queries.
580583
581584
:param response_obj: The original Elasticsearch response object.
582585
:param results: The list of filtered result Hits.

api/api/views/media_views.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -158,16 +158,16 @@ def stats(self, *_, **__):
158158
@action(detail=True)
159159
def related(self, request, identifier=None, *_, **__):
160160
try:
161-
index = f"{self.default_index}-filtered"
162-
results, num_results = search_controller.related_media(
161+
results = search_controller.related_media(
163162
uuid=identifier,
164-
index=index,
163+
index=self.default_index,
165164
filter_dead=True,
166165
)
167-
self.paginator.result_count = num_results
168166
self.paginator.page_count = 1
169167
# `page_size` refers to the maximum number of related images to return.
170168
self.paginator.page_size = 10
169+
# `result_count` is hard-coded and is equal to the page size.
170+
self.paginator.result_count = 10
171171
except ValueError as e:
172172
raise APIException(getattr(e, "message", str(e)))
173173
# If there are no hits in the search controller

0 commit comments

Comments
 (0)