Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify the related search #3151

Merged
merged 4 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 52 additions & 32 deletions api/api/controllers/search_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from elasticsearch.exceptions import BadRequestError, NotFoundError
from elasticsearch_dsl import Q, Search
from elasticsearch_dsl.query import EMPTY_QUERY, MoreLikeThis, Query
from elasticsearch_dsl.query import EMPTY_QUERY, Match, Query, SimpleQueryString, Term
from elasticsearch_dsl.response import Hit, Response

import api.models as models
Expand Down Expand Up @@ -283,7 +283,8 @@ def _exclude_filtered(s: Search):
key=filter_cache_key, timeout=FILTER_CACHE_TIMEOUT, value=filtered_providers
)
to_exclude = [f["provider_identifier"] for f in filtered_providers]
s = s.exclude("terms", provider=to_exclude)
if to_exclude:
s = s.exclude("terms", provider=to_exclude)
return s


Expand Down Expand Up @@ -495,43 +496,62 @@ def search(
return results, page_count, result_count, search_context.asdict()


def related_media(uuid, index, filter_dead):
"""Given a UUID, find related search results."""
def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
"""
Given a UUID, finds up to 10 related search results based on title and tags.

search_client = Search(index=index)
Uses Match query for title or SimpleQueryString for tags.
If the item has no title and no tags, returns items by the same creator.
If the item has no title, no tags and no creator, returns an empty list.

# Convert UUID to sequential ID.
item = search_client
item = item.query("match", identifier=uuid)
_id = item.execute().hits[0].id
:param uuid: The UUID of the item to find related results for.
:param index: The Elasticsearch index to search (e.g. 'image')
:param filter_dead: Whether dead links should be removed.
:return: List of related results.
"""

s = search_client
s = s.query(
MoreLikeThis(
fields=["tags.name", "title", "creator"],
like={"_index": index, "_id": _id},
min_term_freq=1,
max_query_terms=50,
)
)
# Never show mature content in recommendations.
s = s.exclude("term", mature=True)
# Search the default index for the item itself as it might be sensitive.
item_search = Search(index=index)
# TODO: remove `__keyword` after
# https://github.com/WordPress/openverse/pull/3143 is merged.
item_hit = item_search.query(Term(identifier__keyword=uuid)).execute().hits[0]

# Match related using title.
title = item_hit.title
tags = getattr(item_hit, "tags", None)
creator = item_hit.creator

if not title and not tags:
if not creator:
return []
related_query = Term(creator__keyword=creator)
else:
related_query = None if not title else Match(title=title)

# Match related using tags, if the item has any.
if tags:
# Only use the first 10 tags
tags = " | ".join([tag.name for tag in tags[:10]])
tags_query = SimpleQueryString(fields=["tags.name"], query=tags)
related_query = related_query | tags_query if related_query else tags_query

# Search the filtered index for related items.
s = Search(index=f"{index}-filtered")

# Exclude the current item and mature content.
# TODO: remove `__keyword` after
# https://github.com/WordPress/openverse/pull/3143 is merged.
s = s.query(related_query & ~Term(identifier__keyword=uuid) & ~Term(mature=True))
# Exclude the dynamically disabled sources.
s = _exclude_filtered(s)
page_size = 10
page = 1

page, page_size = 1, 10
start, end = _get_query_slice(s, page_size, page, filter_dead)
s = s[start:end]

response = s.execute()
results = _post_process_results(s, start, end, page_size, response, filter_dead)

result_count, _ = _get_result_and_page_count(response, results, page_size, page)

if not results:
results = []

result_ids = [result.identifier for result in results]
search_context = SearchContext.build(result_ids, index)
return results, result_count, search_context.asdict()
return results or []


def get_sources(index):
Expand Down Expand Up @@ -579,7 +599,7 @@ def _get_result_and_page_count(
response_obj: Response, results: list[Hit] | None, page_size: int, page: int
) -> tuple[int, int]:
"""
Adjust related page count because ES disallows deep pagination of ranked queries.
Adjust page count because ES disallows deep pagination of ranked queries.

:param response_obj: The original Elasticsearch response object.
:param results: The list of filtered result Hits.
Expand Down
7 changes: 4 additions & 3 deletions api/api/views/media_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,22 +158,23 @@ def stats(self, *_, **__):
@action(detail=True)
def related(self, request, identifier=None, *_, **__):
try:
results, num_results, search_context = search_controller.related_media(
results = search_controller.related_media(
uuid=identifier,
index=self.default_index,
filter_dead=True,
)
self.paginator.result_count = num_results
self.paginator.page_count = 1
# `page_size` refers to the maximum number of related images to return.
self.paginator.page_size = 10
# `result_count` is hard-coded and is equal to the page size.
self.paginator.result_count = 10
except ValueError as e:
raise APIException(getattr(e, "message", str(e)))
# If there are no hits in the search controller
except IndexError:
raise APIException("Could not find items.", 404)

serializer_context = search_context | self.get_serializer_context()
serializer_context = self.get_serializer_context()

serializer = self.get_serializer(results, many=True, context=serializer_context)
return self.get_paginated_response(serializer.data)
Expand Down