|
11 | 11 |
|
12 | 12 | from elasticsearch.exceptions import BadRequestError, NotFoundError
|
13 | 13 | from elasticsearch_dsl import Q, Search
|
14 |
| -from elasticsearch_dsl.query import EMPTY_QUERY, MoreLikeThis, Query |
| 14 | +from elasticsearch_dsl.query import EMPTY_QUERY, Match, Query, SimpleQueryString, Term |
15 | 15 | from elasticsearch_dsl.response import Hit, Response
|
16 | 16 |
|
17 | 17 | import api.models as models
|
@@ -496,39 +496,42 @@ def search(
|
496 | 496 | return results, page_count, result_count, search_context.asdict()
|
497 | 497 |
|
498 | 498 |
|
499 |
| -def related_media(uuid: str, index: str, filter_dead: bool) -> tuple[list[Hit], int]: |
500 |
| - """Given a UUID, find related search results.""" |
| 499 | +def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]: |
| 500 | + """Given a UUID, find related search results based on title and tags.""" |
501 | 501 |
|
502 |
| - search_client = Search(index=index) |
| 502 | + # Search the default index for the item itself as it might be sensitive. |
| 503 | + item_search = Search(index=index) |
| 504 | + item_hit = item_search.query("match", identifier=uuid).execute().hits[0] |
503 | 505 |
|
504 |
| - # Convert UUID to sequential ID. |
505 |
| - item = search_client |
506 |
| - item = item.query("match", identifier=uuid) |
507 |
| - _id = item.execute().hits[0].id |
| 506 | + # Match related using title. |
| 507 | + title = item_hit.title |
| 508 | + title_query = SimpleQueryString(query=title, fields=["title"]) |
| 509 | + related_query = title_query |
508 | 510 |
|
509 |
| - s = search_client |
510 |
| - s = s.query( |
511 |
| - MoreLikeThis( |
512 |
| - fields=["tags.name", "title"], |
513 |
| - like={"_index": index, "_id": _id}, |
514 |
| - min_term_freq=1, |
515 |
| - max_query_terms=50, |
516 |
| - ) |
517 |
| - ) |
518 |
| - # Prevent the items that users set as `mature` from showing up in |
519 |
| - # recommendations. |
520 |
| - s = s.exclude("term", mature=True) |
| 511 | + # Match related using tags, if the item has any. |
| 512 | + if tags := getattr(item_hit, "tags", None): |
| 513 | + # Only use the first 10 tags |
| 514 | + tags = ",".join([tag.name for tag in tags[:10]]) |
| 515 | + tags_query = SimpleQueryString(fields=["tags.name"], query=tags) |
| 516 | + related_query |= tags_query |
| 517 | + |
| 518 | + # Search the filtered index for related items. |
| 519 | + s = Search(index=f"{index}-filtered") |
| 520 | + |
| 521 | + # Exclude the current item and mature content. |
| 522 | + # TODO: remove `__keyword` after |
| 523 | + # https://github.com/WordPress/openverse/pull/3143 is merged. |
| 524 | + s = s.query(related_query & ~Match(identifier__keyword=uuid) & ~Term(mature=True)) |
| 525 | + # Exclude the dynamically disabled sources. |
521 | 526 | s = _exclude_filtered(s)
|
522 |
| - page_size = 10 |
523 |
| - page = 1 |
| 527 | + |
| 528 | + page, page_size = 1, 10 |
524 | 529 | start, end = _get_query_slice(s, page_size, page, filter_dead)
|
525 | 530 | s = s[start:end]
|
| 531 | + |
526 | 532 | response = s.execute()
|
527 | 533 | results = _post_process_results(s, start, end, page_size, response, filter_dead)
|
528 |
| - |
529 |
| - result_count, _ = _get_result_and_page_count(response, results, page_size, page) |
530 |
| - |
531 |
| - return results or [], result_count |
| 534 | + return results or [] |
532 | 535 |
|
533 | 536 |
|
534 | 537 | def get_sources(index):
|
@@ -576,7 +579,7 @@ def _get_result_and_page_count(
|
576 | 579 | response_obj: Response, results: list[Hit] | None, page_size: int, page: int
|
577 | 580 | ) -> tuple[int, int]:
|
578 | 581 | """
|
579 |
| - Adjust related page count because ES disallows deep pagination of ranked queries. |
| 582 | + Adjust page count because ES disallows deep pagination of ranked queries. |
580 | 583 |
|
581 | 584 | :param response_obj: The original Elasticsearch response object.
|
582 | 585 | :param results: The list of filtered result Hits.
|
|
0 commit comments