Merge branch 'main' into 4455-redis-memory-spike-from-task-throttling-and-queue-buildup
freelawproject · Sep 20, 2024 · 6d6df0f · 6d6df0f
2 parents b196519 + e3a22de
commit 6d6df0f
Show file tree

Hide file tree

Showing 15 changed files with 1,774 additions and 1,302 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -22,7 +22,7 @@ jobs:
       - name: Install Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.7.1
+          version: 1.8.3
           virtualenvs-create: true
           virtualenvs-in-project: true
 

diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
@@ -9,16 +9,14 @@
 from functools import reduce, wraps
 from typing import Any, Callable, Dict, List, Literal
 
-from asgiref.sync import sync_to_async
+from asgiref.sync import async_to_sync
 from django.conf import settings
-from django.core.cache import caches
-from django.core.paginator import EmptyPage, Page
-from django.db.models import Case, CharField
+from django.core.paginator import Page
+from django.db.models import Case
 from django.db.models import Q as QObject
-from django.db.models import QuerySet, TextField, Value, When
+from django.db.models import QuerySet, TextField, When
 from django.db.models.functions import Substr
 from django.forms.boundfield import BoundField
-from django.http import HttpRequest
 from django.http.request import QueryDict
 from django.utils.html import strip_tags
 from django_elasticsearch_dsl.search import Search
@@ -31,9 +29,7 @@
 
 from cl.audio.models import Audio
 from cl.custom_filters.templatetags.text_filters import html_decode
-from cl.lib.bot_detector import is_bot
 from cl.lib.date_time import midnight_pt
-from cl.lib.paginators import ESPaginator
 from cl.lib.string_utils import trunc
 from cl.lib.types import (
     ApiPositionMapping,
@@ -59,6 +55,7 @@
     RELATED_PATTERN,
     SEARCH_ALERTS_ORAL_ARGUMENT_ES_HL_FIELDS,
     SEARCH_HL_TAG,
+    SEARCH_MLT_OPINION_QUERY_FIELDS,
     SEARCH_OPINION_HL_FIELDS,
     SEARCH_OPINION_QUERY_FIELDS,
     SEARCH_ORAL_ARGUMENT_ES_HL_FIELDS,
@@ -81,7 +78,6 @@
 )
 from cl.search.forms import SearchForm
 from cl.search.models import (
-    PRECEDENTIAL_STATUS,
     SEARCH_TYPES,
     Court,
     Opinion,
@@ -169,24 +165,33 @@ def build_daterange_query(
     return []
 
 
-def build_more_like_this_query(related_id: list[str]):
-    document_list = [{"_id": f"o_{id}"} for id in related_id]
-    more_like_this_fields = SEARCH_OPINION_QUERY_FIELDS.copy()
-    more_like_this_fields.extend(
-        [
-            "type",
-            "text",
-            "caseName",
-            "docketNumber",
-        ]
-    )
-    return Q(
+async def build_more_like_this_query(related_ids: list[str]) -> Query:
+    """Build an ES "more like this" query based on related Opinion IDs.
+
+    :param related_ids: A list of related Opinion IDs to build the query on.
+    :return: An ES query object with "more like this" query and
+    exclusions for specific opinion clusters.
+    """
+
+    document_list = [{"_id": f"o_{id}"} for id in related_ids]
+    more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy()
+    mlt_query = Q(
         "more_like_this",
         fields=more_like_this_fields,
         like=document_list,
         min_term_freq=1,
         max_query_terms=12,
     )
+    # Exclude opinion clusters to which the related IDs to query belong.
+    cluster_ids_to_exclude = (
+        OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids)
+        .distinct("pk")
+        .values_list("pk", flat=True)
+    )
+    cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()]
+    exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)]
+    bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids)
+    return bool_query
 
 
 def make_es_boost_list(fields: Dict[str, float]) -> list[str]:
@@ -1177,7 +1182,19 @@ def build_es_base_query(
             mlt_query = None
             if related_match:
                 cluster_pks = related_match.group("pks").split(",")
-                mlt_query = build_more_like_this_query(cluster_pks)
+                mlt_query = async_to_sync(build_more_like_this_query)(
+                    cluster_pks
+                )
+                main_query, join_query = build_full_join_es_queries(
+                    cd,
+                    {"opinion": []},
+                    [],
+                    mlt_query,
+                    child_highlighting=False,
+                    api_version=api_version,
+                )
+                return search_query.query(main_query), join_query
+
             opinion_search_fields = SEARCH_OPINION_QUERY_FIELDS
             child_fields = opinion_search_fields.copy()
             child_fields.extend(
@@ -1441,6 +1458,12 @@ def add_es_highlighting(
     :param highlighting: Whether highlighting should be enabled in docs.
     :return: The modified Elasticsearch search query object with highlights set
     """
+
+    # Avoid highlighting for the related cluster query.
+    related_match = RELATED_PATTERN.search(cd.get("q", ""))
+    if related_match:
+        return search_query
+
     highlighting_fields = {}
     highlighting_keyword_fields = []
     hl_tag = ALERTS_HL_TAG if alerts else SEARCH_HL_TAG
@@ -2035,7 +2058,6 @@ def fetch_es_results(
         main_doc_count_query = build_cardinality_count(
             main_doc_count_query, parent_unique_field
         )
-
         if child_docs_count_query:
             child_unique_field = cardinality_query_unique_ids[
                 SEARCH_TYPES.RECAP_DOCUMENT
@@ -2461,12 +2483,13 @@ def build_full_join_es_queries(
         child_filters_original = deepcopy(child_filters)
         # Build child text query.
         child_fields = child_query_fields[child_type]
-        child_text_query = build_fulltext_query(
-            child_fields, cd.get("q", ""), only_queries=True
-        )
 
         if mlt_query:
-            child_text_query.append(mlt_query)
+            child_text_query = [mlt_query]
+        else:
+            child_text_query = build_fulltext_query(
+                child_fields, cd.get("q", ""), only_queries=True
+            )
 
         # Build parent filters.
         parent_filters = build_join_es_filters(cd)
@@ -2602,7 +2625,7 @@ def build_full_join_es_queries(
                     should=string_query,
                     minimum_should_match=1,
                 )
-        if parent_query:
+        if parent_query and not mlt_query:
             q_should.append(parent_query)
 
     if not q_should:
@@ -2758,91 +2781,6 @@ def merge_opinion_and_cluster(results: Page | dict) -> None:
         result["status_exact"] = result["status"]
 
 
-async def get_related_clusters_with_cache_and_es(
-    search: Search,
-    cluster: OpinionCluster,
-    request: HttpRequest,
-) -> tuple[Page | list, list[int], dict[str, str]]:
-    """Retrieve related opinion clusters from ES or cache.
-
-    :param search: The ES Search object.
-    :param cluster: The current OpinionCluster.
-    :param request: The HttpRequest object.
-    :return: A three tuple containing a Page containing opinion clusters or an
-    empty list. A list containing the cluster sub opinions ids. A dic containing
-    the url_search_params.
-    """
-
-    # By default, all statuses are included. Retrieve the PRECEDENTIAL_STATUS
-    # attributes (since they're indexed in ES) instead of the NAMES values.
-    available_statuses = [status[0] for status in PRECEDENTIAL_STATUS.NAMES]
-    url_search_params = {f"stat_{v}": "on" for v in available_statuses}
-    search_params: CleanData = {}
-    # Opinions that belong to the targeted cluster
-    sub_opinion_ids = cluster.sub_opinions.values_list("pk", flat=True)
-    sub_opinion_pks = [pk async for pk in sub_opinion_ids]
-    if is_bot(request) or not sub_opinion_pks:
-        # If it is a bot or lacks sub-opinion IDs, return empty results
-        return [], [], url_search_params
-
-    # Use cache if enabled
-    cache = caches["db_cache"]
-    mlt_cache_key = f"mlt-cluster-es:{cluster.pk}"
-    related_clusters = (
-        await cache.aget(mlt_cache_key) if settings.RELATED_USE_CACHE else None
-    )
-
-    if settings.RELATED_FILTER_BY_STATUS:
-        # Filter results by status (e.g., Precedential)
-        # Update URL parameters accordingly
-        search_params[
-            f"stat_{PRECEDENTIAL_STATUS.get_status_value(settings.RELATED_FILTER_BY_STATUS)}"
-        ] = True
-        url_search_params = {
-            f"stat_{PRECEDENTIAL_STATUS.get_status_value(settings.RELATED_FILTER_BY_STATUS)}": "on"
-        }
-
-    if related_clusters is None:
-        sub_opinion_queries = ",".join(str(pk) for pk in sub_opinion_pks)
-        search_params["q"] = f"related:{sub_opinion_queries}"
-        search_params["type"] = SEARCH_TYPES.OPINION
-        query_dict = QueryDict("", mutable=True)
-        query_dict.update(search_params)
-        search_query, child_docs_count_query, _ = await sync_to_async(
-            build_es_main_query
-        )(search, search_params)
-        hits, _, error, total_query_results, _ = await sync_to_async(
-            fetch_es_results
-        )(
-            query_dict,
-            search_query,
-            child_docs_count_query,
-            1,
-            settings.RELATED_COUNT,
-        )
-        if error:
-            return [], [], url_search_params
-
-        @sync_to_async
-        def paginate_related_clusters(total_results: int, results: Response):
-            paginator = ESPaginator(
-                total_results, results, settings.RELATED_COUNT
-            )
-            try:
-                return paginator.page(1)
-            except EmptyPage:
-                return paginator.page(paginator.num_pages)
-
-        related_clusters = await paginate_related_clusters(
-            total_query_results, hits
-        )
-
-        await cache.aset(
-            mlt_cache_key, related_clusters, settings.RELATED_CACHE_TIMEOUT
-        )
-    return related_clusters, sub_opinion_pks, url_search_params
-
-
 def make_es_stats_variable(
     search_form: SearchForm,
     results: Page | Response,

diff --git a/cl/opinion_page/templates/includes/opinions_sidebar.html b/cl/opinion_page/templates/includes/opinions_sidebar.html
@@ -1,27 +1,15 @@
 {% load text_filters %}
 {% load waffle_tags %}
 <ul>
-  {% flag "o-es-active" %}
-    {% for opinion in opinions.object_list %}
-        <li>
-            <a href="{{ opinion.absolute_url }}{% querystring %}">
-                {% with opinion.title as title  %}
-                  {{ opinion.caseName|default:title|default_if_none:"N/A"|safe|truncatewords:10|v_wrapper }}
-                {% endwith %}
-            </a>
-        </li>
-    {% endfor %}
-  {% else %}
-    {% for opinion in opinions %}
-        <li>
-            <a href="{{ opinion.absolute_url }}{% querystring %}">
-                {% with opinion.title as title  %}
-                  {{ opinion.caseName|default:title|default_if_none:"N/A"|safe|truncatewords:10|v_wrapper }}
-                {% endwith %}
-            </a>
-        </li>
-    {% endfor %}
-  {% endflag %}
+  {% for opinion in opinions %}
+      <li>
+        <a href="{{ opinion.absolute_url }}{% querystring %}">
+              {% with opinion.title as title  %}
+                {{ opinion.caseName|default:title|default_if_none:"N/A"|safe|truncatewords:10|v_wrapper }}
+              {% endwith %}
+          </a>
+      </li>
+  {% endfor %}
 </ul>
 <p>
     <a href="{{ full_list_url }}" class="btn btn-default">

diff --git a/cl/opinion_page/templates/opinion.html b/cl/opinion_page/templates/opinion.html
@@ -134,7 +134,17 @@ <h3>
               >View Citing Opinions</a>
             </p>
           {% else %}
-            <p>This case has not yet been cited in our system.</p>
+            {% if queries_timeout %}
+              <p>Unable to retrieve citing clusters. Please try by clicking the button below:</p>
+              <p>
+                <a href="/?q=cites%3A({{ cluster.sub_opinions.all|OR_join }})"
+                   rel="nofollow"
+                   class="btn btn-default"
+                >View Citing Opinions</a>
+              </p>
+            {% else %}
+               <p>This case has not yet been cited in our system.</p>
+            {% endif %}
           {% endif %}
           <div class="btn-group">
             <a href="/?show_alert_modal=yes&q=cites%3A({{ cluster.sub_opinions.all|OR_join }})"
@@ -159,14 +169,13 @@ <h3>
 
 
         {# Related opinions #}
-        {% if related_clusters %}
+        {% if related_clusters or queries_timeout %}
             <div id="recommendations" class="sidebar-section">
                 <h3><span>Related Case Law</span></h3>
-
-                <p class="bottom">The following case law covers similar topics:</p>
-
+                {% if not related_clusters and queries_timeout %}
+                    <p class="bottom">Unable to retrieve related clusters. Please try by clicking the button below:</p>
+                {% endif %}
                 {% url 'show_results' as show_results_url %}
-
                 {% with sub_opinion_ids_list=sub_opinion_ids|join:',' pk_str=cluster.pk|stringformat:"s" %}
                 {% with opinions=related_clusters full_list_url=show_results_url|add:"?q=related:"|add:sub_opinion_ids_list|add:related_search_params %}
 

diff --git a/cl/opinion_page/tests.py b/cl/opinion_page/tests.py
@@ -14,7 +14,7 @@
 from django.contrib.auth.models import Group, User
 from django.core.files.uploadedfile import SimpleUploadedFile
 from django.core.management import call_command
-from django.test import RequestFactory, override_settings
+from django.test import AsyncRequestFactory, RequestFactory, override_settings
 from django.test.client import AsyncClient
 from django.urls import reverse
 from django.utils.text import slugify
@@ -38,7 +38,7 @@
     TennWorkCompClUploadForm,
 )
 from cl.opinion_page.utils import (
-    es_get_citing_clusters_with_cache,
+    es_get_citing_and_related_clusters_with_cache,
     generate_docket_entries_csv_data,
     make_docket_title,
 )
@@ -191,13 +191,17 @@ async def test_simple_opinion_page(self) -> None:
         self.assertIn("33 state 1", response.content.decode())
 
     async def test_es_get_citing_clusters_with_cache(self) -> None:
-        """Does es_get_citing_clusters_with_cache return the correct clusters
-        citing and the total cites count?
+        """Does es_get_citing_and_related_clusters_with_cache return the
+        correct clusters citing and the total cites count?
         """
 
-        clusters, count = await es_get_citing_clusters_with_cache(
-            self.o_cluster_3
+        request = AsyncRequestFactory().get("/")
+        result = await es_get_citing_and_related_clusters_with_cache(
+            self.o_cluster_3, request
         )
+        clusters = result.citing_clusters
+        count = result.citing_cluster_count
+
         c_list_names = [c["caseName"] for c in clusters]
         expected_clusters = [
             self.o_cluster_1.case_name,