Optimise APIV1ChunkedPackageCache memory usage
Reduce the amount of data Django's ORM keeps in memory at the same time
by processing the PackageListings in smaller, consecutive QuerySets.
anttimaki committed Sep 3, 2024
1 parent dc985ea commit 01b309d
Showing 3 changed files with 69 additions and 25 deletions.
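
In outline, the change swaps one monolithic, fully-prefetched queryset for a two-phase read: fetch the ordered primary keys first, then hydrate and serialise the listings one bounded chunk at a time, so only one chunk's worth of model instances (and their prefetch caches) is alive at once. A rough standalone sketch of that shape; the function and names below are illustrative, not part of the repository:

```python
def process_in_chunks(queryset, handle, chunk_size=1000):
    # Phase 1: read only the primary keys; cheap to hold in memory.
    ids = list(queryset.values_list("id", flat=True))
    # Phase 2: hydrate full rows chunk by chunk; each iteration builds a
    # fresh queryset, so the previous chunk's instances can be collected.
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start : start + chunk_size]
        for obj in queryset.model.objects.filter(id__in=chunk):
            handle(obj)
```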
17 changes: 17 additions & 0 deletions django/thunderstore/repository/api/v1/tests/test_caches.py
@@ -1,6 +1,7 @@
 import gzip
 import json
 from datetime import timedelta
+from random import shuffle
 from typing import Any

 import pytest
@@ -17,6 +18,7 @@
     update_api_v1_chunked_package_caches,
 )
 from thunderstore.repository.models import APIV1ChunkedPackageCache, APIV1PackageCache
+from thunderstore.repository.models.cache import get_package_listing_chunk


 @pytest.mark.django_db
@@ -210,3 +212,18 @@ def test_api_v1_chunked_package_cache__drops_stale_caches() -> None:
     second_cache.refresh_from_db()
     assert first_cache.is_deleted
     assert not second_cache.is_deleted
+
+
+@pytest.mark.django_db
+@pytest.mark.parametrize("count", (0, 1, 2, 3, 5, 8, 13))
+def test_get_package_listing_chunk__retains_received_ordering(count: int) -> None:
+    assert not PackageListing.objects.exists()
+    for _ in range(count):
+        PackageListingFactory()
+
+    ordering = list(PackageListing.objects.all().values_list("id", flat=True))
+    shuffle(ordering)
+    listings = get_package_listing_chunk(ordering)
+
+    for i, listing in enumerate(listings):
+        assert listing.id == ordering[i]
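
The ordering assertion holds because `get_package_listing_chunk` reconstructs the received id order in SQL via `Case`/`When` (see the third file below). The same technique in isolation, as a minimal sketch for any Django model with an `id` field:

```python
from django.db import models

def filter_preserving_order(queryset, ids):
    # WHERE id IN (...) alone returns rows in database order; a CASE
    # expression mapping each id to its position in the received list
    # restores the caller's ordering.
    position = models.Case(
        *[models.When(id=pk, then=pos) for pos, pk in enumerate(ids)]
    )
    return queryset.filter(id__in=ids).order_by(position)
```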
14 changes: 11 additions & 3 deletions django/thunderstore/repository/cache.py
@@ -26,11 +26,19 @@ def order_package_listing_queryset(
     )


+def get_package_listing_base_queryset(
+    community_identifier: str,
+) -> QuerySet[PackageListing]:
+    return (
+        PackageListing.objects.active()
+        .filter_by_community_approval_rule()
+        .exclude(~Q(community__identifier=community_identifier))
+    )
+
+
 def get_package_listing_queryset(community_identifier: str) -> QuerySet[PackageListing]:
     return order_package_listing_queryset(
         prefetch_package_listing_queryset(
-            PackageListing.objects.active()
-            .filter_by_community_approval_rule()
-            .exclude(~Q(community__identifier=community_identifier)),
+            get_package_listing_base_queryset(community_identifier),
         ),
     )
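
Splitting the filters out into `get_package_listing_base_queryset` lets callers skip the prefetching when they only need ids. For instance, the cache code in the next file builds its ordered id list roughly like this (a usage sketch mirroring `get_package_listing_ids` below):

```python
from thunderstore.repository.cache import (
    get_package_listing_base_queryset,
    order_package_listing_queryset,
)

def ordered_listing_ids(community_identifier: str):
    qs = order_package_listing_queryset(
        get_package_listing_base_queryset(community_identifier)
    )
    # values_list avoids building model instances entirely.
    return qs.values_list("id", flat=True)
```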
63 changes: 41 additions & 22 deletions django/thunderstore/repository/models/cache.py
@@ -2,7 +2,7 @@
 import io
 import json
 from datetime import timedelta
-from typing import Any, List, Optional
+from typing import Any, Iterable, List, Optional

 from django.core.files.base import ContentFile
 from django.db import models
@@ -11,10 +11,11 @@
 from thunderstore.community.models import Community, PackageListing
 from thunderstore.core.mixins import S3FileMixin, SafeDeleteMixin
 from thunderstore.repository.cache import (
-    get_package_listing_queryset,
+    get_package_listing_base_queryset,
     order_package_listing_queryset,
 )
 from thunderstore.storage.models import DataBlob, DataBlobGroup
+from thunderstore.utils.batch import batch


 class APIExperimentalPackageIndexCache(S3FileMixin):
@@ -177,19 +178,22 @@ def finalize_blob() -> None:
                 content_encoding="gzip",
             )

-        for listing in get_package_listings(community):
-            listing_bytes = listing_to_json(listing)
-
-            # Always add the first listing regardless of the size limit.
-            if not chunk_content:
-                chunk_content.extend(listing_bytes)
-            # Start new blob if adding current chunk would exceed the size limit.
-            # +2 for opening and closing brackets
-            elif len(chunk_content) + len(listing_bytes) + 2 > uncompressed_blob_size:
-                finalize_blob()
-                chunk_content = bytearray(listing_bytes)
-            else:
-                chunk_content.extend(b"," + listing_bytes)
+        for listing_ids in get_package_listing_ids(community):
+            for listing in get_package_listing_chunk(listing_ids):
+                listing_bytes = listing_to_json(listing)
+
+                # Always add the first listing regardless of the size limit.
+                if not chunk_content:
+                    chunk_content.extend(listing_bytes)
+                # Start new blob if adding current chunk would exceed the size limit.
+                # +2 for opening and closing brackets
+                elif (
+                    len(chunk_content) + len(listing_bytes) + 2 > uncompressed_blob_size
+                ):
+                    finalize_blob()
+                    chunk_content = bytearray(listing_bytes)
+                else:
+                    chunk_content.extend(b"," + listing_bytes)

         if len(chunk_content) or not group.entries.exists():
             finalize_blob()
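
The bytearray bookkeeping above is easier to follow outside the ORM. A standalone sketch of the same accumulate-and-flush logic, with a hypothetical `flush` callback standing in for `finalize_blob` (the real code also emits one empty chunk when a community has no listings at all):

```python
from typing import Callable, Iterable

def write_json_array_chunks(
    items: Iterable[bytes],
    max_size: int,
    flush: Callable[[bytes], None],
) -> None:
    content = bytearray()
    for item in items:
        if not content:
            content.extend(item)  # First item is added regardless of size.
        elif len(content) + len(item) + 2 > max_size:  # +2 for "[" and "]"
            flush(b"[" + content + b"]")  # Emit a complete JSON array.
            content = bytearray(item)
        else:
            content.extend(b"," + item)
    if content:
        flush(b"[" + content + b"]")
```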
@@ -231,14 +235,28 @@ def get_blob_content(cls, blob: DataBlob) -> List[Any]:
         return json.loads(f.read())


-def get_package_listings(community: Community) -> models.QuerySet["PackageListing"]:
-    listing_ids = get_package_listing_queryset(community.identifier).values_list(
-        "id",
-        flat=True,
+def get_package_listing_ids(community: Community) -> Iterable[List[int]]:
+    """
+    Iterate over the PackageListings in chunks to limit the amount of
+    data Django keeps in memory concurrently.
+    """
+    listing_ids = order_package_listing_queryset(
+        get_package_listing_base_queryset(community.identifier)
+    ).values_list("id", flat=True)
+
+    yield from batch(1000, listing_ids)
+
+
+def get_package_listing_chunk(
+    listing_ids: List[int],
+) -> models.QuerySet["PackageListing"]:
+    # Keep the ordering as it was when the whole id list was read.
+    ordering = models.Case(
+        *[models.When(id=id, then=pos) for pos, id in enumerate(listing_ids)]
+    )
     )
     listing_ref = PackageListing.objects.filter(pk=models.OuterRef("pk"))

-    return order_package_listing_queryset(
+    return (
         PackageListing.objects.filter(id__in=listing_ids)
         .select_related("community", "package", "package__owner")
         .prefetch_related("categories", "community__sites", "package__versions")
@@ -248,7 +266,8 @@ def get_package_listings(community: Community) -> models.QuerySet["PackageListing"]:
                     ratings=models.Count("package__package_ratings"),
                 ).values("ratings"),
             ),
-        ),
+        )
+        .order_by(ordering)
     )


@@ -268,7 +287,7 @@ def listing_to_json(listing: PackageListing) -> bytes:
"is_deprecated": listing.package.is_deprecated,
"has_nsfw_content": listing.has_nsfw_content,
"categories": [c.name for c in listing.categories.all()],
# TODO: god-awful performance from OVER NINE THOUSAAAAND database hits
# TODO: this generates awfully lot of database hits
"versions": [
{
"name": version.name,
