Optimise APIV1ChunkedPackageCache memory usage
Reduce the amount of data Django's ORM keeps in memory at the same time
by processing the PackageListings in smaller, consecutive QuerySets.
anttimaki committed Sep 3, 2024
1 parent dc985ea commit 01b309d
Showing 3 changed files with 69 additions and 25 deletions.
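
In outline, the change swaps one monolithic, fully-prefetched queryset for a two-phase read: fetch the ordered primary keys first, then hydrate and serialise the listings one bounded chunk at a time, so only one chunk's worth of model instances (and their prefetch caches) is alive at once. A rough standalone sketch of that shape; the function and names below are illustrative, not part of the repository:

```python
def process_in_chunks(queryset, handle, chunk_size=1000):
    # Phase 1: read only the primary keys; cheap to hold in memory.
    ids = list(queryset.values_list("id", flat=True))
    # Phase 2: hydrate full rows chunk by chunk; each iteration builds a
    # fresh queryset, so the previous chunk's instances can be collected.
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start : start + chunk_size]
        for obj in queryset.model.objects.filter(id__in=chunk):
            handle(obj)
```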
17 changes: 17 additions & 0 deletions django/thunderstore/repository/api/v1/tests/test_caches.py
@@ -1,6 +1,7 @@
 import gzip
 import json
 from datetime import timedelta
+from random import shuffle
 from typing import Any

 import pytest
@@ -17,6 +18,7 @@
     update_api_v1_chunked_package_caches,
 )
 from thunderstore.repository.models import APIV1ChunkedPackageCache, APIV1PackageCache
+from thunderstore.repository.models.cache import get_package_listing_chunk


 @pytest.mark.django_db
@@ -210,3 +212,18 @@ def test_api_v1_chunked_package_cache__drops_stale_caches() -> None:
     second_cache.refresh_from_db()
     assert first_cache.is_deleted
     assert not second_cache.is_deleted
+
+
+@pytest.mark.django_db
+@pytest.mark.parametrize("count", (0, 1, 2, 3, 5, 8, 13))
+def test_get_package_listing_chunk__retains_received_ordering(count: int) -> None:
+    assert not PackageListing.objects.exists()
+    for _ in range(count):
+        PackageListingFactory()
+
+    ordering = list(PackageListing.objects.all().values_list("id", flat=True))
+    shuffle(ordering)
+    listings = get_package_listing_chunk(ordering)
+
+    for i, listing in enumerate(listings):
+        assert listing.id == ordering[i]
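
The ordering assertion holds because `get_package_listing_chunk` reconstructs the received id order in SQL via `Case`/`When` (see the third file below). The same technique in isolation, as a minimal sketch for any Django model with an `id` field:

```python
from django.db import models

def filter_preserving_order(queryset, ids):
    # WHERE id IN (...) alone returns rows in database order; a CASE
    # expression mapping each id to its position in the received list
    # restores the caller's ordering.
    position = models.Case(
        *[models.When(id=pk, then=pos) for pos, pk in enumerate(ids)]
    )
    return queryset.filter(id__in=ids).order_by(position)
```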
14 changes: 11 additions & 3 deletions django/thunderstore/repository/cache.py
@@ -26,11 +26,19 @@ def order_package_listing_queryset(
     )


+def get_package_listing_base_queryset(
+    community_identifier: str,
+) -> QuerySet[PackageListing]:
+    return (
+        PackageListing.objects.active()
+        .filter_by_community_approval_rule()
+        .exclude(~Q(community__identifier=community_identifier))
+    )
+
+
 def get_package_listing_queryset(community_identifier: str) -> QuerySet[PackageListing]:
     return order_package_listing_queryset(
         prefetch_package_listing_queryset(
-            PackageListing.objects.active()
-            .filter_by_community_approval_rule()
-            .exclude(~Q(community__identifier=community_identifier)),
+            get_package_listing_base_queryset(community_identifier),
         ),
     )
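
Splitting the filters out into `get_package_listing_base_queryset` lets callers skip the prefetching when they only need ids. For instance, the cache code in the next file builds its ordered id list roughly like this (a usage sketch mirroring `get_package_listing_ids` below):

```python
from thunderstore.repository.cache import (
    get_package_listing_base_queryset,
    order_package_listing_queryset,
)

def ordered_listing_ids(community_identifier: str):
    qs = order_package_listing_queryset(
        get_package_listing_base_queryset(community_identifier)
    )
    # values_list avoids building model instances entirely.
    return qs.values_list("id", flat=True)
```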
63 changes: 41 additions & 22 deletions django/thunderstore/repository/models/cache.py
@@ -2,7 +2,7 @@
 import io
 import json
 from datetime import timedelta
-from typing import Any, List, Optional
+from typing import Any, Iterable, List, Optional

 from django.core.files.base import ContentFile
 from django.db import models
@@ -11,10 +11,11 @@
 from thunderstore.community.models import Community, PackageListing
 from thunderstore.core.mixins import S3FileMixin, SafeDeleteMixin
 from thunderstore.repository.cache import (
-    get_package_listing_queryset,
+    get_package_listing_base_queryset,
     order_package_listing_queryset,
 )
 from thunderstore.storage.models import DataBlob, DataBlobGroup
+from thunderstore.utils.batch import batch


 class APIExperimentalPackageIndexCache(S3FileMixin):
@@ -177,19 +178,22 @@ def finalize_blob() -> None:
                 content_encoding="gzip",
             )

-        for listing in get_package_listings(community):
-            listing_bytes = listing_to_json(listing)
-
-            # Always add the first listing regardless of the size limit.
-            if not chunk_content:
-                chunk_content.extend(listing_bytes)
-            # Start new blob if adding current chunk would exceed the size limit.
-            # +2 for opening and closing brackets
-            elif len(chunk_content) + len(listing_bytes) + 2 > uncompressed_blob_size:
-                finalize_blob()
-                chunk_content = bytearray(listing_bytes)
-            else:
-                chunk_content.extend(b"," + listing_bytes)
+        for listing_ids in get_package_listing_ids(community):
+            for listing in get_package_listing_chunk(listing_ids):
+                listing_bytes = listing_to_json(listing)
+
+                # Always add the first listing regardless of the size limit.
+                if not chunk_content:
+                    chunk_content.extend(listing_bytes)
+                # Start new blob if adding current chunk would exceed the size limit.
+                # +2 for opening and closing brackets
+                elif (
+                    len(chunk_content) + len(listing_bytes) + 2 > uncompressed_blob_size
+                ):
+                    finalize_blob()
+                    chunk_content = bytearray(listing_bytes)
+                else:
+                    chunk_content.extend(b"," + listing_bytes)

         if len(chunk_content) or not group.entries.exists():
             finalize_blob()
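
The bytearray bookkeeping above is easier to follow outside the ORM. A standalone sketch of the same accumulate-and-flush logic, with a hypothetical `flush` callback standing in for `finalize_blob` (the real code also emits one empty chunk when a community has no listings at all):

```python
from typing import Callable, Iterable

def write_json_array_chunks(
    items: Iterable[bytes],
    max_size: int,
    flush: Callable[[bytes], None],
) -> None:
    content = bytearray()
    for item in items:
        if not content:
            content.extend(item)  # First item is added regardless of size.
        elif len(content) + len(item) + 2 > max_size:  # +2 for "[" and "]"
            flush(b"[" + content + b"]")  # Emit a complete JSON array.
            content = bytearray(item)
        else:
            content.extend(b"," + item)
    if content:
        flush(b"[" + content + b"]")
```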
@@ -231,14 +235,28 @@ def get_blob_content(cls, blob: DataBlob) -> List[Any]:
         return json.loads(f.read())


-def get_package_listings(community: Community) -> models.QuerySet["PackageListing"]:
-    listing_ids = get_package_listing_queryset(community.identifier).values_list(
-        "id",
-        flat=True,
+def get_package_listing_ids(community: Community) -> Iterable[List[int]]:
+    """
+    Iterate over the PackageListings in chunks to limit the amount of
+    data Django keeps in memory concurrently.
+    """
+    listing_ids = order_package_listing_queryset(
+        get_package_listing_base_queryset(community.identifier)
+    ).values_list("id", flat=True)
+
+    yield from batch(1000, listing_ids)
+
+
+def get_package_listing_chunk(
+    listing_ids: List[int],
+) -> models.QuerySet["PackageListing"]:
+    # Keep the ordering as it was when the whole id list was read.
+    ordering = models.Case(
+        *[models.When(id=id, then=pos) for pos, id in enumerate(listing_ids)]
+    )
     )
     listing_ref = PackageListing.objects.filter(pk=models.OuterRef("pk"))

-    return order_package_listing_queryset(
+    return (
         PackageListing.objects.filter(id__in=listing_ids)
         .select_related("community", "package", "package__owner")
         .prefetch_related("categories", "community__sites", "package__versions")
@@ -248,7 +266,8 @@ def get_package_listings(community: Community) -> models.QuerySet["PackageListing"]:
                     ratings=models.Count("package__package_ratings"),
                 ).values("ratings"),
             ),
-        ),
+        )
+        .order_by(ordering)
     )


@@ -268,7 +287,7 @@ def listing_to_json(listing: PackageListing) -> bytes:
"is_deprecated": listing.package.is_deprecated,
"has_nsfw_content": listing.has_nsfw_content,
"categories": [c.name for c in listing.categories.all()],
# TODO: god-awful performance from OVER NINE THOUSAAAAND database hits
# TODO: this generates awfully lot of database hits
"versions": [
{
"name": version.name,
