Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(BA-674): Implement Image Soft/Hard Delete APIs #3628

Open
wants to merge 12 commits into
base: topic/02-10-feat_add_status_to_image_imagenode_gql_field
Choose a base branch
from
1 change: 1 addition & 0 deletions changes/3628.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Update `ForgetImage`, `ForgetImageById`, and `ClearImages` to perform soft delete, and add the `PurgeImage` and `PurgeImageById` APIs for hard delete.
16 changes: 16 additions & 0 deletions docs/manager/graphql-reference/schema.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -1907,6 +1907,12 @@
forget_image_by_id(image_id: String!): ForgetImageById
forget_image(architecture: String = "x86_64", reference: String!): ForgetImage

"""Added in 25.3.1"""
purge_image_by_id(image_id: String!): PurgeImageById

Check notice on line 1911 in docs/manager/graphql-reference/schema.graphql

View workflow job for this annotation

GitHub Actions / GraphQL Inspector

Field 'purge_image_by_id' was added to object type 'Mutations'

Field 'purge_image_by_id' was added to object type 'Mutations'

"""Added in 25.3.1"""
purge_image(architecture: String = "x86_64", reference: String!): PurgeImage

Check notice on line 1914 in docs/manager/graphql-reference/schema.graphql

View workflow job for this annotation

GitHub Actions / GraphQL Inspector

Field 'purge_image' was added to object type 'Mutations'

Field 'purge_image' was added to object type 'Mutations'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the difference between this and purge_image_by_id?

Copy link
Member Author

@jopemachine jopemachine Feb 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

purge_image is an API that deletes an image using its name as a string,
whereas purge_image_by_id directly deletes an image using its image_id (DB ID).
It follows the same structure as forget_image and forget_image_by_id.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like it would be nice to gradually deprecate the name-based APIs.


"""Added in 24.03.1"""
untag_image_from_registry(image_id: String!): UntagImageFromRegistry
alias_image(alias: String!, architecture: String = "x86_64", target: String!): AliasImage
Expand Down Expand Up @@ -2462,6 +2468,16 @@
image: ImageNode
}

"""Added in 25.3.1."""
type PurgeImageById {

Check notice on line 2472 in docs/manager/graphql-reference/schema.graphql

View workflow job for this annotation

GitHub Actions / GraphQL Inspector

Type 'PurgeImageById' was added

Type 'PurgeImageById' was added
image: ImageNode
}

"""Added in 25.3.1."""
type PurgeImage {

Check notice on line 2477 in docs/manager/graphql-reference/schema.graphql

View workflow job for this annotation

GitHub Actions / GraphQL Inspector

Type 'PurgeImage' was added

Type 'PurgeImage' was added
image: ImageNode
}

"""Added in 24.03.1"""
type UntagImageFromRegistry {
ok: Boolean
Expand Down
2 changes: 1 addition & 1 deletion src/ai/backend/client/cli/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def list(ctx: CLIContext, customized: bool) -> None:
@click.argument("reference_or_id", type=str)
@click.option("--arch", type=str, default=None, help="Set an explicit architecture.")
def forget(reference_or_id, arch):
"""Forget image from server. This command will only work for image customized by user
"""Mark image as deleted from server. This command will only work for image customized by user
unless callee has superadmin privileges.

REFERENCE_OR_ID: Canonical string of image (<registry>/<project>/<name>:<tag>)"""
Expand Down
6 changes: 6 additions & 0 deletions src/ai/backend/common/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,12 @@ def __init__(self, tags: Iterable[str], value: Optional[str] = None) -> None:
value = ""
self._data[key] = value

def __repr__(self):
    """Return the same text as ``str()`` for this object."""
    return str(self)

def __str__(self):
    """Return the internal ``_data`` mapping wrapped in ``PlatformTagSet(...)``."""
    rendered_data = str(self._data)
    return "PlatformTagSet(" + rendered_data + ")"

def has(self, key: str, version: Optional[str] = None):
if version is None:
return key in self._data
Expand Down
12 changes: 11 additions & 1 deletion src/ai/backend/manager/cli/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .image_impl import forget_image as forget_image_impl
from .image_impl import inspect_image as inspect_image_impl
from .image_impl import list_images as list_images_impl
from .image_impl import purge_image as purge_image_impl
from .image_impl import rescan_images as rescan_images_impl
from .image_impl import set_image_resource_limit as set_image_resource_limit_impl
from .image_impl import validate_image_alias as validate_image_alias_impl
Expand Down Expand Up @@ -51,10 +52,19 @@ def inspect(cli_ctx, canonical_or_alias, architecture) -> None:
@click.argument("architecture")
@click.pass_obj
def forget(cli_ctx, canonical_or_alias, architecture) -> None:
"""Forget (delete) a specific image."""
"""Forget (soft-delete) a specific image."""
asyncio.run(forget_image_impl(cli_ctx, canonical_or_alias, architecture))


@cli.command()
@click.argument("canonical_or_alias")
@click.argument("architecture")
@click.pass_obj
def purge(cli_ctx, canonical_or_alias, architecture) -> None:
    """Purge (hard-delete) a specific image."""
    # Build the coroutine first, then drive it to completion on a fresh event loop.
    purge_coro = purge_image_impl(cli_ctx, canonical_or_alias, architecture)
    asyncio.run(purge_coro)


@cli.command()
@click.argument("canonical_or_alias")
@click.argument("slot_type")
Expand Down
21 changes: 21 additions & 0 deletions src/ai/backend/manager/cli/image_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,27 @@ async def forget_image(cli_ctx, canonical_or_alias, architecture):
ImageAlias(canonical_or_alias),
],
)
await image_row.mark_as_deleted(session)
except UnknownImageReference:
log.exception("Image not found.")
except Exception:
log.exception("An error occurred.")


async def purge_image(cli_ctx, canonical_or_alias, architecture):
async with (
connect_database(cli_ctx.local_config) as db,
db.begin_session() as session,
):
try:
image_row = await ImageRow.resolve(
session,
[
ImageIdentifier(canonical_or_alias, architecture),
ImageAlias(canonical_or_alias),
],
load_only_active=False,
)
await session.delete(image_row)
except UnknownImageReference:
log.exception("Image not found.")
Expand Down
4 changes: 4 additions & 0 deletions src/ai/backend/manager/models/gql.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@
ImagePermissionValueField,
ModifyImage,
PreloadImage,
PurgeImage,
PurgeImageById,
RescanImages,
UnloadImage,
UntagImageFromRegistry,
Expand Down Expand Up @@ -295,6 +297,8 @@ class Mutations(graphene.ObjectType):
modify_image = ModifyImage.Field()
forget_image_by_id = ForgetImageById.Field(description="Added in 24.03.0")
forget_image = ForgetImage.Field()
purge_image_by_id = PurgeImageById.Field(description="Added in 25.3.1")
purge_image = PurgeImage.Field(description="Added in 25.3.1")
untag_image_from_registry = UntagImageFromRegistry.Field(description="Added in 24.03.1")
alias_image = AliasImage.Field()
dealias_image = DealiasImage.Field()
Expand Down
20 changes: 20 additions & 0 deletions src/ai/backend/manager/models/gql_models/base.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
from __future__ import annotations

from typing import Any, Optional
from uuid import UUID

import graphene
import graphql
from graphene.types import Scalar
from graphene.types.scalars import MAX_INT, MIN_INT
from graphql.language.ast import IntValueNode

from ...api.exceptions import ObjectNotFound
from ..gql_relay import AsyncNode

SAFE_MIN_INT = -9007199254740991
SAFE_MAX_INT = 9007199254740991

Expand Down Expand Up @@ -89,3 +93,19 @@ class ImageRefType(graphene.InputObjectType):
name = graphene.String(required=True)
registry = graphene.String()
architecture = graphene.String()


def extract_object_uuid(info: graphene.ResolveInfo, global_id: str, object_name: str) -> UUID:
    """
    Converts a GraphQL global ID to its corresponding UUID.

    The ID is first passed through ``AsyncNode.resolve_global_id``. When that
    yields an empty raw ID, ``global_id`` itself is parsed as the UUID string.

    Args:
        info: Resolver info forwarded to ``AsyncNode.resolve_global_id``.
        global_id: The ID string received from the client.
        object_name: Name handed to ``ObjectNotFound`` when parsing fails.

    Returns:
        The parsed UUID.

    Raises:
        ObjectNotFound: If the resulting ID string is not a valid UUID.
    """

    _, raw_id = AsyncNode.resolve_global_id(info, global_id)
    if not raw_id:
        raw_id = global_id

    try:
        return UUID(raw_id)
    except ValueError as e:
        # Chain explicitly so the original parsing failure stays attached as the cause.
        raise ObjectNotFound(object_name) from e
143 changes: 95 additions & 48 deletions src/ai/backend/manager/models/gql_models/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from ai.backend.manager.models.rbac.context import ClientContext
from ai.backend.manager.models.rbac.permission_defs import ImagePermission

from ...api.exceptions import ImageNotFound, ObjectNotFound
from ...api.exceptions import GenericForbidden, ImageNotFound, ObjectNotFound
from ...defs import DEFAULT_IMAGE_ARCH
from ..base import (
FilterExprArg,
Expand All @@ -57,6 +57,7 @@
ImageIdentifier,
ImageLoadFilter,
ImageRow,
ImageStatus,
ImageType,
get_permission_ctx,
rescan_images,
Expand All @@ -69,6 +70,7 @@
KVPairInput,
ResourceLimit,
ResourceLimitInput,
extract_object_uuid,
)

if TYPE_CHECKING:
Expand All @@ -85,6 +87,8 @@
"RescanImages",
"ForgetImage",
"ForgetImageById",
"PurgeImage",
"PurgeImageById",
"UntagImageFromRegistry",
"ModifyImage",
"AliasImage",
Expand Down Expand Up @@ -668,16 +672,9 @@ async def mutate(
info: graphene.ResolveInfo,
image_id: str,
) -> ForgetImageById:
_, raw_image_id = AsyncNode.resolve_global_id(info, image_id)
if not raw_image_id:
raw_image_id = image_id

try:
_image_id = UUID(raw_image_id)
except ValueError:
raise ObjectNotFound("image")

log.info("forget image {0} by API request", image_id)
_image_id = extract_object_uuid(info, image_id, "image")

ctx: GraphQueryContext = info.context
client_role = ctx.user["role"]

Expand All @@ -686,15 +683,9 @@ async def mutate(
if not image_row:
raise ObjectNotFound("image")
if client_role != UserRole.SUPERADMIN:
customized_image_owner = (image_row.labels or {}).get(
"ai.backend.customized-image.owner"
)
if (
not customized_image_owner
or customized_image_owner != f"user:{ctx.user['uuid']}"
):
if not image_row.is_customized_by(ctx.user["uuid"]):
return ForgetImageById(ok=False, msg="Forbidden")
await session.delete(image_row)
await image_row.mark_as_deleted(session)
return ForgetImageById(ok=True, msg="", image=ImageNode.from_row(image_row))


Expand Down Expand Up @@ -733,18 +724,90 @@ async def mutate(
],
)
if client_role != UserRole.SUPERADMIN:
customized_image_owner = (image_row.labels or {}).get(
"ai.backend.customized-image.owner"
)
if (
not customized_image_owner
or customized_image_owner != f"user:{ctx.user['uuid']}"
):
if not image_row.is_customized_by(ctx.user["uuid"]):
return ForgetImage(ok=False, msg="Forbidden")
await session.delete(image_row)
await image_row.mark_as_deleted(session)
return ForgetImage(ok=True, msg="", image=ImageNode.from_row(image_row))


class PurgeImage(graphene.Mutation):
    """Added in 25.3.1."""

    allowed_roles = (
        UserRole.SUPERADMIN,
        UserRole.ADMIN,
        UserRole.USER,
    )

    class Arguments:
        reference = graphene.String(required=True)
        architecture = graphene.String(default_value=DEFAULT_IMAGE_ARCH)

    # The only output field; this payload has no `ok` or `msg` fields.
    image = graphene.Field(ImageNode)

    @staticmethod
    async def mutate(
        root: Any,
        info: graphene.ResolveInfo,
        reference: str,
        architecture: str,
    ) -> PurgeImage:
        """
        Delete the image row resolved from ``reference`` and ``architecture``.

        The reference is tried both as an image identifier (together with the
        architecture) and as an alias. Callers that are not superadmins may only
        purge an image for which ``is_customized_by`` accepts their user UUID.

        Raises:
            GenericForbidden: If a non-superadmin caller does not own the image.
        """
        log.info("purge image {0} by API request", reference)
        ctx: GraphQueryContext = info.context
        client_role = ctx.user["role"]

        async with ctx.db.begin_session() as session:
            # NOTE(review): the manager CLI's purge passes load_only_active=False to
            # ImageRow.resolve; confirm whether soft-deleted images should also be
            # purgeable by reference here.
            image_row = await ImageRow.resolve(
                session,
                [
                    ImageIdentifier(reference, architecture),
                    ImageAlias(reference),
                ],
            )
            if client_role != UserRole.SUPERADMIN and not image_row.is_customized_by(
                ctx.user["uuid"]
            ):
                # This payload declares only `image`, so a rejection cannot be
                # expressed as PurgeImage(ok=False, msg=...); raise instead, as
                # PurgeImageById does.
                raise GenericForbidden("Image is not owned by your account.")
            await session.delete(image_row)
            return PurgeImage(image=ImageNode.from_row(image_row))


class PurgeImageById(graphene.Mutation):
    """Added in 25.3.1."""

    allowed_roles = (
        UserRole.SUPERADMIN,
        UserRole.ADMIN,
        UserRole.USER,
    )

    class Arguments:
        image_id = graphene.String(required=True)

    image = graphene.Field(ImageNode)

    @staticmethod
    async def mutate(
        root: Any,
        info: graphene.ResolveInfo,
        image_id: str,
    ) -> PurgeImageById:
        """
        Delete the image row whose ID is given by ``image_id``.

        Raises ``ObjectNotFound`` when no row matches, and ``GenericForbidden``
        when a non-superadmin caller is not accepted by ``is_customized_by``.
        """
        log.info("purge image {0} by API request", image_id)
        # Distinct name for the parsed UUID so it is not confused with the raw argument.
        image_uuid = extract_object_uuid(info, image_id, "image")

        ctx: GraphQueryContext = info.context
        requester_role = ctx.user["role"]

        async with ctx.db.begin_session() as session:
            image_row = await ImageRow.get(session, image_uuid, load_aliases=True)
            if not image_row:
                raise ObjectNotFound("image")
            if requester_role != UserRole.SUPERADMIN and not image_row.is_customized_by(
                ctx.user["uuid"]
            ):
                raise GenericForbidden("Image is not owned by your account.")
            await session.delete(image_row)
            return PurgeImageById(image=ImageNode.from_row(image_row))


class UntagImageFromRegistry(graphene.Mutation):
"""Added in 24.03.1"""

Expand All @@ -769,14 +832,7 @@ async def mutate(
) -> UntagImageFromRegistry:
from ai.backend.manager.container_registry.harbor import HarborRegistry_v2

_, raw_image_id = AsyncNode.resolve_global_id(info, image_id)
if not raw_image_id:
raw_image_id = image_id

try:
_image_id = UUID(raw_image_id)
except ValueError:
raise ObjectNotFound("image")
_image_id = extract_object_uuid(info, image_id, "image")

log.info("remove image from registry {0} by API request", str(_image_id))
ctx: GraphQueryContext = info.context
Expand All @@ -787,13 +843,7 @@ async def mutate(
if not image_row:
raise ImageNotFound
if client_role != UserRole.SUPERADMIN:
customized_image_owner = (image_row.labels or {}).get(
"ai.backend.customized-image.owner"
)
if (
not customized_image_owner
or customized_image_owner != f"user:{ctx.user['uuid']}"
):
if not image_row.is_customized_by(ctx.user["uuid"]):
return UntagImageFromRegistry(ok=False, msg="Forbidden")

query = sa.select(ContainerRegistryRow).where(
Expand Down Expand Up @@ -968,15 +1018,12 @@ async def mutate(
ctx: GraphQueryContext = info.context
try:
async with ctx.db.begin_session() as session:
result = await session.execute(
sa.select(ImageRow).where(ImageRow.registry == registry)
)
image_ids = [x.id for x in result.scalars().all()]

await session.execute(
sa.delete(ImageAliasRow).where(ImageAliasRow.image_id.in_(image_ids))
sa.update(ImageRow)
.where(ImageRow.registry == registry)
.where(ImageRow.status != ImageStatus.DELETED)
.values(status=ImageStatus.DELETED)
)
await session.execute(sa.delete(ImageRow).where(ImageRow.registry == registry))
except ValueError as e:
return ClearImages(ok=False, msg=str(e))
return ClearImages(ok=True, msg="")
Expand Down
Loading
Loading