Skip to content

feat(BA-674): Implement Image Soft/Hard Delete APIs #3628

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
bba1fc5
docs: Add news fragment
jopemachine Feb 10, 2025
9ebf186
fix: Rename fragment file
jopemachine Feb 17, 2025
bbba2a9
docs: Rename news fragment
jopemachine Feb 18, 2025
8168bae
docs: Add news fragment
jopemachine Feb 10, 2025
a8d2f24
misc: Rename news fragment file
jopemachine Feb 18, 2025
2cec077
feat: Implement Image Soft/Hard Delete APIs
jopemachine Feb 10, 2025
0c535d4
docs: Add news fragment
jopemachine Feb 10, 2025
e749a59
feat: Make ClearImages to perform soft-delete
jopemachine Feb 11, 2025
0dfbe09
docs: Rename news fragment
jopemachine Feb 11, 2025
f4ddd67
fix: Remove ok, msg from PurgeImage mutation
jopemachine Feb 11, 2025
29472d7
feat: Add `purge` image cli
jopemachine Feb 11, 2025
ed6242a
misc: Rename news fragment file
jopemachine Feb 18, 2025
6e63a76
misc: Update milestone
jopemachine Feb 18, 2025
8786833
refactor: use `ImageRow.is_customized_by`
jopemachine Feb 19, 2025
2f7f7a3
refactor: use `extract_object_uuid`
jopemachine Feb 19, 2025
e0240c8
refactor: use `ImageRow.mark_as_deleted`
jopemachine Feb 19, 2025
cdea8f7
refactor: simplify `ImageRow.is_customized_by`
jopemachine Feb 19, 2025
85faff3
fix: Reflect feedbacks
jopemachine Feb 20, 2025
3c6f602
docs: Rename news fragment
jopemachine Feb 20, 2025
5d94ca4
misc: `25.03.1` -> `25.3.1`
jopemachine Feb 21, 2025
a639518
misc: `25.4.0`
jopemachine Mar 4, 2025
cb48920
fix: Add missing `ctx`
jopemachine Mar 4, 2025
fef0fd3
fix: Log error
jopemachine Mar 4, 2025
b5b855e
fix: Merge conflict
jopemachine Mar 4, 2025
b472999
fix: Merge conflict
jopemachine Mar 4, 2025
d5c12ab
fix: Merge conflict
jopemachine Mar 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/3628.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Update `ForgetImage`, `ForgetImageById`, `ClearImages` to perform soft delete and add `PurgeImageById` API for hard delete.
13 changes: 12 additions & 1 deletion docs/manager/graphql-reference/schema.graphql
Original file line number Diff line number Diff line change
@@ -1942,7 +1942,12 @@

"""Added in 24.03.0"""
forget_image_by_id(image_id: String!): ForgetImageById
forget_image(architecture: String = "x86_64", reference: String!): ForgetImage

"""Deprecated since 25.4.0. Use `forget_image_by_id` instead."""
forget_image(architecture: String = "x86_64", reference: String!): ForgetImage @deprecated(reason: "Deprecated since 25.4.0. Use `forget_image_by_id` instead.")

Check notice on line 1947 in docs/manager/graphql-reference/schema.graphql

GitHub Actions / GraphQL Inspector

Field 'Mutations.forget_image' has description 'Deprecated since 25.4.0. Use `forget_image_by_id` instead.'

Field 'Mutations.forget_image' has description 'Deprecated since 25.4.0. Use `forget_image_by_id` instead.'

Check notice on line 1947 in docs/manager/graphql-reference/schema.graphql

GitHub Actions / GraphQL Inspector

Field 'Mutations.forget_image' is deprecated

Field 'Mutations.forget_image' is deprecated

Check notice on line 1947 in docs/manager/graphql-reference/schema.graphql

GitHub Actions / GraphQL Inspector

Field 'Mutations.forget_image' has deprecation reason 'Deprecated since 25.4.0. Use `forget_image_by_id` instead.'

Field 'Mutations.forget_image' has deprecation reason 'Deprecated since 25.4.0. Use `forget_image_by_id` instead.'

"""Added in 25.4.0"""
purge_image_by_id(image_id: String!): PurgeImageById

Check notice on line 1950 in docs/manager/graphql-reference/schema.graphql

GitHub Actions / GraphQL Inspector

Field 'purge_image_by_id' was added to object type 'Mutations'

Field 'purge_image_by_id' was added to object type 'Mutations'

"""Added in 24.03.1"""
untag_image_from_registry(image_id: String!): UntagImageFromRegistry
@@ -2496,7 +2501,8 @@
image: ImageNode
}

"""Deprecated since 25.4.0. Use `forget_image_by_id` instead."""
type ForgetImage {

Check notice on line 2505 in docs/manager/graphql-reference/schema.graphql

GitHub Actions / GraphQL Inspector

Object type 'ForgetImage' has description 'Deprecated since 25.4.0. Use `forget_image_by_id` instead.'

Object type 'ForgetImage' has description 'Deprecated since 25.4.0. Use `forget_image_by_id` instead.'
ok: Boolean
msg: String

@@ -2504,6 +2510,11 @@
image: ImageNode
}

"""Added in 25.4.0."""
type PurgeImageById {

Check notice on line 2514 in docs/manager/graphql-reference/schema.graphql

GitHub Actions / GraphQL Inspector

Type 'PurgeImageById' was added

Type 'PurgeImageById' was added
image: ImageNode
}

"""Added in 24.03.1"""
type UntagImageFromRegistry {
ok: Boolean
2 changes: 1 addition & 1 deletion src/ai/backend/client/cli/image.py
Original file line number Diff line number Diff line change
@@ -68,7 +68,7 @@ def list(ctx: CLIContext, customized: bool) -> None:
@click.argument("reference_or_id", type=str)
@click.option("--arch", type=str, default=None, help="Set an explicit architecture.")
def forget(reference_or_id, arch):
"""Forget image from server. This command will only work for image customized by user
"""Mark image as deleted from server. This command will only work for image customized by user
unless callee has superadmin privileges.

REFERENCE_OR_ID: Canonical string of image (<registry>/<project>/<name>:<tag>)"""
6 changes: 6 additions & 0 deletions src/ai/backend/common/docker.py
Original file line number Diff line number Diff line change
@@ -329,6 +329,12 @@ def __init__(self, tags: Iterable[str], value: Optional[str] = None) -> None:
value = ""
self._data[key] = value

def __repr__(self):
    """Return the same text as ``__str__`` so both renderings agree."""
    return self.__str__()

def __str__(self):
    """Render the tag set as ``PlatformTagSet(<underlying mapping>)``."""
    # ``!s`` applies str() to the internal mapping, as the explicit call did.
    return f"PlatformTagSet({self._data!s})"

def has(self, key: str, version: Optional[str] = None):
if version is None:
return key in self._data
12 changes: 11 additions & 1 deletion src/ai/backend/manager/cli/image.py
Original file line number Diff line number Diff line change
@@ -15,6 +15,7 @@
from .image_impl import forget_image as forget_image_impl
from .image_impl import inspect_image as inspect_image_impl
from .image_impl import list_images as list_images_impl
from .image_impl import purge_image as purge_image_impl
from .image_impl import rescan_images as rescan_images_impl
from .image_impl import set_image_resource_limit as set_image_resource_limit_impl
from .image_impl import validate_image_alias as validate_image_alias_impl
@@ -51,10 +52,19 @@ def inspect(cli_ctx, canonical_or_alias, architecture) -> None:
@click.argument("architecture")
@click.pass_obj
def forget(cli_ctx, canonical_or_alias, architecture) -> None:
"""Forget (delete) a specific image."""
"""Forget (soft-delete) a specific image."""
asyncio.run(forget_image_impl(cli_ctx, canonical_or_alias, architecture))


@cli.command()
@click.argument("canonical_or_alias")
@click.argument("architecture")
@click.pass_obj
def purge(cli_ctx, canonical_or_alias, architecture) -> None:
    """Purge (hard-delete) a specific image."""
    # Build the coroutine first, then drive it to completion on a fresh
    # event loop; the actual work lives in the async implementation.
    purge_coro = purge_image_impl(cli_ctx, canonical_or_alias, architecture)
    asyncio.run(purge_coro)


@cli.command()
@click.argument("canonical_or_alias")
@click.argument("slot_type")
53 changes: 37 additions & 16 deletions src/ai/backend/manager/cli/image_impl.py
Original file line number Diff line number Diff line change
@@ -81,8 +81,8 @@ async def _build_smembers_pipeline(redis: Redis) -> Pipeline:
pprint(item)
if short:
print(tabulate(displayed_items, tablefmt="plain"))
except Exception:
log.exception("An error occurred.")
except Exception as e:
log.exception(f"An error occurred. Error: {e}")


async def inspect_image(cli_ctx, canonical_or_alias, architecture):
@@ -101,8 +101,8 @@ async def inspect_image(cli_ctx, canonical_or_alias, architecture):
pprint(await image_row.inspect())
except UnknownImageReference:
log.exception("Image not found.")
except Exception:
log.exception("An error occurred.")
except Exception as e:
log.exception(f"An error occurred. Error: {e}")


async def forget_image(cli_ctx, canonical_or_alias, architecture):
@@ -118,11 +118,32 @@ async def forget_image(cli_ctx, canonical_or_alias, architecture):
ImageAlias(canonical_or_alias),
],
)
await image_row.mark_as_deleted(session)
except UnknownImageReference:
log.exception("Image not found.")
except Exception as e:
log.exception(f"An error occurred. Error: {e}")


async def purge_image(cli_ctx, canonical_or_alias, architecture):
    """Hard-delete the image row matching a canonical name or an alias.

    The image is looked up through ``ImageRow.resolve`` using both an
    ``ImageIdentifier(canonical_or_alias, architecture)`` and an
    ``ImageAlias(canonical_or_alias)`` candidate, and the resolved row is
    removed with ``session.delete``.

    Errors are logged rather than propagated: an ``UnknownImageReference``
    is reported as "Image not found.", and any other exception is logged
    together with its message.
    """
    async with (
        connect_database(cli_ctx.local_config) as db,
        db.begin_session() as session,
    ):
        try:
            image_row = await ImageRow.resolve(
                session,
                [
                    ImageIdentifier(canonical_or_alias, architecture),
                    ImageAlias(canonical_or_alias),
                ],
                # NOTE(review): presumably lets already soft-deleted images be
                # resolved so they can be purged — confirm against ImageRow.resolve.
                load_only_active=False,
            )
            await session.delete(image_row)
        except UnknownImageReference:
            log.exception("Image not found.")
        # A single catch-all handler: a second ``except Exception`` clause
        # after this one could never run.
        except Exception as e:
            log.exception(f"An error occurred. Error: {e}")


async def set_image_resource_limit(
@@ -147,8 +168,8 @@ async def set_image_resource_limit(
await image_row.set_resource_limit(slot_type, range_value)
except UnknownImageReference:
log.exception("Image not found.")
except Exception:
log.exception("An error occurred.")
except Exception as e:
log.exception(f"An error occurred. Error: {e}")


async def rescan_images(
@@ -161,8 +182,8 @@ async def rescan_images(
):
try:
await rescan_images_func(db, registry_or_image, project)
except Exception:
log.exception("An error occurred.")
except Exception as e:
log.exception(f"An error occurred. Error: {e}")


async def alias(cli_ctx, alias, target, architecture):
@@ -180,8 +201,8 @@ async def alias(cli_ctx, alias, target, architecture):
await ImageAliasRow.create(session, alias, image_row)
except UnknownImageReference:
log.exception("Image not found.")
except Exception:
log.exception("An error occurred.")
except Exception as e:
log.exception(f"An error occurred. Error: {e}")


async def dealias(cli_ctx, alias):
@@ -213,8 +234,8 @@ async def validate_image_alias(cli_ctx, alias: str) -> None:

except UnknownImageReference:
log.error(f"No images were found with alias: {alias}")
except Exception:
log.exception("An error occurred.")
except Exception as e:
log.exception(f"An error occurred. Error: {e}")


async def validate_image_canonical(
@@ -257,5 +278,5 @@ async def validate_image_canonical(

except UnknownImageReference as e:
log.error(f"{e}")
except Exception:
log.exception("An error occurred.")
except Exception as e:
log.exception(f"An error occurred. Error: {e}")
6 changes: 5 additions & 1 deletion src/ai/backend/manager/models/gql.py
Original file line number Diff line number Diff line change
@@ -119,6 +119,7 @@
ImagePermissionValueField,
ModifyImage,
PreloadImage,
PurgeImageById,
RescanImages,
UnloadImage,
UntagImageFromRegistry,
@@ -296,7 +297,10 @@ class Mutations(graphene.ObjectType):
unload_image = UnloadImage.Field()
modify_image = ModifyImage.Field()
forget_image_by_id = ForgetImageById.Field(description="Added in 24.03.0")
forget_image = ForgetImage.Field()
forget_image = ForgetImage.Field(
deprecation_reason="Deprecated since 25.4.0. Use `forget_image_by_id` instead."
)
purge_image_by_id = PurgeImageById.Field(description="Added in 25.4.0")
untag_image_from_registry = UntagImageFromRegistry.Field(description="Added in 24.03.1")
alias_image = AliasImage.Field()
dealias_image = DealiasImage.Field()
20 changes: 20 additions & 0 deletions src/ai/backend/manager/models/gql_models/base.py
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@

import uuid
from typing import Any, Optional
from uuid import UUID

import graphene
import graphql
@@ -11,6 +12,9 @@
from graphql.language.ast import FloatValueNode, IntValueNode, ObjectValueNode, ValueNode
from graphql.language.printer import print_ast

from ...api.exceptions import ObjectNotFound
from ..gql_relay import AsyncNode

SAFE_MIN_INT = -9007199254740991
SAFE_MAX_INT = 9007199254740991

@@ -151,3 +155,19 @@ def parse_value(value: Any) -> dict[str, float]:
raise GraphQLError(f"UUIDFloatMap cannot represent value {v} as a float")
validated[key_str] = v
return validated


def extract_object_uuid(info: graphene.ResolveInfo, global_id: str, object_name: str) -> UUID:
    """
    Resolve a GraphQL global ID into a UUID.

    The ID part returned by ``AsyncNode.resolve_global_id`` is used when it is
    non-empty; otherwise ``global_id`` itself is parsed as the UUID string.
    Raises ``ObjectNotFound(object_name)`` when the chosen value is not a
    valid UUID.
    """
    _node_type, resolved_id = AsyncNode.resolve_global_id(info, global_id)
    # Fall back to the caller-supplied string when no ID part was resolved.
    candidate = resolved_id or global_id

    try:
        object_uuid = UUID(candidate)
    except ValueError:
        raise ObjectNotFound(object_name)
    return object_uuid
111 changes: 60 additions & 51 deletions src/ai/backend/manager/models/gql_models/image.py
Original file line number Diff line number Diff line change
@@ -43,7 +43,7 @@
from ai.backend.manager.models.rbac.context import ClientContext
from ai.backend.manager.models.rbac.permission_defs import ImagePermission

from ...api.exceptions import ImageNotFound, ObjectNotFound
from ...api.exceptions import GenericForbidden, ImageNotFound, ObjectNotFound
from ...defs import DEFAULT_IMAGE_ARCH
from ..base import (
FilterExprArg,
@@ -58,6 +58,7 @@
ImageIdentifier,
ImageLoadFilter,
ImageRow,
ImageStatus,
ImageType,
get_permission_ctx,
rescan_images,
@@ -70,6 +71,7 @@
KVPairInput,
ResourceLimit,
ResourceLimitInput,
extract_object_uuid,
)

if TYPE_CHECKING:
@@ -86,6 +88,7 @@
"RescanImages",
"ForgetImage",
"ForgetImageById",
"PurgeImageById",
"UntagImageFromRegistry",
"ModifyImage",
"AliasImage",
@@ -675,37 +678,28 @@ async def mutate(
info: graphene.ResolveInfo,
image_id: str,
) -> ForgetImageById:
_, raw_image_id = AsyncNode.resolve_global_id(info, image_id)
if not raw_image_id:
raw_image_id = image_id

try:
_image_id = UUID(raw_image_id)
except ValueError:
raise ObjectNotFound("image")

log.info("forget image {0} by API request", image_id)
image_uuid = extract_object_uuid(info, image_id, "image")

ctx: GraphQueryContext = info.context
client_role = ctx.user["role"]

async with ctx.db.begin_session() as session:
image_row = await ImageRow.get(session, _image_id, load_aliases=True)
image_row = await ImageRow.get(session, image_uuid, load_aliases=True)
if not image_row:
raise ObjectNotFound("image")
if client_role != UserRole.SUPERADMIN:
customized_image_owner = (image_row.labels or {}).get(
"ai.backend.customized-image.owner"
)
if (
not customized_image_owner
or customized_image_owner != f"user:{ctx.user['uuid']}"
):
if not image_row.is_customized_by(ctx.user["uuid"]):
return ForgetImageById(ok=False, msg="Forbidden")
await session.delete(image_row)
await image_row.mark_as_deleted(session)
return ForgetImageById(ok=True, msg="", image=ImageNode.from_row(ctx, image_row))


class ForgetImage(graphene.Mutation):
"""
Deprecated since 25.4.0. Use `forget_image_by_id` instead.
"""

allowed_roles = (
UserRole.SUPERADMIN,
UserRole.ADMIN,
@@ -740,18 +734,49 @@ async def mutate(
],
)
if client_role != UserRole.SUPERADMIN:
customized_image_owner = (image_row.labels or {}).get(
"ai.backend.customized-image.owner"
)
if (
not customized_image_owner
or customized_image_owner != f"user:{ctx.user['uuid']}"
):
if not image_row.is_customized_by(ctx.user["uuid"]):
return ForgetImage(ok=False, msg="Forbidden")
await session.delete(image_row)
await image_row.mark_as_deleted(session)
return ForgetImage(ok=True, msg="", image=ImageNode.from_row(ctx, image_row))


class PurgeImageById(graphene.Mutation):
    """Added in 25.4.0."""

    # Roles permitted to invoke this mutation.
    allowed_roles = (
        UserRole.SUPERADMIN,
        UserRole.ADMIN,
        UserRole.USER,
    )

    class Arguments:
        image_id = graphene.String(required=True)

    # The image that was removed, returned to the caller.
    image = graphene.Field(ImageNode)

    @staticmethod
    async def mutate(
        root: Any,
        info: graphene.ResolveInfo,
        image_id: str,
    ) -> PurgeImageById:
        log.info("purge image {0} by API request", image_id)
        image_uuid = extract_object_uuid(info, image_id, "image")

        ctx: GraphQueryContext = info.context
        requester_is_superadmin = ctx.user["role"] == UserRole.SUPERADMIN

        async with ctx.db.begin_session() as session:
            image_row = await ImageRow.get(session, image_uuid, load_aliases=True)
            if not image_row:
                raise ObjectNotFound("image")
            # Non-superadmins may only purge images they customized themselves.
            if not requester_is_superadmin and not image_row.is_customized_by(
                ctx.user["uuid"]
            ):
                raise GenericForbidden("Image is not owned by your account.")
            await session.delete(image_row)
            return PurgeImageById(image=ImageNode.from_row(ctx, image_row))


class UntagImageFromRegistry(graphene.Mutation):
"""Added in 24.03.1"""

@@ -776,31 +801,18 @@ async def mutate(
) -> UntagImageFromRegistry:
from ai.backend.manager.container_registry.harbor import HarborRegistry_v2

_, raw_image_id = AsyncNode.resolve_global_id(info, image_id)
if not raw_image_id:
raw_image_id = image_id
image_uuid = extract_object_uuid(info, image_id, "image")

try:
_image_id = UUID(raw_image_id)
except ValueError:
raise ObjectNotFound("image")

log.info("remove image from registry {0} by API request", str(_image_id))
log.info("remove image from registry {0} by API request", str(image_uuid))
ctx: GraphQueryContext = info.context
client_role = ctx.user["role"]

async with ctx.db.begin_readonly_session() as session:
image_row = await ImageRow.get(session, _image_id, load_aliases=True)
image_row = await ImageRow.get(session, image_uuid, load_aliases=True)
if not image_row:
raise ImageNotFound
if client_role != UserRole.SUPERADMIN:
customized_image_owner = (image_row.labels or {}).get(
"ai.backend.customized-image.owner"
)
if (
not customized_image_owner
or customized_image_owner != f"user:{ctx.user['uuid']}"
):
if not image_row.is_customized_by(ctx.user["uuid"]):
return UntagImageFromRegistry(ok=False, msg="Forbidden")

query = sa.select(ContainerRegistryRow).where(
@@ -975,15 +987,12 @@ async def mutate(
ctx: GraphQueryContext = info.context
try:
async with ctx.db.begin_session() as session:
result = await session.execute(
sa.select(ImageRow).where(ImageRow.registry == registry)
)
image_ids = [x.id for x in result.scalars().all()]

await session.execute(
sa.delete(ImageAliasRow).where(ImageAliasRow.image_id.in_(image_ids))
sa.update(ImageRow)
.where(ImageRow.registry == registry)
.where(ImageRow.status != ImageStatus.DELETED)
.values(status=ImageStatus.DELETED)
)
await session.execute(sa.delete(ImageRow).where(ImageRow.registry == registry))
except ValueError as e:
return ClearImages(ok=False, msg=str(e))
return ClearImages(ok=True, msg="")
7 changes: 7 additions & 0 deletions src/ai/backend/manager/models/image.py
Original file line number Diff line number Diff line change
@@ -696,6 +696,10 @@ async def inspect(self) -> Mapping[str, Any]:
parsed_image_info["reverse_aliases"] = [x.alias for x in self.aliases]
return parsed_image_info

async def mark_as_deleted(self, db_session: AsyncSession) -> None:
    """Soft-delete this image by setting its status to ``ImageStatus.DELETED``.

    The row itself is not removed; only the status attribute changes, and the
    given session is flushed afterwards.
    """
    self.status = ImageStatus.DELETED
    # Flush so the pending status change is sent through the session now.
    await db_session.flush()

def set_resource_limit(
self,
slot_type: str,
@@ -711,6 +715,9 @@ def set_resource_limit(

self.resources = resources

def is_customized_by(self, user_id: str) -> bool:
    """Tell whether this image's customized-image owner label names ``user_id``.

    Returns ``False`` when the image has no labels or no owner label.
    """
    owner_label = (self.labels or {}).get("ai.backend.customized-image.owner")
    expected_owner = f"user:{user_id}"
    return owner_label == expected_owner


async def bulk_get_image_configs(
image_refs: Iterable[ImageRef],