diff --git a/examples/README.md b/examples/README.md index 924d1017d..842286b2d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -16,15 +16,25 @@ + + Projects + Open In Github + Open In Colab + Ontologies Open In Github Open In Colab - Quick Start - Open In Github - Open In Colab + Batches + Open In Github + Open In Colab + + + Custom Embeddings + Open In Github + Open In Colab Data Rows @@ -37,25 +47,15 @@ Open In Colab - Batches - Open In Github - Open In Colab - - - Projects - Open In Github - Open In Colab + Quick Start + Open In Github + Open In Colab Data Row Metadata Open In Github Open In Colab - - Custom Embeddings - Open In Github - Open In Colab - User Management Open In Github @@ -75,25 +75,25 @@ + + Export Data + Open In Github + Open In Colab + Export V1 to V2 Migration Support Open In Github Open In Colab - - Exporting to CSV - Open In Github - Open In Colab - Composite Mask Export Open In Github Open In Colab - Export Data - Open In Github - Open In Colab + Exporting to CSV + Open In Github + Open In Colab @@ -143,36 +143,11 @@ - - Tiled - Open In Github - Open In Colab - Text Open In Github Open In Colab - - PDF - Open In Github - Open In Colab - - - Video - Open In Github - Open In Colab - - - Audio - Open In Github - Open In Colab - - - Conversational - Open In Github - Open In Colab - HTML Open In Github @@ -188,11 +163,36 @@ Open In Github Open In Colab + + Video + Open In Github + Open In Colab + + + Audio + Open In Github + Open In Colab + Conversational LLM Open In Github Open In Colab + + Tiled + Open In Github + Open In Colab + + + PDF + Open In Github + Open In Colab + + + Conversational + Open In Github + Open In Colab + @@ -208,9 +208,9 @@ - Langchain - Open In Github - Open In Colab + Meta SAM + Open In Github + Open In Colab Meta SAM Video @@ -218,20 +218,20 @@ Open In Colab - Meta SAM - Open In Github - Open In Colab + Huggingface Custom Embeddings + Open In Github + Open In Colab + + + Langchain + Open In Github + Open In Colab Import YOLOv8 Annotations Open In Github Open In Colab - - Huggingface Custom Embeddings - Open In Github - Open In Colab - @@ -247,25 +247,25 @@ - Model Predictions to Project - Open In Github - Open In Colab + Custom Metrics Basics + Open In Github + Open In Colab Custom Metrics Demo Open In Github Open In Colab - - Custom Metrics Basics - Open In Github - Open In Colab - Model Slices Open In Github Open In Colab + + Model Predictions to Project + Open In Github + Open In Colab + @@ -280,25 +280,15 @@ - - HTML Predictions - Open In Github - Open In Colab - Text Predictions Open In Github Open In Colab - Video Predictions - Open In Github - Open In Colab - - - Conversational Predictions - Open In Github - Open In Colab + PDF Predictions + Open In Github + Open In Colab Geospatial Predictions @@ -306,9 +296,14 @@ Open In Colab - PDF Predictions - Open In Github - Open In Colab + Conversational Predictions + Open In Github + Open In Colab + + + Video Predictions + Open In Github + Open In Colab Image Predictions @@ -320,6 +315,11 @@ Open In Github Open In Colab + + HTML Predictions + Open In Github + Open In Colab + diff --git a/examples/annotation_import/audio.ipynb b/examples/annotation_import/audio.ipynb index 437130a9e..4f20127ee 100644 --- a/examples/annotation_import/audio.ipynb +++ b/examples/annotation_import/audio.ipynb @@ -27,6 +27,30 @@ ], "cell_type": "markdown" }, + { + "metadata": {}, + "source": [ + "\n", + " \n", + "\n" + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "\n", + "\n", + 
"\n", + "\n", + "\n", + "\n", + "" + ], + "cell_type": "markdown" + }, { "metadata": {}, "source": [ @@ -170,7 +194,7 @@ }, { "metadata": {}, - "source": "ontology_builder = lb.OntologyBuilder(classifications=[\n lb.Classification(class_type=lb.Classification.Type.TEXT,\n name=\"text_audio\"),\n lb.Classification(\n class_type=lb.Classification.Type.CHECKLIST,\n name=\"checklist_audio\",\n options=[\n lb.Option(value=\"first_checklist_answer\"),\n lb.Option(value=\"second_checklist_answer\"),\n ],\n ),\n lb.Classification(\n class_type=lb.Classification.Type.RADIO,\n name=\"radio_audio\",\n options=[\n lb.Option(value=\"first_radio_answer\"),\n lb.Option(value=\"second_radio_answer\"),\n ],\n ),\n])\n\nontology = client.create_ontology(\n \"Ontology Audio Annotations\",\n ontology_builder.asdict(),\n media_type=lb.MediaType.Audio,\n)", + "source": "ontology_builder = lb.OntologyBuilder(classifications=[\n lb.Classification(class_type=lb.Classification.Type.TEXT,\n name=\"text_audio\"),\n lb.Classification(\n class_type=lb.Classification.Type.CHECKLIST,\n name=\"checklist_audio\",\n options=[\n lb.Option(value=\"first_checklist_answer\"),\n lb.Option(value=\"second_checklist_answer\"),\n ],\n ),\n lb.Classification(\n class_type=lb.Classification.Type.RADIO,\n name=\"radio_audio\",\n options=[\n lb.Option(value=\"first_radio_answer\"),\n lb.Option(value=\"second_radio_answer\"),\n ],\n ),\n # Temporal classification for token-level annotations\n lb.Classification(\n class_type=lb.Classification.Type.TEXT,\n name=\"User Speaker\",\n scope=lb.Classification.Scope.INDEX, # INDEX scope for temporal\n ),\n])\n\nontology = client.create_ontology(\n \"Ontology Audio Annotations\",\n ontology_builder.asdict(),\n media_type=lb.MediaType.Audio,\n)", "cell_type": "code", "outputs": [], "execution_count": null @@ -223,6 +247,27 @@ ], "cell_type": "markdown" }, + { + "metadata": {}, + "source": [ + "\n" + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": "", + "cell_type": "code", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "source": "", + "cell_type": "code", + "outputs": [], + "execution_count": null + }, { "metadata": {}, "source": "label = []\nlabel.append(\n lb_types.Label(\n data={\"global_key\": global_key},\n annotations=[text_annotation, checklist_annotation, radio_annotation],\n ))", @@ -252,6 +297,29 @@ ], "cell_type": "markdown" }, + { + "metadata": {}, + "source": [ + "## Temporal Audio Annotations\n", + "\n", + "You can create temporal annotations for individual tokens (words) with precise timing:\n" + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": "# Define tokens with precise timing (from demo script)\ntokens_data = [\n (\"Hello\", 586, 770), # Hello: frames 586-770\n (\"AI\", 771, 955), # AI: frames 771-955\n (\"how\", 956, 1140), # how: frames 956-1140\n (\"are\", 1141, 1325), # are: frames 1141-1325\n (\"you\", 1326, 1510), # you: frames 1326-1510\n (\"doing\", 1511, 1695), # doing: frames 1511-1695\n (\"today\", 1696, 1880), # today: frames 1696-1880\n]\n\n# Create temporal annotations for each token\ntemporal_annotations = []\nfor token, start_frame, end_frame in tokens_data:\n token_annotation = lb_types.AudioClassificationAnnotation(\n frame=start_frame,\n end_frame=end_frame,\n name=\"User Speaker\",\n value=lb_types.Text(answer=token),\n )\n temporal_annotations.append(token_annotation)\n\nprint(f\"Created {len(temporal_annotations)} temporal token annotations\")", + "cell_type": "code", + "outputs": [], + 
"execution_count": null + }, + { + "metadata": {}, + "source": "# Create label with both regular and temporal annotations\nlabel_with_temporal = []\nlabel_with_temporal.append(\n lb_types.Label(\n data={\"global_key\": global_key},\n annotations=[text_annotation, checklist_annotation, radio_annotation] +\n temporal_annotations,\n ))\n\nprint(\n f\"Created label with {len(label_with_temporal[0].annotations)} total annotations\"\n)\nprint(f\" - Regular annotations: 3\")\nprint(f\" - Temporal annotations: {len(temporal_annotations)}\")", + "cell_type": "code", + "outputs": [], + "execution_count": null + }, { "metadata": {}, "source": [ @@ -260,6 +328,13 @@ ], "cell_type": "markdown" }, + { + "metadata": {}, + "source": "# Upload temporal annotations via MAL\ntemporal_upload_job = lb.MALPredictionImport.create_from_objects(\n client=client,\n project_id=project.uid,\n name=f\"temporal_mal_job-{str(uuid.uuid4())}\",\n predictions=label_with_temporal,\n)\n\ntemporal_upload_job.wait_until_done()\nprint(\"Temporal upload completed!\")\nprint(\"Errors:\", temporal_upload_job.errors)\nprint(\"Status:\", temporal_upload_job.statuses)", + "cell_type": "code", + "outputs": [], + "execution_count": null + }, { "metadata": {}, "source": "# Upload our label using Model-Assisted Labeling\nupload_job = lb.MALPredictionImport.create_from_objects(\n client=client,\n project_id=project.uid,\n name=f\"mal_job-{str(uuid.uuid4())}\",\n predictions=label,\n)\n\nupload_job.wait_until_done()\nprint(\"Errors:\", upload_job.errors)\nprint(\"Status of uploads: \", upload_job.statuses)", diff --git a/libs/labelbox/src/labelbox/data/annotation_types/__init__.py b/libs/labelbox/src/labelbox/data/annotation_types/__init__.py index fc75652cf..9f59b5197 100644 --- a/libs/labelbox/src/labelbox/data/annotation_types/__init__.py +++ b/libs/labelbox/src/labelbox/data/annotation_types/__init__.py @@ -19,6 +19,8 @@ from .video import MaskInstance from .video import VideoMaskAnnotation +from .audio import AudioClassificationAnnotation + from .ner import ConversationEntity from .ner import DocumentEntity from .ner import DocumentTextSelection diff --git a/libs/labelbox/src/labelbox/data/annotation_types/audio.py b/libs/labelbox/src/labelbox/data/annotation_types/audio.py new file mode 100644 index 000000000..c86fba668 --- /dev/null +++ b/libs/labelbox/src/labelbox/data/annotation_types/audio.py @@ -0,0 +1,37 @@ +from typing import Optional +from pydantic import Field, AliasChoices + +from labelbox.data.annotation_types.annotation import ( + ClassificationAnnotation, +) + + +class AudioClassificationAnnotation(ClassificationAnnotation): + """Audio classification for specific time range + + Examples: + - Speaker identification from 2500ms to 4100ms + - Audio quality assessment for a segment + - Language detection for audio segments + + Args: + name (Optional[str]): Name of the classification + feature_schema_id (Optional[Cuid]): Feature schema identifier + value (Union[Text, Checklist, Radio]): Classification value + start_frame (int): The frame index in milliseconds (e.g., 2500 = 2.5 seconds) + end_frame (Optional[int]): End frame in milliseconds (for time ranges) + segment_index (Optional[int]): Index of audio segment this annotation belongs to + extra (Dict[str, Any]): Additional metadata + """ + + start_frame: int = Field( + validation_alias=AliasChoices("start_frame", "frame"), + serialization_alias="start_frame", + ) + end_frame: Optional[int] = Field( + default=None, + validation_alias=AliasChoices("end_frame", "endFrame"), + 
serialization_alias="end_frame", + ) + segment_index: Optional[int] = None + diff --git a/libs/labelbox/src/labelbox/data/annotation_types/label.py b/libs/labelbox/src/labelbox/data/annotation_types/label.py index d13fb8f20..228512a5d 100644 --- a/libs/labelbox/src/labelbox/data/annotation_types/label.py +++ b/libs/labelbox/src/labelbox/data/annotation_types/label.py @@ -13,6 +13,7 @@ from .metrics import ScalarMetric, ConfusionMatrixMetric from .video import VideoClassificationAnnotation from .video import VideoObjectAnnotation, VideoMaskAnnotation +from .audio import AudioClassificationAnnotation from .mmc import MessageEvaluationTaskAnnotation from pydantic import BaseModel, field_validator @@ -44,6 +45,7 @@ class Label(BaseModel): ClassificationAnnotation, ObjectAnnotation, VideoMaskAnnotation, + AudioClassificationAnnotation, ScalarMetric, ConfusionMatrixMetric, RelationshipAnnotation, @@ -75,15 +77,23 @@ def _get_annotations_by_type(self, annotation_type): def frame_annotations( self, - ) -> Dict[str, Union[VideoObjectAnnotation, VideoClassificationAnnotation]]: + ) -> Dict[int, Union[VideoObjectAnnotation, VideoClassificationAnnotation, AudioClassificationAnnotation]]: + """Get temporal annotations organized by frame + + Returns: + Dict[int, List]: Dictionary mapping frame (milliseconds) to list of temporal annotations + + Example: + >>> label.frame_annotations() + {2500: [VideoClassificationAnnotation(...), AudioClassificationAnnotation(...)]} + """ frame_dict = defaultdict(list) for annotation in self.annotations: - if isinstance( - annotation, - (VideoObjectAnnotation, VideoClassificationAnnotation), - ): + if isinstance(annotation, (VideoObjectAnnotation, VideoClassificationAnnotation)): frame_dict[annotation.frame].append(annotation) - return frame_dict + elif isinstance(annotation, AudioClassificationAnnotation): + frame_dict[annotation.start_frame].append(annotation) + return dict(frame_dict) def add_url_to_masks(self, signer) -> "Label": """ diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py index 2f4799d13..5fc19c004 100644 --- a/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py +++ b/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py @@ -2,7 +2,7 @@ import copy from itertools import groupby from operator import itemgetter -from typing import Generator, List, Tuple, Union +from typing import Any, Dict, Generator, List, Tuple, Union from uuid import uuid4 from pydantic import BaseModel @@ -24,6 +24,11 @@ VideoMaskAnnotation, VideoObjectAnnotation, ) +from typing import List +from ...annotation_types.audio import ( + AudioClassificationAnnotation, +) +from .temporal import create_audio_ndjson_annotations from labelbox.types import DocumentRectangle, DocumentEntity from .classification import ( NDChecklistSubclass, @@ -69,6 +74,7 @@ def from_common( yield from cls._create_relationship_annotations(label) yield from cls._create_non_video_annotations(label) yield from cls._create_video_annotations(label) + yield from cls._create_audio_annotations(label) @staticmethod def _get_consecutive_frames( @@ -80,6 +86,7 @@ def _get_consecutive_frames( consecutive.append((group[0], group[-1])) return consecutive + @classmethod def _get_segment_frame_ranges( cls, @@ -159,6 +166,32 @@ def _create_video_annotations( segments.append(segment) yield NDObject.from_common(segments, label.data) + @classmethod + def _create_audio_annotations( + cls, label: Label + ) -> Generator[BaseModel, 
None, None]: + """Create audio annotations with nested classifications using modular hierarchy builder.""" + # Extract audio annotations from the label + audio_annotations = [ + annot for annot in label.annotations + if isinstance(annot, AudioClassificationAnnotation) + ] + + if not audio_annotations: + return + + # Use the modular hierarchy builder to create NDJSON annotations + ndjson_annotations = create_audio_ndjson_annotations( + audio_annotations, + label.data.global_key + ) + + # Yield each NDJSON annotation + for annotation in ndjson_annotations: + yield annotation + + + @classmethod def _create_non_video_annotations(cls, label: Label): non_video_annotations = [ @@ -170,6 +203,7 @@ def _create_non_video_annotations(cls, label: Label): VideoClassificationAnnotation, VideoObjectAnnotation, VideoMaskAnnotation, + AudioClassificationAnnotation, RelationshipAnnotation, ), ) @@ -187,7 +221,7 @@ def _create_non_video_annotations(cls, label: Label): yield NDMessageTask.from_common(annotation, label.data) else: raise TypeError( - f"Unable to convert object to MAL format. `{type(getattr(annotation, 'value',annotation))}`" + f"Unable to convert object to MAL format. `{type(getattr(annotation, 'value', annotation))}`" ) @classmethod diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/temporal.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/temporal.py new file mode 100644 index 000000000..da9af289d --- /dev/null +++ b/libs/labelbox/src/labelbox/data/serialization/ndjson/temporal.py @@ -0,0 +1,339 @@ +""" +Generic hierarchical classification builder for NDJSON serialization. + +This module provides reusable components for constructing nested hierarchical +classifications from temporal annotations (audio, video, etc.), separating the +complex logic from the main serialization code. 
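+
+The public entry points are create_temporal_ndjson_annotations() and the
+audio-specific wrapper create_audio_ndjson_annotations() defined at the end
+of this module.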
+""" + +from collections import defaultdict +from typing import Any, Dict, List, Set, Tuple, Protocol, TypeVar, Generic +from pydantic import BaseModel + +from ...annotation_types.audio import AudioClassificationAnnotation + +# Generic type for temporal annotations +TemporalAnnotation = TypeVar('TemporalAnnotation', bound=Any) + + +class TemporalFrame: + """Represents a time frame in temporal annotations (audio, video, etc.).""" + + def __init__(self, start: int, end: int = None): + self.start = start + self.end = end or start + + def contains(self, other: "TemporalFrame") -> bool: + """Check if this frame contains another frame.""" + return (self.start <= other.start and + self.end is not None and other.end is not None and + self.end >= other.end) + + def strictly_contains(self, other: "TemporalFrame") -> bool: + """Check if this frame strictly contains another frame (not equal).""" + return (self.contains(other) and + (self.start < other.start or self.end > other.end)) + + def overlaps(self, other: "TemporalFrame") -> bool: + """Check if this frame overlaps with another frame.""" + return not (self.end < other.start or other.end < self.start) + + def to_dict(self) -> Dict[str, int]: + """Convert to dictionary format.""" + return {"start": self.start, "end": self.end} + + +class AnnotationGroupManager(Generic[TemporalAnnotation]): + """Manages grouping of temporal annotations by classification type.""" + + def __init__(self, annotations: List[TemporalAnnotation], frame_extractor: callable): + self.annotations = annotations + self.frame_extractor = frame_extractor # Function to extract (start, end) from annotation + self.groups = self._group_annotations() + self.root_groups = self._identify_root_groups() + + def _group_annotations(self) -> Dict[str, List[TemporalAnnotation]]: + """Group annotations by classification key (schema_id or name).""" + groups = defaultdict(list) + for annot in self.annotations: + key = annot.feature_schema_id or annot.name + groups[key].append(annot) + return dict(groups) + + def _identify_root_groups(self) -> Set[str]: + """Identify root groups that are not fully contained by other groups.""" + root_groups = set() + + for group_key, group_anns in self.groups.items(): + if not self._is_group_nested(group_key): + root_groups.add(group_key) + + return root_groups + + def _is_group_nested(self, group_key: str) -> bool: + """Check if a group is fully contained by other groups.""" + group_anns = self.groups[group_key] + + for ann in group_anns: + start, end = self.frame_extractor(ann) + ann_frame = TemporalFrame(start, end) + + # Check if this annotation is contained by any other group + contained = False + for other_key, other_anns in self.groups.items(): + if other_key == group_key: + continue + + for parent in other_anns: + parent_start, parent_end = self.frame_extractor(parent) + parent_frame = TemporalFrame(parent_start, parent_end) + if parent_frame.contains(ann_frame): + contained = True + break + + if contained: + break + + if not contained: + return False # Group is not fully nested + + return True # All annotations were contained somewhere + + def get_group_display_name(self, group_key: str) -> str: + """Get display name for a group.""" + group_anns = self.groups[group_key] + # Prefer the first non-empty annotation name + for ann in group_anns: + if ann.name: + return ann.name + return group_key + + def get_annotations_within_frames(self, frames: List[TemporalFrame], exclude_group: str = None) -> List[TemporalAnnotation]: + """Get all annotations within the 
given frames, excluding specified group.""" + contained = [] + + for group_key, group_anns in self.groups.items(): + if group_key == exclude_group: + continue + + for ann in group_anns: + start, end = self.frame_extractor(ann) + ann_frame = TemporalFrame(start, end) + if any(frame.contains(ann_frame) for frame in frames): + contained.append(ann) + + return contained + + +class ValueGrouper(Generic[TemporalAnnotation]): + """Handles grouping of annotations by their values and answer construction.""" + + def __init__(self, frame_extractor: callable): + self.frame_extractor = frame_extractor # Function to extract (start, end) from annotation + + def group_by_value(self, annotations: List[TemporalAnnotation]) -> List[Dict[str, Any]]: + """Group annotations by logical value and produce answer entries.""" + value_buckets = defaultdict(list) + + for ann in annotations: + key = self._get_value_key(ann) + value_buckets[key].append(ann) + + entries = [] + for _, anns in value_buckets.items(): + first = anns[0] + frames = [self.frame_extractor(a) for a in anns] + frame_dicts = [{"start": start, "end": end} for start, end in frames] + + entry = self._create_answer_entry(first, frame_dicts) + entries.append(entry) + + return entries + + def _get_value_key(self, ann: TemporalAnnotation) -> str: + """Get a stable key for grouping annotations by value.""" + if hasattr(ann.value, "answer"): + if isinstance(ann.value.answer, list): + # Checklist: stable key from selected option names + return str(sorted([opt.name for opt in ann.value.answer])) + elif hasattr(ann.value.answer, "name"): + # Radio: option name + return ann.value.answer.name + else: + # Text: the string value + return ann.value.answer + else: + return str(ann.value) + + def _create_answer_entry(self, first_ann: TemporalAnnotation, frames: List[Dict[str, int]]) -> Dict[str, Any]: + """Create an answer entry from the first annotation and frames.""" + if hasattr(first_ann.value, "answer") and isinstance(first_ann.value.answer, list): + # Checklist: emit one entry per distinct option present in this bucket + entries = [] + for opt_name in sorted([o.name for o in first_ann.value.answer]): + entries.append({"name": opt_name, "frames": frames}) + return entries[0] if len(entries) == 1 else {"options": entries, "frames": frames} + elif hasattr(first_ann.value, "answer") and hasattr(first_ann.value.answer, "name"): + # Radio + return {"name": first_ann.value.answer.name, "frames": frames} + else: + # Text + return {"value": first_ann.value.answer, "frames": frames} + + +class HierarchyBuilder(Generic[TemporalAnnotation]): + """Builds hierarchical nested classifications from temporal annotations.""" + + def __init__(self, group_manager: AnnotationGroupManager[TemporalAnnotation], value_grouper: ValueGrouper[TemporalAnnotation]): + self.group_manager = group_manager + self.value_grouper = value_grouper + + def build_hierarchy(self) -> List[Dict[str, Any]]: + """Build the complete hierarchical structure.""" + results = [] + + for group_key in self.group_manager.root_groups: + group_anns = self.group_manager.groups[group_key] + top_entries = self.value_grouper.group_by_value(group_anns) + + # Attach nested classifications to each top-level entry + for entry in top_entries: + frames = [TemporalFrame(f["start"], f["end"]) for f in entry.get("frames", [])] + nested = self._build_nested_for_frames(frames, group_key) + if nested: + entry["classifications"] = nested + + results.append({ + "name": self.group_manager.get_group_display_name(group_key), + "answer": 
top_entries, + }) + + return results + + def _build_nested_for_frames(self, parent_frames: List[TemporalFrame], exclude_group: str) -> List[Dict[str, Any]]: + """Recursively build nested classifications for specific parent frames.""" + nested = [] + + # Get all annotations within parent frames + all_contained = self.group_manager.get_annotations_within_frames(parent_frames, exclude_group) + + # Group by classification type and process each group + for group_key, group_anns in self.group_manager.groups.items(): + if group_key == exclude_group or group_key in self.group_manager.root_groups: + continue + + # Filter annotations that are contained by parent frames + candidate_anns = [] + for ann in group_anns: + start, end = self.group_manager.frame_extractor(ann) + ann_frame = TemporalFrame(start, end) + if any(frame.contains(ann_frame) for frame in parent_frames): + candidate_anns.append(ann) + + if not candidate_anns: + continue + + # Keep only immediate children (not strictly contained by other contained annotations) + child_anns = self._filter_immediate_children(candidate_anns, all_contained) + if not child_anns: + continue + + # Build this child classification block + child_entries = self.value_grouper.group_by_value(child_anns) + + # Recursively attach further nested classifications + for entry in child_entries: + entry_frames = [TemporalFrame(f["start"], f["end"]) for f in entry.get("frames", [])] + child_nested = self._build_nested_for_frames(entry_frames, group_key) + if child_nested: + entry["classifications"] = child_nested + + nested.append({ + "name": self.group_manager.get_group_display_name(group_key), + "answer": child_entries, + }) + + return nested + + def _filter_immediate_children(self, candidates: List[TemporalAnnotation], + all_contained: List[TemporalAnnotation]) -> List[TemporalAnnotation]: + """Filter to keep only immediate children (not strictly contained by others).""" + immediate_children = [] + + for candidate in candidates: + start, end = self.group_manager.frame_extractor(candidate) + candidate_frame = TemporalFrame(start, end) + + # Check if this candidate is strictly contained by any other contained annotation + has_closer_container = False + for other in all_contained: + if other is candidate: + continue + other_start, other_end = self.group_manager.frame_extractor(other) + other_frame = TemporalFrame(other_start, other_end) + if other_frame.strictly_contains(candidate_frame): + has_closer_container = True + break + + if not has_closer_container: + immediate_children.append(candidate) + + return immediate_children + + +class TemporalNDJSON(BaseModel): + """NDJSON format for temporal annotations (audio, video, etc.).""" + name: str + answer: List[Dict[str, Any]] + dataRow: Dict[str, str] + + +def create_temporal_ndjson_annotations(annotations: List[TemporalAnnotation], + data_global_key: str, + frame_extractor: callable) -> List[TemporalNDJSON]: + """ + Create NDJSON temporal annotations with hierarchical structure. 
+ + Args: + annotations: List of temporal classification annotations + data_global_key: Global key for the data row + frame_extractor: Function that extracts (start, end) from annotation + + Returns: + List of TemporalNDJSON objects + """ + if not annotations: + return [] + + group_manager = AnnotationGroupManager(annotations, frame_extractor) + value_grouper = ValueGrouper(frame_extractor) + hierarchy_builder = HierarchyBuilder(group_manager, value_grouper) + hierarchy = hierarchy_builder.build_hierarchy() + + return [ + TemporalNDJSON( + name=item["name"], + answer=item["answer"], + dataRow={"globalKey": data_global_key} + ) + for item in hierarchy + ] + + +# Audio-specific convenience function +def create_audio_ndjson_annotations(annotations: List[AudioClassificationAnnotation], + data_global_key: str) -> List[TemporalNDJSON]: + """ + Create NDJSON audio annotations with hierarchical structure. + + Args: + annotations: List of audio classification annotations + data_global_key: Global key for the data row + + Returns: + List of TemporalNDJSON objects + """ + def audio_frame_extractor(ann: AudioClassificationAnnotation) -> Tuple[int, int]: + return (ann.start_frame, ann.end_frame or ann.start_frame) + + return create_temporal_ndjson_annotations(annotations, data_global_key, audio_frame_extractor) diff --git a/libs/labelbox/tests/data/serialization/ndjson/test_audio.py b/libs/labelbox/tests/data/serialization/ndjson/test_audio.py new file mode 100644 index 000000000..e392c2577 --- /dev/null +++ b/libs/labelbox/tests/data/serialization/ndjson/test_audio.py @@ -0,0 +1,363 @@ +import labelbox.types as lb_types +from labelbox.data.serialization.ndjson.converter import NDJsonConverter + + +def test_audio_nested_text_radio_checklist_structure(): + # Purpose: verify that class-based AudioClassificationAnnotation inputs serialize + # into v3-style nested NDJSON with: + # - exactly three top-level groups (text_class, radio_class, checklist_class) + # - children nested only under their closest containing parent frames + # - correct field shapes per type (Text uses "value", Radio/Checklist use "name") + + # Build annotations mirroring exec/v3.py shapes using class-based annotations + anns = [] + + # text_class top-level with multiple values + # Expect: produces an NDJSON object named "text_class" with four answer entries; + # the long segment (1500-2400) will carry nested children below. + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1000, + end_frame=1100, + name="text_class", + value=lb_types.Text(answer="A"), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1500, + end_frame=2400, + name="text_class", + value=lb_types.Text(answer="text_class value"), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2500, + end_frame=2700, + name="text_class", + value=lb_types.Text(answer="C"), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2900, + end_frame=2999, + name="text_class", + value=lb_types.Text(answer="D"), + ) + ) + + # nested under text_class + # Expect: nested_text_class (1600-2000) nests under the 1500-2400 parent; + # nested_text_class_2 nests under nested_text_class only (no duplicates at parent level). 
+ anns.append( + lb_types.AudioClassificationAnnotation( + frame=1600, + end_frame=2000, + name="nested_text_class", + value=lb_types.Text(answer="nested_text_class value"), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1800, + end_frame=2000, + name="nested_text_class_2", + value=lb_types.Text(answer="nested_text_class_2 value"), + ) + ) + + # radio_class top-level + # Expect: two answer entries for first_radio_answer (two frame segments) and + # two for second_radio_answer; children attach only to their closest container answer. + anns.append( + lb_types.AudioClassificationAnnotation( + frame=200, + end_frame=1500, + name="radio_class", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer(name="first_radio_answer") + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2000, + end_frame=2500, + name="radio_class", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer(name="first_radio_answer") + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1550, + end_frame=1700, + name="radio_class", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer(name="second_radio_answer") + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2700, + end_frame=3000, + name="radio_class", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer(name="second_radio_answer") + ), + ) + ) + + # nested radio + # Expect: sub_radio_question nests under first_radio_answer (1000-1500), and + # sub_radio_question_2 nests under sub_radio_question's first_sub_radio_answer only. + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1000, + end_frame=1500, + name="sub_radio_question", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer( + name="first_sub_radio_answer" + ) + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1300, + end_frame=1500, + name="sub_radio_question_2", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer( + name="first_sub_radio_answer_2" + ) + ), + ) + ) + + # checklist_class top-level + # Expect: three answer entries (first/second/third_checklist_option) and + # nested checklist children attach to the first option segments where contained. + anns.append( + lb_types.AudioClassificationAnnotation( + frame=300, + end_frame=800, + name="checklist_class", + value=lb_types.Checklist( + answer=[ + lb_types.ClassificationAnswer(name="first_checklist_option") + ] + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1200, + end_frame=1800, + name="checklist_class", + value=lb_types.Checklist( + answer=[ + lb_types.ClassificationAnswer(name="first_checklist_option") + ] + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2200, + end_frame=2900, + name="checklist_class", + value=lb_types.Checklist( + answer=[ + lb_types.ClassificationAnswer( + name="second_checklist_option" + ) + ] + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2500, + end_frame=3500, + name="checklist_class", + value=lb_types.Checklist( + answer=[ + lb_types.ClassificationAnswer(name="third_checklist_option") + ] + ), + ) + ) + + # nested checklist + # Expect: nested_checklist options 1/2/3 attach to their containing parent frames; + # checklist_nested_text attaches under nested_option_1 only. 
+ anns.append( + lb_types.AudioClassificationAnnotation( + frame=400, + end_frame=700, + name="nested_checklist", + value=lb_types.Checklist( + answer=[lb_types.ClassificationAnswer(name="nested_option_1")] + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1200, + end_frame=1600, + name="nested_checklist", + value=lb_types.Checklist( + answer=[lb_types.ClassificationAnswer(name="nested_option_2")] + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1400, + end_frame=1800, + name="nested_checklist", + value=lb_types.Checklist( + answer=[lb_types.ClassificationAnswer(name="nested_option_3")] + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=500, + end_frame=700, + name="checklist_nested_text", + value=lb_types.Text(answer="checklist_nested_text value"), + ) + ) + + # Serialize a single Label containing all of the above annotations + label = lb_types.Label( + data={"global_key": "audio_nested_test_key"}, annotations=anns + ) + ndjson = list(NDJsonConverter.serialize([label])) + + # Assert: exactly three top-level groups, matching v3 root objects + assert {obj["name"] for obj in ndjson} == { + "text_class", + "radio_class", + "checklist_class", + } + + # Validate text_class structure: children appear under the long segment only, + # and grandchildren only under their immediate parent + text_nd = next(obj for obj in ndjson if obj["name"] == "text_class") + parent = next( + item + for item in text_nd["answer"] + if item.get("value") == "text_class value" + ) + nested = parent.get("classifications", []) + names = {c["name"] for c in nested} + assert "nested_text_class" in names + nt = next(c for c in nested if c["name"] == "nested_text_class") + nt_ans = nt["answer"][0] + assert nt_ans["value"] == "nested_text_class value" + nt_nested = nt_ans.get("classifications", []) + assert any(c["name"] == "nested_text_class_2" for c in nt_nested) + + # Validate radio_class structure and immediate-child only + radio_nd = next(obj for obj in ndjson if obj["name"] == "radio_class") + first_radio = next( + a for a in radio_nd["answer"] if a["name"] == "first_radio_answer" + ) + assert any( + c["name"] == "sub_radio_question" + for c in first_radio.get("classifications", []) + ) + # sub_radio_question_2 is nested under sub_radio_question only + sub_radio = next( + c + for c in first_radio["classifications"] + if c["name"] == "sub_radio_question" + ) + sr_first = next( + a for a in sub_radio["answer"] if a["name"] == "first_sub_radio_answer" + ) + assert any( + c["name"] == "sub_radio_question_2" + for c in sr_first.get("classifications", []) + ) + + # Validate checklist_class structure: nested_checklist exists, and nested text + # appears only under nested_option_1 (closest container) + checklist_nd = next( + obj for obj in ndjson if obj["name"] == "checklist_class" + ) + first_opt = next( + a + for a in checklist_nd["answer"] + if a["name"] == "first_checklist_option" + ) + assert any( + c["name"] == "nested_checklist" + for c in first_opt.get("classifications", []) + ) + nested_checklist = next( + c + for c in first_opt["classifications"] + if c["name"] == "nested_checklist" + ) + # Ensure nested text present under nested_checklist → nested_option_1 + opt1 = next( + a for a in nested_checklist["answer"] if a["name"] == "nested_option_1" + ) + assert any( + c["name"] == "checklist_nested_text" + for c in opt1.get("classifications", []) + ) + + +def test_audio_top_level_only_basic(): + anns = [ + 
lb_types.AudioClassificationAnnotation( + frame=200, + end_frame=1500, + name="radio_class", + value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name="first_radio_answer")), + ), + lb_types.AudioClassificationAnnotation( + frame=1550, + end_frame=1700, + name="radio_class", + value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name="second_radio_answer")), + ), + lb_types.AudioClassificationAnnotation( + frame=1200, + end_frame=1800, + name="checklist_class", + value=lb_types.Checklist(answer=[lb_types.ClassificationAnswer(name="angry")]), + ), + ] + + label = lb_types.Label(data={"global_key": "audio_top_level_only"}, annotations=anns) + ndjson = list(NDJsonConverter.serialize([label])) + + names = {o["name"] for o in ndjson} + assert names == {"radio_class", "checklist_class"} + + radio = next(o for o in ndjson if o["name"] == "radio_class") + r_answers = sorted(radio["answer"], key=lambda x: x["frames"][0]["start"]) + assert r_answers[0]["name"] == "first_radio_answer" + assert r_answers[0]["frames"] == [{"start": 200, "end": 1500}] + assert "classifications" not in r_answers[0] + assert r_answers[1]["name"] == "second_radio_answer" + assert r_answers[1]["frames"] == [{"start": 1550, "end": 1700}] + assert "classifications" not in r_answers[1] + + checklist = next(o for o in ndjson if o["name"] == "checklist_class") + c_answers = checklist["answer"] + assert len(c_answers) == 1 + assert c_answers[0]["name"] == "angry" + assert c_answers[0]["frames"] == [{"start": 1200, "end": 1800}] + assert "classifications" not in c_answers[0]
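
A minimal end-to-end sketch of the new API (not part of the patch above): it builds two overlapping AudioClassificationAnnotation objects, inspects them via Label.frame_annotations(), and serializes them with NDJsonConverter, mirroring the tests. The classification names ("radio_class", "text_class") and the global key are illustrative; a real MAL import would need names that match an existing ontology and a real data row.

import labelbox.types as lb_types
from labelbox.data.serialization.ndjson.converter import NDJsonConverter

# Speaker segment from 2500 ms to 4100 ms (frame values are milliseconds).
speaker = lb_types.AudioClassificationAnnotation(
    frame=2500,
    end_frame=4100,
    name="radio_class",  # illustrative classification name
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(name="first_radio_answer")
    ),
)

# Transcript text fully contained inside the speaker segment.
transcript = lb_types.AudioClassificationAnnotation(
    frame=2600,
    end_frame=3000,
    name="text_class",  # illustrative classification name
    value=lb_types.Text(answer="Hello"),
)

label = lb_types.Label(
    data={"global_key": "sample-audio-global-key"},  # illustrative global key
    annotations=[speaker, transcript],
)

# Temporal annotations are grouped by their start frame (milliseconds).
print(label.frame_annotations())  # {2500: [speaker], 2600: [transcript]}

# One top-level NDJSON object is emitted for "radio_class"; "text_class" is
# attached under the containing answer's "classifications" list.
for row in NDJsonConverter.serialize([label]):
    print(row)

The nesting shown here is the same containment rule exercised by the tests above: a group whose frames are fully contained by another group's frames is attached as a child of its closest containing answer rather than emitted as its own top-level object.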