-
Notifications
You must be signed in to change notification settings - Fork 68
PTDT-3807: Add temporal audio annotation support #2013
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
e4fd630
dbcc7bf
dbb592f
ff298d4
16896fd
7a666cc
ac58ad0
67dd14a
a1600e5
b4d2f42
fadb14e
1e12596
c2a7b4c
26a35fd
b16f2ea
943cb73
a838513
0ca9cd6
7861537
6c3c50a
68773cf
58b30f7
400d5bb
c761dcf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from typing import Optional | ||
from pydantic import Field, AliasChoices | ||
|
||
from labelbox.data.annotation_types.annotation import ( | ||
ClassificationAnnotation, | ||
) | ||
|
||
|
||
class AudioClassificationAnnotation(ClassificationAnnotation):
    """Audio classification for a specific time range.

    Examples:
        - Speaker identification from 2500ms to 4100ms
        - Audio quality assessment for a segment
        - Language detection for audio segments

    Args:
        name (Optional[str]): Name of the classification
        feature_schema_id (Optional[Cuid]): Feature schema identifier
        value (Union[Text, Checklist, Radio]): Classification value
        start_frame (int): Start of the range in milliseconds (e.g., 2500 = 2.5 seconds)
        end_frame (Optional[int]): End of the range in milliseconds (for time ranges)
        segment_index (Optional[int]): Index of the audio segment this annotation belongs to
        extra (Dict[str, Any]): Additional metadata
    """

    # Accepts either "start_frame" or the legacy "frame" key on input.
    # Serializes as snake_case "start_frame" to stay consistent with
    # end_frame below (the previous alias "startframe" dropped the
    # underscore, producing mismatched keys in serialized output).
    start_frame: int = Field(
        validation_alias=AliasChoices("start_frame", "frame"),
        serialization_alias="start_frame",
    )
    # Accepts snake_case or camelCase on input; None means a point-in-time
    # annotation rather than a time range.
    end_frame: Optional[int] = Field(
        default=None,
        validation_alias=AliasChoices("end_frame", "endFrame"),
        serialization_alias="end_frame",
    )
    segment_index: Optional[int] = None
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,22 +60,6 @@ def serialize_model(self, handler): | |
return res | ||
|
||
|
||
class FrameLocation(BaseModel):
    """Inclusive frame range attached to a top-level video classification."""

    # NOTE(review): declared end-before-start in the original; order is
    # preserved because pydantic serializes fields in declaration order.
    end: int
    start: int
|
||
|
||
class VideoSupported(BaseModel):
    """Mixin that adds optional per-frame locations to a classification.

    Frames are only allowed as top-level inferences for video.
    """

    frames: Optional[List[FrameLocation]] = None

    @model_serializer(mode="wrap")
    def serialize_model(self, handler):
        serialized = handler(self)
        if self.frames is None:
            # No video frames were supplied: drop the key entirely rather
            # than emitting "frames": null in the serialized payload.
            serialized.pop("frames")
        return serialized
|
||
|
||
class NDTextSubclass(NDAnswer): | ||
|
@@ -242,13 +226,14 @@ def from_common( | |
name=name, | ||
schema_id=feature_schema_id, | ||
uuid=uuid, | ||
frames=extra.get("frames"), | ||
message_id=message_id, | ||
confidence=text.confidence, | ||
custom_metrics=text.custom_metrics, | ||
) | ||
|
||
|
||
class NDChecklist(NDAnnotation, NDChecklistSubclass, VideoSupported): | ||
class NDChecklist(NDAnnotation, NDChecklistSubclass): | ||
@model_serializer(mode="wrap") | ||
def serialize_model(self, handler): | ||
res = handler(self) | ||
|
@@ -295,7 +280,7 @@ def from_common( | |
) | ||
|
||
|
||
class NDRadio(NDAnnotation, NDRadioSubclass, VideoSupported): | ||
class NDRadio(NDAnnotation, NDRadioSubclass): | ||
@classmethod | ||
def from_common( | ||
cls, | ||
|
@@ -425,7 +410,8 @@ def to_common( | |
def from_common( | ||
cls, | ||
annotation: Union[ | ||
ClassificationAnnotation, VideoClassificationAnnotation | ||
ClassificationAnnotation, | ||
VideoClassificationAnnotation, | ||
], | ||
data: GenericDataRowData, | ||
) -> Union[NDTextSubclass, NDChecklistSubclass, NDRadioSubclass]: | ||
|
@@ -448,7 +434,8 @@ def from_common( | |
@staticmethod | ||
def lookup_classification( | ||
annotation: Union[ | ||
ClassificationAnnotation, VideoClassificationAnnotation | ||
ClassificationAnnotation, | ||
VideoClassificationAnnotation, | ||
], | ||
) -> Union[NDText, NDChecklist, NDRadio]: | ||
return {Text: NDText, Checklist: NDChecklist, Radio: NDRadio}.get( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
import copy | ||
from itertools import groupby | ||
from operator import itemgetter | ||
from typing import Generator, List, Tuple, Union | ||
from typing import Any, Dict, Generator, List, Tuple, Union | ||
from uuid import uuid4 | ||
|
||
from pydantic import BaseModel | ||
|
@@ -24,6 +24,10 @@ | |
VideoMaskAnnotation, | ||
VideoObjectAnnotation, | ||
) | ||
from typing import List | ||
from ...annotation_types.audio import ( | ||
AudioClassificationAnnotation, | ||
) | ||
from labelbox.types import DocumentRectangle, DocumentEntity | ||
from .classification import ( | ||
NDChecklistSubclass, | ||
|
@@ -69,6 +73,7 @@ def from_common( | |
yield from cls._create_relationship_annotations(label) | ||
yield from cls._create_non_video_annotations(label) | ||
yield from cls._create_video_annotations(label) | ||
yield from cls._create_audio_annotations(label) | ||
|
||
@staticmethod | ||
def _get_consecutive_frames( | ||
|
@@ -80,6 +85,7 @@ def _get_consecutive_frames( | |
consecutive.append((group[0], group[-1])) | ||
return consecutive | ||
|
||
|
||
@classmethod | ||
def _get_segment_frame_ranges( | ||
cls, | ||
|
@@ -153,12 +159,91 @@ def _create_video_annotations( | |
for annotation in annotation_group: | ||
if ( | ||
annotation.keyframe | ||
and start_frame <= annotation.frame <= end_frame | ||
and start_frame <= annotation.start_frame <= end_frame | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Incorrect Attribute Usage in Video AnnotationsThe Additional Locations (1) |
||
): | ||
segment.append(annotation) | ||
segments.append(segment) | ||
yield NDObject.from_common(segments, label.data) | ||
|
||
@classmethod
def _create_audio_annotations(
    cls, label: Label
) -> Generator[BaseModel, None, None]:
    """Yield NDJSON-style payloads for audio classification annotations.

    Audio annotations are grouped by ``feature_schema_id`` (falling back
    to ``name``), then by classification value, so that annotations
    sharing the same value are emitted once with a list of
    ``{"start", "end"}`` frame ranges — mirroring the v2.py video format.

    Args:
        label: Label whose ``annotations`` are scanned for
            ``AudioClassificationAnnotation`` instances.

    Yields:
        One pydantic model per classification group, with ``name``,
        ``answer`` and ``dataRow`` fields.
    """

    # Defined once per call; previously this class was re-created inside
    # the per-classification loop on every iteration.
    class AudioNDJSON(BaseModel):
        name: str
        answer: List[Dict[str, Any]]
        dataRow: Dict[str, str]

    def _grouping_key(annotation) -> str:
        """Stable string key so annotations with equal values share one entry."""
        answer = getattr(annotation.value, "answer", None)
        if answer is None:
            # Value type without an .answer attribute — fall back to repr.
            return str(annotation.value)
        if isinstance(answer, list):
            # Checklist: order-insensitive key built from the answer names.
            return str(sorted(item.name for item in answer))
        if hasattr(answer, "name"):
            # Radio: ClassificationAnswer carrying a name.
            return answer.name
        # Text: the answer is the string itself.
        return answer

    # Collect audio annotations keyed by schema id (preferred) or name.
    audio_annotations = defaultdict(list)
    for annotation in label.annotations:
        if isinstance(annotation, AudioClassificationAnnotation):
            key = annotation.feature_schema_id or annotation.name
            audio_annotations[key].append(annotation)

    for classification_name, annotation_group in audio_annotations.items():
        # Bucket annotations that carry the same value together.
        value_groups = defaultdict(list)
        for annotation in annotation_group:
            value_groups[_grouping_key(annotation)].append(annotation)

        answer_items: List[Dict[str, Any]] = []
        for grouped in value_groups.values():
            frames = [
                {"start": a.start_frame, "end": a.end_frame} for a in grouped
            ]
            # All members share the same value; use the first as the source.
            answer = getattr(grouped[0].value, "answer", None)
            if isinstance(answer, list):
                # Checklist — uses "name" like v2.py.
                # TODO: only the first checked item is emitted; multi-answer
                # checklists lose information here.
                answer_items.append(
                    {"name": answer[0].name, "frames": frames}
                )
            elif answer is not None and hasattr(answer, "name"):
                # Radio — uses "name" like v2.py.
                answer_items.append({"name": answer.name, "frames": frames})
            else:
                # Text (or answerless value) — uses "value" like v2.py.
                # getattr-guarded: the original dereferenced .answer
                # unconditionally here and could raise AttributeError for
                # values the grouping step had already accepted.
                answer_items.append({"value": answer, "frames": frames})

        yield AudioNDJSON(
            name=classification_name,
            answer=answer_items,
            # NOTE(review): assumes the data row is addressed by global key;
            # rows identified only by uid would fail validation — confirm.
            dataRow={"globalKey": label.data.global_key},
        )
|
||
|
||
|
||
@classmethod | ||
def _create_non_video_annotations(cls, label: Label): | ||
non_video_annotations = [ | ||
|
@@ -170,6 +255,7 @@ def _create_non_video_annotations(cls, label: Label): | |
VideoClassificationAnnotation, | ||
VideoObjectAnnotation, | ||
VideoMaskAnnotation, | ||
AudioClassificationAnnotation, | ||
RelationshipAnnotation, | ||
), | ||
) | ||
|
@@ -187,7 +273,7 @@ def _create_non_video_annotations(cls, label: Label): | |
yield NDMessageTask.from_common(annotation, label.data) | ||
else: | ||
raise TypeError( | ||
f"Unable to convert object to MAL format. `{type(getattr(annotation, 'value',annotation))}`" | ||
f"Unable to convert object to MAL format. `{type(getattr(annotation, 'value', annotation))}`" | ||
) | ||
|
||
@classmethod | ||
|
Uh oh!
There was an error while loading. Please reload this page.