
Commit 58b30f7

chore: top level + nested working
1 parent 68773cf commit 58b30f7

3 files changed: +70 -62 lines changed


libs/labelbox/src/labelbox/data/annotation_types/audio.py

Lines changed: 1 addition & 31 deletions
@@ -26,7 +26,7 @@ class AudioClassificationAnnotation(ClassificationAnnotation):
 
     start_frame: int = Field(
         validation_alias=AliasChoices("start_frame", "frame"),
-        serialization_alias="frame",
+        serialization_alias="startframe",
     )
     end_frame: Optional[int] = Field(
         default=None,
@@ -35,33 +35,3 @@ class AudioClassificationAnnotation(ClassificationAnnotation):
     )
     segment_index: Optional[int] = None
 
-
-class AudioTextClassificationAnnotation(ClassificationAnnotation):
-    """Audio classification for specific time range
-
-    Examples:
-        - Speaker identification from 2500ms to 4100ms
-        - Audio quality assessment for a segment
-        - Language detection for audio segments
-
-    Args:
-        name (Optional[str]): Name of the classification
-        feature_schema_id (Optional[Cuid]): Feature schema identifier
-        value (Union[Text, Checklist, Radio]): Classification value
-        start_frame (int): The frame index in milliseconds (e.g., 2500 = 2.5 seconds)
-        end_frame (Optional[int]): End frame in milliseconds (for time ranges)
-        segment_index (Optional[int]): Index of audio segment this annotation belongs to
-        extra (Dict[str, Any]): Additional metadata
-    """
-
-    start_frame: int = Field(
-        validation_alias=AliasChoices("start_frame", "frame"),
-        serialization_alias="frame",
-    )
-    end_frame: Optional[int] = Field(
-        default=None,
-        validation_alias=AliasChoices("end_frame", "endFrame"),
-        serialization_alias="end_frame",
-    )
-
-
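
As a quick check of the alias change, here is a minimal standalone sketch, assuming pydantic v2; AliasDemo is an illustrative model that only mirrors the field definitions above, not a labelbox class. It shows that either "start_frame" or the legacy "frame" key is still accepted on input, while dumping by alias now emits "startframe".

# Minimal sketch, assuming pydantic v2. "AliasDemo" is illustrative only,
# not a labelbox class; it mirrors the field definitions in the diff above.
from typing import Optional

from pydantic import AliasChoices, BaseModel, Field


class AliasDemo(BaseModel):
    start_frame: int = Field(
        validation_alias=AliasChoices("start_frame", "frame"),
        serialization_alias="startframe",
    )
    end_frame: Optional[int] = Field(
        default=None,
        validation_alias=AliasChoices("end_frame", "endFrame"),
        serialization_alias="end_frame",
    )


# Input accepts either alias for each field ...
demo = AliasDemo.model_validate({"frame": 2500, "endFrame": 4100})
# ... and serialization by alias now uses "startframe" rather than "frame".
print(demo.model_dump(by_alias=True))  # {'startframe': 2500, 'end_frame': 4100}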

libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py

Lines changed: 3 additions & 19 deletions
@@ -60,22 +60,6 @@ def serialize_model(self, handler):
         return res
 
 
-class FrameLocation(BaseModel):
-    end: int
-    start: int
-
-
-class VideoSupported(BaseModel):
-    # Note that frames are only allowed as top level inferences for video
-    frames: Optional[List[FrameLocation]] = None
-
-    @model_serializer(mode="wrap")
-    def serialize_model(self, handler):
-        res = handler(self)
-        # This means these are no video frames ..
-        if self.frames is None:
-            res.pop("frames")
-        return res
 
 
 class NDTextSubclass(NDAnswer):
@@ -223,7 +207,7 @@ def from_common(
 # ====== End of subclasses
 
 
-class NDText(NDAnnotation, NDTextSubclass, VideoSupported):
+class NDText(NDAnnotation, NDTextSubclass):
     @classmethod
     def from_common(
         cls,
@@ -249,7 +233,7 @@ def from_common(
         )
 
 
-class NDChecklist(NDAnnotation, NDChecklistSubclass, VideoSupported):
+class NDChecklist(NDAnnotation, NDChecklistSubclass):
     @model_serializer(mode="wrap")
     def serialize_model(self, handler):
         res = handler(self)
@@ -296,7 +280,7 @@ def from_common(
         )
 
 
-class NDRadio(NDAnnotation, NDRadioSubclass, VideoSupported):
+class NDRadio(NDAnnotation, NDRadioSubclass):
     @classmethod
     def from_common(
         cls,
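
For context on what the removed mixin did, here is a standalone sketch of the wrap-mode model_serializer pattern that VideoSupported used to keep an unset "frames" key out of the payload; it assumes pydantic v2, and FramesMixin is an illustrative stand-in rather than the labelbox class.

# Standalone sketch, assuming pydantic v2. "FramesMixin" is an illustrative
# stand-in for the removed VideoSupported mixin, not labelbox code.
from typing import List, Optional

from pydantic import BaseModel, model_serializer


class FrameLocation(BaseModel):
    end: int
    start: int


class FramesMixin(BaseModel):
    frames: Optional[List[FrameLocation]] = None

    @model_serializer(mode="wrap")
    def serialize_model(self, handler):
        res = handler(self)
        # Omit the key entirely instead of emitting "frames": null.
        if self.frames is None:
            res.pop("frames")
        return res


print(FramesMixin().model_dump())                                 # {}
print(FramesMixin(frames=[{"start": 0, "end": 5}]).model_dump())  # {'frames': [{'end': 5, 'start': 0}]}

With the mixin dropped from NDText, NDChecklist, and NDRadio, these top-level classification payloads no longer carry an optional frames list at all.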

libs/labelbox/src/labelbox/data/serialization/ndjson/label.py

Lines changed: 66 additions & 12 deletions
@@ -2,7 +2,7 @@
 import copy
 from itertools import groupby
 from operator import itemgetter
-from typing import Generator, List, Tuple, Union
+from typing import Any, Dict, Generator, List, Tuple, Union
 from uuid import uuid4
 
 from pydantic import BaseModel
@@ -168,25 +168,79 @@ def _create_video_annotations(
     @classmethod
     def _create_audio_annotations(
         cls, label: Label
-    ) -> Generator[Union[NDChecklistSubclass, NDRadioSubclass], None, None]:
-        """Create audio annotations serialized in Video NDJSON classification format."""
+    ) -> Generator[BaseModel, None, None]:
+        """Create audio annotations grouped by classification name in v2.py format."""
         audio_annotations = defaultdict(list)
 
         # Collect audio annotations by name/schema_id
         for annot in label.annotations:
             if isinstance(annot, AudioClassificationAnnotation):
                 audio_annotations[annot.feature_schema_id or annot.name].append(annot)
 
-        for annotation_group in audio_annotations.values():
-            # Simple grouping: one NDJSON entry per annotation group (same as video)
-            annotation = annotation_group[0]
-            frames_data = []
+        # Create v2.py format for each classification group
+        for classification_name, annotation_group in audio_annotations.items():
+            # Group annotations by value (like v2.py does)
+            value_groups = defaultdict(list)
+
             for ann in annotation_group:
-                start = ann.start_frame
-                end = getattr(ann, "end_frame", None) or ann.start_frame
-                frames_data.append({"start": start, "end": end})
-            annotation.extra.update({"frames": frames_data})
-            yield NDClassification.from_common(annotation, label.data)
+                # Extract value based on classification type for grouping
+                if hasattr(ann.value, 'answer'):
+                    if isinstance(ann.value.answer, list):
+                        # Checklist classification - convert list to string for grouping
+                        value = str(sorted([item.name for item in ann.value.answer]))
+                    elif hasattr(ann.value.answer, 'name'):
+                        # Radio classification - ann.value.answer is ClassificationAnswer with name
+                        value = ann.value.answer.name
+                    else:
+                        # Text classification
+                        value = ann.value.answer
+                else:
+                    value = str(ann.value)
+
+                # Group by value
+                value_groups[value].append(ann)
+
+            # Create answer items with grouped frames (like v2.py)
+            answer_items = []
+            for value, annotations_with_same_value in value_groups.items():
+                frames = []
+                for ann in annotations_with_same_value:
+                    frames.append({"start": ann.start_frame, "end": ann.end_frame})
+
+                # Extract the actual value for the output (not the grouping key)
+                first_ann = annotations_with_same_value[0]
+
+                # Use different field names based on classification type
+                if hasattr(first_ann.value, 'answer') and isinstance(first_ann.value.answer, list):
+                    # Checklist - use "name" field (like v2.py)
+                    answer_items.append({
+                        "name": first_ann.value.answer[0].name,  # Single item for now
+                        "frames": frames
+                    })
+                elif hasattr(first_ann.value, 'answer') and hasattr(first_ann.value.answer, 'name'):
+                    # Radio - use "name" field (like v2.py)
+                    answer_items.append({
+                        "name": first_ann.value.answer.name,
+                        "frames": frames
+                    })
+                else:
+                    # Text - use "value" field (like v2.py)
+                    answer_items.append({
+                        "value": first_ann.value.answer,
+                        "frames": frames
+                    })
+
+            # Create a simple Pydantic model for the v2.py format
+            class AudioNDJSON(BaseModel):
+                name: str
+                answer: List[Dict[str, Any]]
+                dataRow: Dict[str, str]
+
+            yield AudioNDJSON(
+                name=classification_name,
+                answer=answer_items,
+                dataRow={"globalKey": label.data.global_key}
+            )
 
 