1
+ """
2
+ Generic temporal annotation processor for frame-based media (video, audio)
3
+ """
4
+ from abc import ABC , abstractmethod
5
+ from collections import defaultdict
6
+ from typing import Any , Dict , Generator , List , Union
7
+
8
+ from ...annotation_types .annotation import ClassificationAnnotation , ObjectAnnotation
9
+ from ...annotation_types .label import Label
10
+ from .classification import NDClassificationType , NDClassification
11
+ from .objects import NDObject
12
+
13
+
14
class TemporalAnnotationProcessor(ABC):
    """Abstract base class for processing temporal annotations (video, audio, etc.).

    Subclasses define which annotation types they handle and how frame data is
    extracted; `process_annotations` drives the shared group-then-serialize flow.
    """

    @abstractmethod
    def get_annotation_types(self) -> tuple:
        """Return a (classification_types, object_types) pair of tuples this processor handles."""
        pass

    @abstractmethod
    def should_group_annotations(self, annotation_group: List) -> bool:
        """Determine if annotations should be grouped into one feature."""
        pass

    @abstractmethod
    def build_frame_data(self, annotation_group: List) -> List[Dict[str, Any]]:
        """Extract frame data ({"start": ..., "end": ...} dicts) from an annotation group."""
        pass

    @abstractmethod
    def prepare_grouped_content(self, annotation_group: List) -> Any:
        """Prepare content for grouped annotations (may modify annotation.value)."""
        pass

    def process_annotations(
        self, label: Label
    ) -> Generator[Union[NDClassificationType, Any], None, None]:
        """Main processing method - generic for all temporal media.

        Groups the label's temporal annotations by feature (schema id, falling
        back to name), then serializes each group as classifications or objects.
        """
        temporal_annotations = defaultdict(list)
        classification_types, object_types = self.get_annotation_types()

        # Group annotations by feature name/schema
        for annot in label.annotations:
            if isinstance(annot, classification_types + object_types):
                temporal_annotations[annot.feature_schema_id or annot.name].append(annot)

        # Process each group. Groups are never empty: keys only exist after an append.
        for annotation_group in temporal_annotations.values():
            if isinstance(annotation_group[0], classification_types):
                yield from self._process_classification_group(annotation_group, label.data)
            elif isinstance(annotation_group[0], object_types):
                yield from self._process_object_group(annotation_group, label.data)

    def _process_classification_group(self, annotation_group, data):
        """Process classification annotations, grouped or individually."""
        if self.should_group_annotations(annotation_group):
            # Group into single feature with multiple keyframes
            annotation = annotation_group[0]  # Use first as template

            # Build frame data
            frames_data = self.build_frame_data(annotation_group)

            # Prepare content (may modify annotation.value)
            self.prepare_grouped_content(annotation_group)

            # Update with frame data. FIX: merge into any existing `extra`
            # instead of clobbering it, matching the individual path below.
            annotation.extra = {**(annotation.extra or {}), "frames": frames_data}
            yield NDClassification.from_common(annotation, data)
        else:
            # Process individually
            for annotation in annotation_group:
                frames_data = self.build_frame_data([annotation])
                if frames_data:
                    if not annotation.extra:
                        annotation.extra = {}
                    annotation.extra.update({"frames": frames_data})
                yield NDClassification.from_common(annotation, data)

    def _process_object_group(self, annotation_group, data):
        """Process object annotations - default to individual processing."""
        for annotation in annotation_group:
            yield NDObject.from_common(annotation, data)
83
+
84
+
85
class AudioTemporalProcessor(TemporalAnnotationProcessor):
    """Processor for audio temporal annotations"""

    def __init__(self,
                 group_text_annotations: bool = True,
                 enable_token_mapping: bool = True):
        self.group_text_annotations = group_text_annotations
        self.enable_token_mapping = enable_token_mapping

    def get_annotation_types(self) -> tuple:
        from ...annotation_types.audio import AudioClassificationAnnotation, AudioObjectAnnotation
        return (AudioClassificationAnnotation,), (AudioObjectAnnotation,)

    def should_group_annotations(self, annotation_group: List) -> bool:
        """Group TEXT classifications with multiple temporal instances"""
        if not self.group_text_annotations:
            return False

        from ...annotation_types.classification.classification import Text

        # Guard clauses: only multi-instance TEXT groups where every
        # annotation carries a frame qualify for grouping.
        if not isinstance(annotation_group[0].value, Text):
            return False
        if len(annotation_group) <= 1:
            return False
        return all(hasattr(ann, 'frame') for ann in annotation_group)

    def build_frame_data(self, annotation_group: List) -> List[Dict[str, Any]]:
        """Extract frame ranges from audio annotations"""
        ranges = []
        for ann in annotation_group:
            if not hasattr(ann, 'frame'):
                continue
            start = ann.frame
            # A missing or None end_frame collapses the range to a single frame.
            end = start
            if hasattr(ann, 'end_frame') and ann.end_frame is not None:
                end = ann.end_frame
            ranges.append({"start": start, "end": end})
        return ranges

    def prepare_grouped_content(self, annotation_group: List) -> None:
        """Prepare content for grouped audio annotations"""
        from ...annotation_types.classification.classification import Text

        template = annotation_group[0]
        if not isinstance(template.value, Text) or not self.enable_token_mapping:
            return

        # Build token mapping for TEXT annotations
        import json

        answers = [ann.value.answer for ann in annotation_group]
        per_frame = {str(ann.frame): ann.value.answer for ann in annotation_group}

        # Replace the template annotation's value with the combined structure.
        template.value = Text(answer=json.dumps({
            "default_text": " ".join(answers),
            "token_mapping": per_frame
        }))
141
+
142
class VideoTemporalProcessor(TemporalAnnotationProcessor):
    """Processor for video temporal annotations - matches existing behavior"""

    def get_annotation_types(self) -> tuple:
        from ...annotation_types.video import VideoClassificationAnnotation, VideoObjectAnnotation
        return (VideoClassificationAnnotation,), (VideoObjectAnnotation,)

    def should_group_annotations(self, annotation_group: List) -> bool:
        """Video always groups by segment ranges"""
        return True

    def build_frame_data(self, annotation_group: List) -> List[Dict[str, Any]]:
        """Build frame data using existing video segment logic"""
        from .label import NDLabel  # Import here to avoid circular import

        ranges = NDLabel._get_segment_frame_ranges(annotation_group)
        return [{"start": rng[0], "end": rng[-1]} for rng in ranges]

    def prepare_grouped_content(self, annotation_group: List) -> None:
        """Video doesn't modify content - uses existing value"""
        pass

    def _process_object_group(self, annotation_group, data):
        """Video objects use segment-based processing"""
        from .label import NDLabel

        # One segment per frame range: the keyframes whose frame falls inside it.
        segments = []
        for start_frame, end_frame in NDLabel._get_segment_frame_ranges(annotation_group):
            keyframes_in_range = [
                ann for ann in annotation_group
                if ann.keyframe and start_frame <= ann.frame <= end_frame
            ]
            segments.append(keyframes_in_range)
        yield NDObject.from_common(segments, data)
0 commit comments