diff --git a/pyproject.toml b/pyproject.toml index 23bd11b..c849081 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ maintainers = [ ] license = "Apache-2.0" readme = "README.md" -requires-python = ">=3.10,<3.13" +requires-python = ">=3.11,<3.13" homepage = "https://github.com/sensein/sailsprep" repository = "https://github.com/sensein/sailsprep" documentation = "https://sensein.github.io/sailsprep" @@ -30,7 +30,7 @@ classifiers = [ sailsprep-cli = "sailsprep.cli:main" [tool.poetry] -packages = [{include = "sailsprep", from = "src"}] +packages = [{include = "sailsprep", from = "src"}, {include = "vlm_baseline"}] requires-poetry = ">=2.0" version = "0.0.0" @@ -40,6 +40,12 @@ pandas = "^2.3.3" opencv-python = "^4.12.0.88" openpyxl = "^3.1.5" types-pyyaml = "^6.0.12.20250915" +decord = "^0.6.0" +pillow = "^9.2.0" +torch = "^2.9.1" +transformers = "^4.57.3" +scikit-learn = "^1.8.0" +moviepy = "1.0.3" [tool.poetry.group.dev] optional = true diff --git a/vlm_baseline/__init__.py b/vlm_baseline/__init__.py new file mode 100644 index 0000000..39928ef --- /dev/null +++ b/vlm_baseline/__init__.py @@ -0,0 +1 @@ +"""VLM baseline framework for automatic video annotation.""" diff --git a/vlm_baseline/configs/ovis2/activity.yaml b/vlm_baseline/configs/ovis2/activity.yaml new file mode 100644 index 0000000..657c3f7 --- /dev/null +++ b/vlm_baseline/configs/ovis2/activity.yaml @@ -0,0 +1,65 @@ +experiment: + name: activity_ovis2 + seed: 42 + description: > + Free-text activity description of what the child is doing in the video. + The model should produce a natural language sentence or short paragraph. + +model: + name: ovis2 + device: cuda + precision: bf16 + max_frames: 16 + +data: + ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/test.csv + label_column: Activity + video_path_column: BidsProcessed + +output: + save_dir: /orcd/scratch/bcs/001/sensein/sails/pre2annot_evaluation/activity_ovis2 + save_predictions: true + +task: + type: description + output_format: + style: free_text + constraints: + - Describe only what is visible in the video + - Focus on the child's activity + - Do not infer intentions or emotions unless clearly visible + - Use short, concrete phrases + - Do not include timestamps + - Do not mention the camera or recording setup + +prompt: + message: | + You are a video understanding model. + + Your task is to describe the ACTIVITY performed by the child in the video. + + Activity definition: + A more detailed description of the events in the video. + It should provide further detail about the child's actions. + The description should be consistent with the broader context but more specific. + + Examples: + - Context: motor play → Activity: gymnastics or tumbling + - Context: book play → Activity: adult reading a book to the child + - Context: social play → Activity: laughing, tickling, engaging with an adult + + Instructions: + - Describe only observable actions + - Use ONLY ONE short descriptive phrase or sentence + - Do not list multiple unrelated activities + - Do not explain or justify your answer + + Now describe the activity in the video. + +evaluation: + type: text + metrics: + - none + notes: | + Free-text activity descriptions are not automatically scored. + Evaluation may be qualitative or based on downstream analysis. 
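As a quick orientation for configs like `activity.yaml` above: the runner simply loads the YAML and reads the `experiment`, `task`, and `prompt` sections. A minimal sketch, assuming a local copy of the config (the path here is illustrative):

```python
# Minimal sketch: how a task config such as activity.yaml is read.
# "activity.yaml" is an illustrative local path; the runner receives the
# config path as a command-line argument.
import yaml

with open("activity.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print(cfg["experiment"]["name"])  # activity_ovis2
print(cfg["task"]["type"])        # description
print(cfg["prompt"]["message"].splitlines()[0])  # first line of the prompt
```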
diff --git a/vlm_baseline/configs/ovis2/gesture_type.yaml b/vlm_baseline/configs/ovis2/gesture_type.yaml
new file mode 100644
index 0000000..f27d2ab
--- /dev/null
+++ b/vlm_baseline/configs/ovis2/gesture_type.yaml
@@ -0,0 +1,45 @@
+experiment:
+  name: gesture_type_ovis2
+  seed: 42
+
+model:
+  name: ovis2
+  device: cuda
+  precision: bf16
+  max_frames: 16
+
+output:
+  save_dir: /orcd/scratch/bcs/001/sensein/sails/pre2annot_evaluation/gesture_type_ovis2
+  save_predictions: true
+
+data:
+  video_path_column: BidsProcessed
+  ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/anotated_processed.csv
+  label_column: Gesture_type
+
+task:
+  type: classification
+  labels:
+    - reach
+    - show
+    - give
+    - point
+    - take
+    - head shake
+    - head nod
+    - wave
+    - clap
+    - sign
+    - multiple
+    - NaN
+
+prompt:
+  message: |
+    You are a video understanding model.
+    Given the following video, classify the gesture being performed.
+    Possible labels are: [reach, show, give, point, take, head shake, head nod, wave, clap, sign, multiple, NaN].
+    Answer with only the label.
+
+evaluation:
+  metrics:
+    - accuracy
diff --git a/vlm_baseline/configs/ovis2/gestures.yaml b/vlm_baseline/configs/ovis2/gestures.yaml
new file mode 100644
index 0000000..d14a225
--- /dev/null
+++ b/vlm_baseline/configs/ovis2/gestures.yaml
@@ -0,0 +1,35 @@
+experiment:
+  name: gestures_ovis2
+  seed: 42
+
+model:
+  name: ovis2
+  device: cuda
+  precision: bf16
+  max_frames: 16
+
+output:
+  save_dir: /orcd/scratch/bcs/001/sensein/sails/pre2annot_evaluation/gestures_ovis2
+  save_predictions: true
+
+data:
+  video_path_column: BidsProcessed
+  ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/anotated_processed.csv
+  label_column: Gestures
+
+task:
+  type: classification
+  labels:
+    - "yes"
+    - "no"
+
+prompt:
+  message: |
+    You are a video understanding model.
+    Given the following video, indicate whether the child engages in a motion or motions that signal(s) or attempt(s) to signal nonverbal communication.
+    Possible labels are: [yes, no].
+    Answer with only the label.
+
+evaluation:
+  metrics:
+    - accuracy
diff --git a/vlm_baseline/configs/ovis2/interaction_w_child.yaml b/vlm_baseline/configs/ovis2/interaction_w_child.yaml
new file mode 100644
index 0000000..3005bcf
--- /dev/null
+++ b/vlm_baseline/configs/ovis2/interaction_w_child.yaml
@@ -0,0 +1,41 @@
+experiment:
+  name: interaction_w_child_ovis2
+  seed: 42
+
+model:
+  name: ovis2
+  device: cuda
+  precision: bf16
+  max_frames: 16
+
+output:
+  save_dir: /orcd/scratch/bcs/001/sensein/sails/pre2annot_evaluation/interaction_w_child_ovis2
+  save_predictions: true
+
+data:
+  video_path_column: BidsProcessed
+  ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/anotated_processed.csv
+  label_column: Interaction_with_child
+
+task:
+  type: classification
+  labels:
+    - "yes"
+    - "no"
+
+prompt:
+  message: |
+    You are a video understanding model.
+    Given the following video, indicate whether there is a social interaction or attempt directed toward the target child.
+    This could be by the person filming the video or another individual in the video.
+
+    Rate "yes" if there is a social interaction or an attempt directed toward the child (e.g., the person filming asks the child questions or makes a comment to the child).
+    Rate "no" if there is no interaction/attempt directed toward the child (e.g., the video is of the child paging through a book on their own with no one engaging them).
+
+    Possible labels are: [yes, no].
+    Answer with only the label.
+
+evaluation:
+  metrics:
+    - accuracy
diff --git a/vlm_baseline/configs/ovis2/response_to_name.yaml b/vlm_baseline/configs/ovis2/response_to_name.yaml
new file mode 100644
index 0000000..53a957b
--- /dev/null
+++ b/vlm_baseline/configs/ovis2/response_to_name.yaml
@@ -0,0 +1,39 @@
+experiment:
+  name: response_to_name_ovis2
+  seed: 42
+
+model:
+  name: ovis2
+  device: cuda
+  precision: bf16
+  max_frames: 16
+
+output:
+  save_dir: /orcd/scratch/bcs/001/sensein/sails/pre2annot_evaluation/response_to_name_ovis2
+  save_predictions: true
+
+data:
+  video_path_column: BidsProcessed
+  ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/anotated_processed.csv
+  label_column: Response_to_name
+
+task:
+  type: classification
+  labels:
+    - "yes"
+    - "no"
+    - inconsistent
+    - NaN
+
+prompt:
+  message: |
+    You are a video understanding model.
+    Given the following video, indicate whether an overt attempt is made to gain the child's attention by using their name or nickname. Do not count use of the name in conversation.
+    Rate "yes" if the child immediately looks toward the person saying their name.
+    Rate "no" if the child does not react to hearing their name.
+    Rate "inconsistent" if multiple attempts are made and different responses are observed.
+    Rate "NaN" if no overt attempt is made.
+    Possible labels are: [yes, no, inconsistent, NaN].
+    Answer with only the label.
+
+evaluation:
+  metrics:
+    - accuracy
diff --git a/vlm_baseline/documentation.md b/vlm_baseline/documentation.md
new file mode 100644
index 0000000..68109e5
--- /dev/null
+++ b/vlm_baseline/documentation.md
@@ -0,0 +1,112 @@
+# VLM Baseline Evaluation — Documentation
+
+## Overview
+
+This `vlm_baseline` folder provides a baseline framework for automatic annotation of videos using Video-Language Models (VLMs). The primary goal is to automate the manual annotation process currently performed on SAILS videos.
+
+### Key Concepts
+
+- **Automatic Annotation**: Videos that are currently annotated manually will be processed automatically using VLMs.
+- **Annotation Types**:
+  - **Classifications**: Categorical labels (e.g., gesture types)
+  - **Descriptions**: Free-text descriptions of video content (e.g., activity)
+- **Evaluation**: Different metrics are used for each annotation type to evaluate VLM performance.
+- **Inference Process**: Run VLM inference on all available videos and compare predictions against ground-truth annotations.
+- **Output Format**: Processed videos come from the BIDS folder, and evaluation results are saved to the locations specified in the configuration file.
+
+**Key Architecture Principles:**
+
+- `models/` handles model interaction; to evaluate a new VLM, implement it here
+- `postprocessing/` converts raw VLM output into the task-specific prediction format
+- `evaluation/` computes metrics comparing predictions against ground truth
+- `runners/` orchestrates the entire pipeline (config loading, data iteration, output saving, evaluation)
+
+## How to Run
+
+Start an `srun` session with a GPU, then run the following from the repo root:
+
+```bash
+poetry run python vlm_baseline/runners/run_prediction.py vlm_baseline/configs/ovis2/response_to_name.yaml
+```
+
+## Configuration File (YAML)
+
+A config defines one complete experiment (one model + one task + one dataset + one prompt + one output directory). To try a VLM on a particular annotation task, create a new configuration file with the same structure as the existing ones.
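+
+One YAML pitfall worth noting: PyYAML parses unquoted `yes`/`no` list items as booleans, so the classification configs quote these labels to make sure the runner receives the strings `"yes"`/`"no"` rather than `True`/`False`. A quick check (illustrative snippet, not part of the pipeline):
+
+```python
+import yaml
+
+print(yaml.safe_load("labels:\n  - yes\n  - no"))       # {'labels': [True, False]}
+print(yaml.safe_load('labels:\n  - "yes"\n  - "no"'))   # {'labels': ['yes', 'no']}
+```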
+
+## Models (models/)
+
+This folder contains thin wrappers around VLM backends (Ovis2, Qwen2.5, …).
+Each wrapper loads the model, runs inference on a video and prompt, and returns the raw generated text.
+
+## Postprocessing (postprocessing/)
+
+Postprocessing converts the raw model output into the prediction type expected by the task, and then validates the postprocessed output.
+
+## Evaluation (evaluation/)
+
+Evaluation metrics depend on `task.type`. No metrics are implemented yet for free-text tasks.
+
+### Classification Evaluation
+
+Common metrics include:
+- **Accuracy** (though often not the most informative metric for imbalanced datasets)
+- Macro-F1 / Weighted-F1
+- Per-class precision/recall/F1
+- Confusion matrix
+
+**Inputs**: Ground-truth labels from the CSV vs. postprocessed predictions
+
+## How to Add a New Model
+
+To integrate a new VLM into the baseline framework, follow these steps:
+
+### 1. Create a Model Wrapper
+
+Create a new file `models/<model_name>.py` with a class that inherits from `BaseVLM`:
+
+```python
+class NewModelVLM(BaseVLM):
+    def load(self):
+        # Load weights/processor, set device, eval mode
+        pass
+
+    def generate(self, video_path, prompt, video_cfg=None, gen_cfg=None):
+        # Implement inference logic
+        # Return VLMRawOutput
+        pass
+
+    # Usually no need to override predict()
+```
+
+### 2. Register the Model
+
+Update `models/__init__.py`:
+
+- Import your new class
+- Add a case in the `load_model()` function for your model's `config["name"]`
+
+### 3. Create a Configuration
+
+Add a config YAML file under `configs/<model_name>/<task>.yaml` with at least the task description, prompt, and data settings, plus the model configuration:
+
+```yaml
+model:
+  name: "your_model_name"
+  model_path: "HF_repo_id"  # or local path
+  device: "cuda"
+  precision: "bf16"
+```
+
+### 4. Test the Integration
+
+Run the existing runner with the new config:
+
+```bash
+poetry run python vlm_baseline/runners/run_prediction.py vlm_baseline/configs/<model_name>/your_config.yaml
+```
+
+**Note**: Downstream postprocessing automatically determines whether the task is classification or free-text based on the configuration.
diff --git a/vlm_baseline/evaluation/metrics.py b/vlm_baseline/evaluation/metrics.py
new file mode 100644
index 0000000..9553f8e
--- /dev/null
+++ b/vlm_baseline/evaluation/metrics.py
@@ -0,0 +1,87 @@
+"""Evaluation metrics for annotation tasks.
+
+This module contains evaluation functions for classification and free-text
+annotation tasks in the VLM baseline framework.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+import pandas as pd
+from sklearn.metrics import accuracy_score, classification_report, f1_score
+
+
+def evaluate_classification(
+    y_true: List[str],
+    y_pred: List[str],
+    metrics: List[str],
+    labels: List[str],
+    invalid_label: str = "INVALID",
+) -> Dict[str, Any]:
+    """Evaluate classification predictions against ground truth.
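+
+    Always records the number of samples and the fraction of predictions equal
+    to ``invalid_label``; accuracy, F1, and the per-class report are added only
+    when requested via ``metrics``.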
+ + Args: + y_true: Ground truth labels + y_pred: Predicted labels + metrics: List of metrics to compute + labels: List of valid label classes + invalid_label: Label used for invalid predictions + + Returns: + Dictionary containing computed metrics + """ + out: Dict[str, Any] = {} + + out["n"] = len(y_true) + out["invalid_rate"] = sum(1 for p in y_pred if p == invalid_label) / max( + 1, len(y_pred) + ) + + if "accuracy" in metrics: + out["accuracy"] = float(accuracy_score(y_true, y_pred)) + + if "f1_macro" in metrics: + out["f1_macro"] = float( + f1_score(y_true, y_pred, labels=labels, average="macro", zero_division=0) + ) + + if "f1_weighted" in metrics: + out["f1_weighted"] = float( + f1_score(y_true, y_pred, labels=labels, average="weighted", zero_division=0) + ) + + if "report" in metrics: + out["report"] = classification_report( + y_true, y_pred, labels=labels, output_dict=True, zero_division=0 + ) + + return out + + +def evaluate_description( + predictions_df: pd.DataFrame, + metrics: List[str], + cfg: dict | None = None, +) -> Dict[str, Any]: + """Evaluate free-text description predictions. + + Placeholder: your free-text eval likely uses BLEU/ROUGE/BERTScore, + or LLM-as-judge, or keyword-based scoring, etc. + Implement your own logic here. + + Args: + predictions_df: DataFrame containing predictions + metrics: List of metrics to compute + cfg: Optional configuration dictionary + + Returns: + Dictionary containing computed metrics + """ + out: Dict[str, Any] = {"n": int(len(predictions_df))} + + # Example: just track average length + if "avg_len" in metrics: + out["avg_len"] = float(predictions_df["prediction"].fillna("").map(len).mean()) + + return out diff --git a/vlm_baseline/models/__init__.py b/vlm_baseline/models/__init__.py new file mode 100644 index 0000000..0019cc7 --- /dev/null +++ b/vlm_baseline/models/__init__.py @@ -0,0 +1,20 @@ +"""VLM model implementations and factory functions.""" + +from __future__ import annotations + +from typing import Any, Dict + +from .ovis2 import Ovis2VLM + + +def load_model(model_config: Dict[str, Any]) -> Ovis2VLM: + """Factory used by runners. + + Expects cfg["model"] dict with at least: {"name": "..."}. + """ + name = str(model_config.get("name", "")).lower().strip() + + if name == "ovis2": + return Ovis2VLM(model_config) + + raise ValueError(f"Unknown model name: {name!r}. Available: ['ovis2']") diff --git a/vlm_baseline/models/base_vlm.py b/vlm_baseline/models/base_vlm.py new file mode 100644 index 0000000..781564a --- /dev/null +++ b/vlm_baseline/models/base_vlm.py @@ -0,0 +1,61 @@ +"""Base classes and interfaces for VLM implementations.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + + +@dataclass +class VLMRawOutput: + """Lowest-common-denominator output from any VLM backend. + + Task-specific parsing (classification vs free text) happens elsewhere. + """ + + raw_text: str + meta: Dict[str, Any] = field(default_factory=dict) + + +class BaseVLM(ABC): + """Minimal interface your runners can rely on. + + - load(): must initialize model weights/tokenizer/processor + - predict(): should return raw string (easy for CSV + downstream parsing) + - generate(): can return VLMRawOutput (raw + meta) + """ + + def __init__(self, config: Dict[str, Any]) -> None: + """Initialize the VLM with configuration. + + Args: + config: Configuration dictionary for the model. 
+ """ + self.config = config or {} + self.model = None + self.device: Optional[str] = None + self._loaded: bool = False + + @abstractmethod + def load(self) -> None: + """Load model weights, tokenizer, and processor.""" + + def predict(self, video_path: str, prompt: str) -> str: + """Default implementation: call generate() and return raw_text. + + You can override, but usually you don't need to. + """ + if not self._loaded: + self.load() + return self.generate(video_path=video_path, prompt=prompt).raw_text + + @abstractmethod + def generate( + self, + video_path: str, + prompt: str, + video_cfg: Optional[Dict[str, Any]] = None, + gen_cfg: Optional[Dict[str, Any]] = None, + ) -> VLMRawOutput: + """Generate response for video and prompt.""" diff --git a/vlm_baseline/models/ovis2.py b/vlm_baseline/models/ovis2.py new file mode 100644 index 0000000..65a3ef2 --- /dev/null +++ b/vlm_baseline/models/ovis2.py @@ -0,0 +1,367 @@ +"""Ovis2 VLM model wrapper for the baseline framework.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +import torch +from moviepy.editor import VideoFileClip +from PIL import Image +from transformers import AutoModelForCausalLM + +from .base_vlm import BaseVLM, VLMRawOutput + + +def _dtype_from_str(s: str) -> torch.dtype: + s = (s or "").lower().strip() + if s in {"bf16", "bfloat16"}: + return torch.bfloat16 + if s in {"fp16", "float16", "half"}: + return torch.float16 + return torch.float32 + + +def _get_cfg( + cfg: Dict[str, Any], + *keys: str, + default: Union[str, int, bool, Dict[str, Any], None] = None, +) -> Union[str, int, bool, Dict[str, Any], None]: + """Return first found key in cfg (supports multiple aliases).""" + for k in keys: + if k in cfg and cfg[k] is not None: + return cfg[k] + return default + + +class Ovis2VLM(BaseVLM): + """Runner-compatible wrapper for Ovis2 VLM. + + Provides a standardized interface for the Ovis2 model with methods: + - __init__(model_config) + - load() + - predict(video_path, prompt) -> str + """ + + def __init__(self, model_config: Dict[str, Any]) -> None: + """Initialize the Ovis2 VLM wrapper. + + Args: + model_config: Configuration dictionary for the model. + """ + self.config = model_config or {} + self.model: Optional[Any] = None + self.device: str = str( + cast(str, _get_cfg(self.config, "device", default="cpu")) + ) + self._loaded = False + + # Runner config uses these keys: + # precision: "bf16" + # max_frames: 16 + # Allow aliases for convenience. 
+ self.precision = str( + cast(str, _get_cfg(self.config, "precision", "torch_dtype", default="bf16")) + ) + self.max_frames = int( + cast(int, _get_cfg(self.config, "max_frames", default=16)) + ) + + # HF id / path aliases + self.model_path = str( + cast( + str, + _get_cfg( + self.config, + "hf_model_id", + "model_path", + default="AIDC-AI/Ovis2.5-9B", + ), + ) + ) + + self.trust_remote_code = bool( + cast(bool, _get_cfg(self.config, "trust_remote_code", default=True)) + ) + + # Optional sub-config blocks (kept for compatibility with your existing pattern) + video_cfg_raw = cast(Dict[str, Any], _get_cfg(self.config, "video", default={})) + self.video_cfg = dict(video_cfg_raw) if video_cfg_raw is not None else {} + gen_cfg_raw = cast( + Dict[str, Any], _get_cfg(self.config, "generation", default={}) + ) + self.gen_cfg = dict(gen_cfg_raw) if gen_cfg_raw is not None else {} + + def load(self) -> None: + """Load the Ovis2 model and move it to the specified device.""" + if self._loaded: + return + + torch_dtype = _dtype_from_str(self.precision) + + self.model = AutoModelForCausalLM.from_pretrained( + self.model_path, + torch_dtype=torch_dtype, + trust_remote_code=self.trust_remote_code, + ) + + # Device handling + if self.device.startswith("cuda") and torch.cuda.is_available(): + self.model = self.model.to(self.device) + else: + self.device = "cpu" + self.model = self.model.to("cpu") + + self.model.eval() + self._loaded = True + + # ------------------------- + # Public API expected by runner + # ------------------------- + # Note: Using base class predict() which calls generate().raw_text + + # ------------------------- + # Core generation + # ------------------------- + def _build_messages( + self, frames: List[Image.Image], prompt: str + ) -> List[Dict[str, Any]]: + return [ + { + "role": "user", + "content": [ + {"type": "video", "video": frames}, + {"type": "text", "text": prompt}, + ], + } + ] + + def generate( + self, + video_path: str, + prompt: str, + video_cfg: Optional[Dict[str, Any]] = None, + gen_cfg: Optional[Dict[str, Any]] = None, + ) -> VLMRawOutput: + """Generate a response for the given video and prompt. + + Args: + video_path: Path to the video file. + prompt: Text prompt for the model. + video_cfg: Optional video processing configuration. + gen_cfg: Optional generation configuration. + + Returns: + VLMRawOutput containing the generated text and metadata. + """ + if self.model is None or self.device is None: + raise RuntimeError("Model not loaded. 
Call load() first.") + + video_cfg = video_cfg or self.config.get("video", {}) or {} + gen_cfg = gen_cfg or self.config.get("generation", {}) or {} + + frames, fmeta = self._extract_frames(video_path, video_cfg) + if not frames: + return VLMRawOutput( + raw_text="", meta={"model": "ovis2", "empty_frames": True, **fmeta} + ) + + messages = self._build_messages(frames, prompt) + + # 1) PREPROCESS (same as your snippet) + max_pixels = int(video_cfg.get("max_pixels", 896 * 896)) + input_ids, pixel_values, grid_thws = self.model.preprocess_inputs( + messages=messages, + add_generation_prompt=True, + max_pixels=max_pixels, + ) + # 2) MOVE TO DEVICE (equivalent to .cuda(), but device-agnostic) + input_ids = input_ids.to(self.device) + pixel_values = ( + pixel_values.to(self.device).to(self.model.dtype) + if pixel_values is not None + else None + ) + grid_thws = grid_thws.to(self.device) if grid_thws is not None else None + + # 3) INFERENCE (same as your snippet) + max_new_tokens = int(gen_cfg.get("max_new_tokens", 128)) + do_sample = bool(gen_cfg.get("do_sample", False)) + + with torch.no_grad(): + outputs = self.model.generate( + inputs=input_ids, + pixel_values=pixel_values, + grid_thws=grid_thws, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + eos_token_id=self.model.text_tokenizer.eos_token_id, + pad_token_id=self.model.text_tokenizer.pad_token_id, + ) + + # 4) DECODE (same as your snippet) + answer = self.model.text_tokenizer.decode( + outputs[0], skip_special_tokens=True + ).strip() + return VLMRawOutput(raw_text=answer, meta={"model": "ovis2", **fmeta}) + + # --- Frame extraction (keep here for now; move later if you want) --- + + def _extract_frames( + self, video_path: str, video_cfg: Dict[str, Any] + ) -> Tuple[List[Image.Image], Dict[str, Any]]: + num_frames = int(video_cfg.get("num_frames", 16)) + sampling = str(video_cfg.get("sampling", "uniform")).lower() + + p = Path(str(video_path)) + if not p.exists(): + # IMPORTANT: don’t hide this — it’s usually the real bug on clusters + return [], {"frame_backend": None, "error": f"FileNotFound: {p}"} + + # 1) decord + try: + from decord import VideoReader, cpu # type: ignore + + vr = VideoReader(str(p), ctx=cpu(0)) + n = len(vr) + if n <= 0: + return [], {"frame_backend": "decord", "error": "EmptyVideo(len=0)"} + + if sampling != "uniform": + raise ValueError(f"Unknown sampling strategy: {sampling}") + + if n <= num_frames: + idxs = list(range(n)) + else: + step = (n - 1) / max(num_frames - 1, 1) + idxs = [int(round(i * step)) for i in range(num_frames)] + + frames_np = vr.get_batch(idxs).asnumpy() # (T,H,W,C) RGB + frames_decord = [ + Image.fromarray(frames_np[i]) for i in range(frames_np.shape[0]) + ] + return frames_decord, {"frame_backend": "decord", "n_frames_video": n} + except Exception as e: + decord_err = repr(e) + + # 2) OpenCV fallback (often works even when moviepy/ffmpeg is weird) + try: + import cv2 # type: ignore + + cap = cv2.VideoCapture(str(p)) + if not cap.isOpened(): + return [], { + "frame_backend": "opencv", + "error": f"VideoCaptureNotOpened; decord_error={decord_err}", + } + + n = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0) + + if sampling != "uniform": + cap.release() + raise ValueError(f"Unknown sampling strategy: {sampling}") + + frames_opencv: List[Image.Image] = [] + + if n > 0: + # sample indices + if n <= num_frames: + idxs = list(range(n)) + else: + step = (n - 1) / max(num_frames - 1, 1) + idxs = [int(round(i * step)) for i in range(num_frames)] + wanted = set(idxs) + + i = 0 + while True: + ok, 
frame = cap.read() + if not ok: + break + if i in wanted: + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frames_opencv.append(Image.fromarray(frame)) + if len(frames_opencv) >= len(idxs): + break + i += 1 + else: + # unknown frame count: just take first num_frames + while len(frames_opencv) < num_frames: + ok, frame = cap.read() + if not ok: + break + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frames_opencv.append(Image.fromarray(frame)) + + cap.release() + if not frames_opencv: + return [], { + "frame_backend": "opencv", + "error": f"NoFramesDecoded; decord_error={decord_err}", + } + return frames_opencv, { + "frame_backend": "opencv", + "n_frames_video": n, + "decord_error": decord_err, + } + except Exception as e: + opencv_err = repr(e) + + # 3) moviepy fallback (relies on ffmpeg) + try: + # type: ignore + + frames_moviepy: List[Image.Image] = [] + with VideoFileClip(str(p)) as clip: + if clip.duration is None or clip.fps is None: + return [], { + "frame_backend": "moviepy", + "error": ( + f"NoDurationOrFPS; decord_error={decord_err}; " + f"opencv_error={opencv_err}" + ), + } + + total_frames = int(clip.fps * clip.duration) + if total_frames <= 0: + return [], { + "frame_backend": "moviepy", + "error": ( + f"total_frames<=0; decord_error={decord_err}; " + f"opencv_error={opencv_err}" + ), + } + + if sampling != "uniform": + raise ValueError(f"Unknown sampling strategy: {sampling}") + + idxs = [ + int(i * (total_frames - 1) / max(num_frames - 1, 1)) + for i in range(num_frames) + ] + for idx in idxs: + t = min(max(idx / clip.fps, 0.0), clip.duration - 1e-3) + frame = clip.get_frame(t) + frames_moviepy.append(Image.fromarray(frame)) + + if not frames_moviepy: + return [], { + "frame_backend": "moviepy", + "error": ( + f"NoFramesDecoded; decord_error={decord_err}; " + f"opencv_error={opencv_err}" + ), + } + return frames_moviepy, { + "frame_backend": "moviepy", + "total_frames_est": total_frames, + "decord_error": decord_err, + "opencv_error": opencv_err, + } + except Exception as e: + return [], { + "frame_backend": "moviepy", + "error": ( + f"moviepy_failed={repr(e)}; decord_error={decord_err}; " + f"opencv_error={opencv_err}" + ), + } diff --git a/vlm_baseline/postprocessing/validation.py b/vlm_baseline/postprocessing/validation.py new file mode 100644 index 0000000..3fdb786 --- /dev/null +++ b/vlm_baseline/postprocessing/validation.py @@ -0,0 +1,69 @@ +"""Postprocessing and validation functions for VLM outputs.""" + +from __future__ import annotations + +import re +from typing import Dict, List, Tuple + +INVALID_LABEL = "INVALID" + + +def _normalize_space(s: str) -> str: + return re.sub(r"\s+", " ", str(s).strip()) + + +def _normalize_label(s: str) -> str: + return _normalize_space(s).lower() + + +def validate_classification_output( + raw_output: str, + allowed_labels: List[str], + *, + invalid_label: str = INVALID_LABEL, +) -> Tuple[str, Dict]: + """Returns: (final_label, debug). 
+ + - final_label is always a string: + - one of allowed_labels, OR invalid_label + """ + debug: Dict = {"raw_output": raw_output} + + if raw_output is None: + debug["reason"] = "raw_output_none" + return invalid_label, debug + + out = _normalize_space(raw_output) + if not out: + debug["reason"] = "empty_output" + return invalid_label, debug + + allowed_norm_to_orig = {_normalize_label(label): label for label in allowed_labels} + out_norm = _normalize_label(out) + + # 1) Exact match + if out_norm in allowed_norm_to_orig: + label = allowed_norm_to_orig[out_norm] + debug["mode"] = "exact" + debug["label"] = label + return label, debug + + # 2) Extract if exactly one allowed label is present + hits = [] + for label_norm, label_orig in allowed_norm_to_orig.items(): + pattern = r"(?:^|[^a-z0-9])" + re.escape(label_norm) + r"(?:$|[^a-z0-9])" + if re.search(pattern, out_norm): + hits.append(label_orig) + + hits = list(dict.fromkeys(hits)) + debug["hits"] = hits + + if len(hits) == 1: + debug["mode"] = "single_hit" + debug["label"] = hits[0] + return hits[0], debug + + debug["reason"] = ( + "no_unique_label_found" if len(hits) == 0 else "multiple_labels_found" + ) + return invalid_label, debug diff --git a/vlm_baseline/runners/run_prediction.py b/vlm_baseline/runners/run_prediction.py new file mode 100644 index 0000000..55aad1e --- /dev/null +++ b/vlm_baseline/runners/run_prediction.py @@ -0,0 +1,275 @@ +"""Main runner script for VLM baseline evaluation.""" + +from __future__ import annotations + +import json +import re +import sys +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List + +import pandas as pd +import yaml +from tqdm.auto import tqdm +from vlm_baseline.evaluation.metrics import ( + evaluate_classification, + evaluate_description, +) +from vlm_baseline.models import load_model +from vlm_baseline.postprocessing.validation import validate_classification_output + +INVALID_LABEL = "INVALID" + + +def now_tag() -> str: + """Generate a timestamp tag for the current run.""" + return datetime.now().strftime("%Y%m%d_%H%M") + + +def normalize_space(s: str) -> str: + """Normalize whitespace in a string.""" + return re.sub(r"\s+", " ", str(s).strip()) + + +def main(config_path: str) -> None: + """Run the VLM baseline evaluation pipeline.""" + # --------------------------- + # Load config + # --------------------------- + with open(config_path, "r", encoding="utf-8") as f: + cfg = yaml.safe_load(f) + + exp_name = cfg["experiment"]["name"] + task_type = str(cfg["task"]["type"]).lower().strip() + + # --------------------------- + # Output dir (add run tag) + # --------------------------- + run_id = f"{now_tag()}" + out_root = Path(cfg["output"]["save_dir"]) + out_dir = out_root / run_id + out_dir.mkdir(parents=True, exist_ok=True) + + # --------------------------- + # Load annotation CSV + # --------------------------- + gt_file = cfg["data"]["ground_truth_csv"] + df = pd.read_csv(gt_file) + + video_col = cfg["data"]["video_path_column"] + label_col = cfg["data"].get("label_column") + + if video_col not in df.columns: + raise ValueError( + f"CSV missing video_path_column '{video_col}'. 
Columns: {list(df.columns)}" + ) + + # --------------------------- + # Prompt + # --------------------------- + prompt = cfg["prompt"]["message"] + + # --------------------------- + # Load model + # --------------------------- + model = load_model(cfg["model"]) + if hasattr(model, "load"): + model.load() + + # --------------------------- + # Task setup + # --------------------------- + task_type = cfg["task"]["type"] + metrics_cfg = cfg.get("evaluation", {}).get("metrics", []) + + if video_col is None or video_col not in df.columns: + raise ValueError("data.video_path_column must be present in the CSV.") + + if label_col is None or label_col not in df.columns: + raise ValueError( + "data.label_column must be present in the CSV for both " + "classification and description tasks." + ) + + allowed_labels: List[str] = [] + if task_type == "classification": + allowed_labels = list(cfg["task"]["labels"]) + df[label_col] = df[label_col].astype(object).where(df[label_col].notna(), "NaN") + else: + df[label_col] = df[label_col].astype(object).where(df[label_col].notna(), "") + + preds_rows: List[Dict[str, Any]] = [] + debug_rows: List[Dict[str, Any]] = [] + + y_true: List[str] = [] + y_pred: List[str] = [] + + # --------------------------- + # Progress bar counters + # --------------------------- + skipped_not_found = 0 + predict_errors = 0 + invalid_preds = 0 + + # --------------------------- + # Run inference (with progress bar) + # --------------------------- + iterator = df.iterrows() + + pbar = tqdm( + iterator, + total=len(df), + desc="Processing videos", + unit="video", + dynamic_ncols=True, + mininterval=1.0, + ) + + for i, row in pbar: + video_path = row[video_col] + gt = row[label_col] + + if not isinstance(video_path, str) or not Path(video_path).exists(): + skipped_not_found += 1 + debug_rows.append( + { + "index": int(i), + "video_path": str(video_path), + "error": "video_not_found", + } + ) + pbar.set_postfix( + skipped=skipped_not_found, errors=predict_errors, invalid=invalid_preds + ) + continue + + try: + raw = model.predict(str(video_path), prompt) + except Exception as e: + raw = "" + predict_errors += 1 + debug_rows.append( + { + "index": int(i), + "video_path": str(video_path), + "error": f"predict_exception: {repr(e)}", + } + ) + pbar.set_postfix( + skipped=skipped_not_found, errors=predict_errors, invalid=invalid_preds + ) + continue + + if task_type == "classification": + pred_label, dbg = validate_classification_output( + raw_output=str(raw), + allowed_labels=allowed_labels, + invalid_label=INVALID_LABEL, + ) + if pred_label is None: + pred_label = INVALID_LABEL + + if str(pred_label) == INVALID_LABEL: + invalid_preds += 1 + + dbg.update({"index": int(i), "video_path": str(video_path)}) + debug_rows.append(dbg) + + preds_rows.append( + { + "index": int(i), + "video_path": str(video_path), + "ground_truth": str(gt), + "raw_prediction": raw, + "prediction": str(pred_label), + } + ) + + y_true.append(str(gt)) + y_pred.append(str(pred_label)) + + elif task_type == "description": + pred_text = normalize_space(str(raw)) + preds_rows.append( + { + "index": int(i), + "video_path": str(video_path), + "ground_truth": str(gt), + "raw_prediction": raw, + "prediction": pred_text, + } + ) + else: + raise ValueError( + f"Unknown task.type '{task_type}'. Expected 'classification' " + "or 'description'." 
+ ) + + pbar.set_postfix( + skipped=skipped_not_found, errors=predict_errors, invalid=invalid_preds + ) + + # --------------------------- + # Make a single predictions DF for saving + for description evaluation + # --------------------------- + pred_df = pd.DataFrame(preds_rows) + + # --------------------------- + # Evaluation + # --------------------------- + if task_type == "classification": + metrics = evaluate_classification( + y_true=y_true, + y_pred=y_pred, + labels=allowed_labels, + metrics=metrics_cfg, + invalid_label=INVALID_LABEL, + ) + elif task_type == "description": + metrics = evaluate_description( + predictions_df=pred_df, + metrics=metrics_cfg, + cfg=cfg, + ) + + # --------------------------- + # Save artifacts + # --------------------------- + debug_df = pd.DataFrame(debug_rows) if debug_rows else pd.DataFrame() + + results = { + "experiment": exp_name, + "run_id": run_id, + "model": cfg["model"]["name"], + "task": task_type, + "num_samples": int(len(pred_df)), + "metrics": metrics, + "files": { + "predictions_csv": str(out_dir / "predictions.csv"), + "debug_csv": str(out_dir / "debug.csv"), + "results_json": str(out_dir / "results.json"), + "config_used": str(out_dir / "config_used.yaml"), + }, + } + + with open(out_dir / "results.json", "w", encoding="utf-8") as f: + json.dump(results, f, indent=2, ensure_ascii=False) + + with open(out_dir / "config_used.yaml", "w", encoding="utf-8") as f: + yaml.safe_dump(cfg, f, sort_keys=False, allow_unicode=True) + + if cfg["output"].get("save_predictions", True): + pred_df.to_csv(out_dir / "predictions.csv", index=False) + + debug_df.to_csv(out_dir / "debug.csv", index=False) + + print(f"✅ Experiment completed successfully. Saved to: {out_dir}") + + +if __name__ == "__main__": + import sys + + if len(sys.argv) < 2: + raise SystemExit("Usage: python -m runners.run_experiment path/to/config.yaml") + main(sys.argv[1])