10 changes: 8 additions & 2 deletions pyproject.toml
@@ -14,7 +14,7 @@ maintainers = [
]
license = "Apache-2.0"
readme = "README.md"
=3.10">
-requires-python = ">=3.10,<3.13"
+requires-python = ">=3.11,<3.13"
homepage = "https://github.com/sensein/sailsprep"
repository = "https://github.com/sensein/sailsprep"
documentation = "https://sensein.github.io/sailsprep"
@@ -30,7 +30,7 @@ classifiers = [
sailsprep-cli = "sailsprep.cli:main"

[tool.poetry]
-packages = [{include = "sailsprep", from = "src"}]
+packages = [{include = "sailsprep", from = "src"}, {include = "vlm_baseline"}]
requires-poetry = ">=2.0"
version = "0.0.0"

@@ -40,6 +40,12 @@ pandas = "^2.3.3"
opencv-python = "^4.12.0.88"
openpyxl = "^3.1.5"
types-pyyaml = "^6.0.12.20250915"
+decord = "^0.6.0"
+pillow = "^9.2.0"
+torch = "^2.9.1"
+transformers = "^4.57.3"
+scikit-learn = "^1.8.0"
Comment on lines +45 to +47
Contributor

critical

The versions specified for torch, transformers, and scikit-learn appear to be non-existent future versions. This will cause dependency resolution to fail during installation. Please update them to valid, existing versions. You can check PyPI for the latest stable versions.

Suggested change
torch = "^2.9.1"
transformers = "^4.57.3"
scikit-learn = "^1.8.0"
torch = "^2.3.1"
transformers = "^4.41.2"
scikit-learn = "^1.5.0"

+moviepy = "1.0.3"

[tool.poetry.group.dev]
optional = true
1 change: 1 addition & 0 deletions vlm_baseline/__init__.py
@@ -0,0 +1 @@
"""VLM baseline framework for automatic video annotation."""
65 changes: 65 additions & 0 deletions vlm_baseline/configs/ovis2/activity.yaml
@@ -0,0 +1,65 @@
experiment:
  name: activity_ovis2
  seed: 42
  description: >
    Free-text activity description of what the child is doing in the video.
    The model should produce a natural language sentence or short paragraph.

model:
  name: ovis2
  device: cuda
  precision: bf16
  max_frames: 16

data:
  ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/test.csv
Contributor

high

This configuration file contains a hardcoded absolute path for ground_truth_csv. The save_dir on line 20 is also hardcoded. This makes the code non-portable and difficult to run in different environments. Consider using environment variables or making paths relative to a configurable root directory. This issue is present in all newly added YAML configuration files.
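A minimal sketch of the environment-variable approach (the `SAILS_ROOT` variable and the `load_config` helper shown here are hypothetical, not part of this PR):

```python
import os
import yaml

def load_config(path: str) -> dict:
    """Load a YAML config, expanding environment variables in data/output paths."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    # e.g. in the YAML: ground_truth_csv: ${SAILS_ROOT}/BIDS_data/test.csv
    for section in ("data", "output"):
        for key, value in cfg.get(section, {}).items():
            if isinstance(value, str):
                cfg[section][key] = os.path.expandvars(value)
    return cfg
```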

  label_column: Activity
  video_path_column: BidsProcessed

output:
  save_dir: /orcd/scratch/bcs/001/sensein/sails/pre2annot_evaluation/activity_ovis2
  save_predictions: true

task:
  type: description
  output_format:
    style: free_text
    constraints:
      - Describe only what is visible in the video
      - Focus on the child's activity
      - Do not infer intentions or emotions unless clearly visible
      - Use short, concrete phrases
      - Do not include timestamps
      - Do not mention the camera or recording setup

prompt:
  message: |
    You are a video understanding model.

    Your task is to describe the ACTIVITY performed by the child in the video.

    Activity definition:
    A more detailed description of the events in the video.
    It should provide further detail about the child's actions.
    The description should be consistent with the broader context but more specific.

    Examples:
    - Context: motor play → Activity: gymnastics or tumbling
    - Context: book play → Activity: adult reading a book to the child
    - Context: social play → Activity: laughing, tickling, engaging with an adult

    Instructions:
    - Describe only observable actions
    - Use ONLY ONE short descriptive phrase or sentence
    - Do not list multiple unrelated activities
    - Do not explain or justify your answer

    Now describe the activity in the video.

evaluation:
  type: text
  metrics:
    - none
  notes: |
    Free-text activity descriptions are not automatically scored.
    Evaluation may be qualitative or based on downstream analysis.
45 changes: 45 additions & 0 deletions vlm_baseline/configs/ovis2/gesture_type.yaml
@@ -0,0 +1,45 @@
experiment:
  name: gesture_type_ovis2
  seed: 42

model:
  name: ovis2
  device: cuda
  precision: bf16
  max_frames: 16

output:
  save_dir: /orcd/scratch/bcs/001/sensein/sails/pre2annot_evaluation/gesture_type_ovis2
  save_predictions: true

data:
  video_path_column: BidsProcessed
  ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/anotated_processed.csv
Contributor

medium

There appears to be a typo in the filename: anotated_processed.csv. It should likely be annotated_processed.csv. This will cause a FileNotFoundError if the file is named correctly on disk. This typo appears in other configuration files as well.

  ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/annotated_processed.csv

  label_column: Gesture_type

task:
  type: classification
  labels:
    - reach
    - show
    - give
    - point
    - take
    - head shake
    - head nod
    - wave
    - clap
    - sign
    - multiple
    - NaN

prompt:
  message: |
    You are a video understanding model.
    Given the following video, classify the gesture being performed.
    Possible labels are:[reach,show,give,point,take,head shake,head nod,wave,clap,sign,multiple,NaN].
    Answer with only the label.

evaluation:
  metrics:
    - accuracy
35 changes: 35 additions & 0 deletions vlm_baseline/configs/ovis2/gestures.yaml
@@ -0,0 +1,35 @@
experiment:
  name: gestures_ovis2
  seed: 42

model:
  name: ovis2
  device: cuda
  precision: bf16
  max_frames: 16

output:
  save_dir: /orcd/scratch/bcs/001/sensein/sails/pre2annot_evaluation/gestures_ovis2
  save_predictions: true

data:
  video_path_column: BidsProcessed
  ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/anotated_processed.csv
  label_column: Gestures

task:
  type: classification
  labels:
    - yes
    - no

prompt:
  message: |
    You are a video understanding model.
    Given the following video, Indicate whether the child engages in a motion or motions that signal(s) or attempt(s) to signal nonverbal communication.
    Possible labels are:[yes,no].
    Answer with only the label.

evaluation:
  metrics:
    - accuracy
41 changes: 41 additions & 0 deletions vlm_baseline/configs/ovis2/interaction_w_child.yaml
@@ -0,0 +1,41 @@
experiment:
  name: interaction_w_child_ovis2
  seed: 42

model:
  name: ovis2
  device: cuda
  precision: bf16
  max_frames: 16

output:
  save_dir: /orcd/scratch/bcs/001/sensein/sails/pre2annot_evaluation/interaction_w_child_ovis2
  save_predictions: true

data:
  video_path_column: BidsProcessed
  ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/anotated_processed.csv
  label_column: Interaction_with_child

task:
  type: classification
  labels:
    - yes
    - no


prompt:
  message: |
    You are a video understanding model.
    Given the following video, indicate whether there is a social interaction or attempt directed toward the target child.
    This could be by the person filming the video or another individual in the video.

    Rate "yes" if there is a social interaction or an attempt directed toward the child (e.g., the person filming asks the child questions or makes a comment to the child).
    Rate "no" if there is no interaction/attempt directed toward the child (e.g., the video is of the child paging through a book on their own with no one engaging them).

    Possible labels are: [yes, no].
    Answer with only the label.

evaluation:
  metrics:
    - accuracy
39 changes: 39 additions & 0 deletions vlm_baseline/configs/ovis2/response_to_name.yaml
@@ -0,0 +1,39 @@
experiment:
  name: response_to_name_ovis2
  seed: 42

model:
  name: ovis2
  device: cuda
  precision: bf16
  max_frames: 16

output:
  save_dir: /orcd/scratch/bcs/001/sensein/sails/pre2annot_evaluation/response_to_name_ovis2
  save_predictions: true

data:
  video_path_column: BidsProcessed
  ground_truth_csv: /orcd/scratch/bcs/001/sensein/sails/BIDS_data/anotated_processed.csv
  label_column: Response_to_name

task:
  type: classification
  labels:
    - yes
    - no
    - inconsistent
    - NaN

prompt:
  message: |
    You are a video understanding model.
    Given the following video, Indicate whether an overt attempt to gain child’s attention via use of their name or nickname occurs. Do not include use of name in conversation. Rate “yes” if the child looks
    toward the person saying their name immediately. Rate “no” if the child does not react to hearing their name. Rate “inconsistent” if multiple attempts are made and different
    responses are observed. Rate NaN” if no overt attempt is made.
Contributor

medium

The prompt contains Rate NaN”, which has a non-standard closing quote. This should be Rate "NaN" to be consistent with the other examples. Also, for better readability in the YAML file, this long line could be wrapped using | or > and newlines.

    responses are observed. Rate "NaN" if no overt attempt is made.

    Possible labels are:[yes,no,inconsistent,NaN].
    Answer with only the label.

evaluation:
  metrics:
    - accuracy
112 changes: 112 additions & 0 deletions vlm_baseline/documentation.md
@@ -0,0 +1,112 @@
# VLM Baseline Evaluation — Documentation

## Overview

This `vlm_baseline` folder provides a baseline framework for automatic annotation of videos using Video-Language Models (VLMs). The primary goal is to automate the manual annotation process currently performed on SAILS videos.

### Key Concepts

- **Automatic Annotation**: Videos that are currently manually annotated will be processed automatically using VLMs.
- **Annotation Types**:
  - **Classifications**: Categorical labels (e.g., gesture types)
  - **Descriptions**: Free-text descriptions of video content (e.g., activity)
- **Evaluation**: Different metrics are used for each annotation type to evaluate VLM performance.
- **Inference Process**: Run VLM inference on all available videos and compare predictions against ground truth annotations.
- **Output Format**: Processed videos come from the BIDS folder, and evaluation results are saved to the locations specified in the configuration file.

**Key Architecture Principle:**

- `models/` handles model interaction - if you want to evaluate a new VLM, implement its wrapper here
- `postprocessing/` converts raw VLM output into task-specific prediction format
- `evaluation/` computes metrics comparing predictions vs. ground truth
- `runners/` orchestrates the entire pipeline (config loading, data iteration, output saving, evaluation)
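For orientation, the layout these components imply looks roughly like this (a sketch; only `runners/run_prediction.py` and the `configs/ovis2/` files appear explicitly in this diff):

```
vlm_baseline/
├── configs/           # one YAML per model + task experiment (e.g. configs/ovis2/gestures.yaml)
├── models/            # thin wrappers around VLM backends (BaseVLM subclasses)
├── postprocessing/    # raw VLM output → task-specific predictions
├── evaluation/        # metrics comparing predictions vs. ground truth
└── runners/
    └── run_prediction.py   # loads the config, iterates videos, saves outputs, evaluates
```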

## How to Run

Start an srun session with a GPU, then run the following from the repo root:

```bash
poetry run python vlm_baseline/runners/run_prediction.py vlm_baseline/configs/ovis2/response_to_name.yaml
```


## Configuration File (YAML)

A config defines one complete experiment (one model + one task + one dataset + one prompt + one output directory). To try a VLM on a particular annotation prediction, create a new configuration file with the same structure as the existing ones.
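As a compact sketch (values are illustrative and mirror the `configs/ovis2/*.yaml` files added in this PR), the five pieces map onto the YAML sections like this:

```yaml
experiment:
  name: my_experiment        # illustrative name
  seed: 42

model:                       # one model
  name: ovis2
  device: cuda
  precision: bf16
  max_frames: 16

data:                        # one dataset
  ground_truth_csv: /path/to/annotations.csv
  video_path_column: BidsProcessed
  label_column: Gestures

task:                        # one task
  type: classification
  labels: [yes, no]

prompt:                      # one prompt
  message: |
    You are a video understanding model. ...

output:                      # one output directory
  save_dir: /path/to/results
  save_predictions: true

evaluation:
  metrics:
    - accuracy
```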

## Models (models/)

This folder contains thin wrappers around VLM backends (Ovis2, Qwen2.5, …).
Each wrapper loads the model, runs inference on a video and prompt, and returns the raw generated text.

## Postprocessing (postprocessing/)

Postprocessing converts raw model output into the prediction type expected by the task, then validates the postprocessed output.
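As an illustration of what such a step could look like for a classification task (a sketch; the actual function names in `postprocessing/` may differ):

```python
def postprocess_classification(raw_text: str, labels: list[str]) -> str:
    """Map raw VLM output to one of the allowed labels, or 'invalid' if none match."""
    # Models often add punctuation or extra words around the label.
    cleaned = raw_text.strip().lower().rstrip(".")
    # Exact match first, then substring match against the allowed label set.
    for label in labels:
        if cleaned == label.lower():
            return label
    for label in labels:
        if label.lower() in cleaned:
            return label
    return "invalid"
```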

## Evaluation (evaluation/)

Evaluation metrics depend on `task.type`. For free-text tasks, no metrics are implemented yet.

### Classification Evaluation

Common metrics include:
- **Accuracy** (though not always the most relevant metric for unbalanced datasets)
- Macro-F1 / Weighted-F1
- Per-class precision/recall/F1
- Confusion matrix

**Inputs**: Ground truth labels from CSV vs. postprocessed predictions
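A sketch of how these metrics could be computed with scikit-learn (already a dependency in this PR); the actual code in `evaluation/` may differ:

```python
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)

def evaluate_classification(y_true: list[str], y_pred: list[str]) -> dict:
    """Compute the classification metrics listed above."""
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average="macro"),
        "weighted_f1": f1_score(y_true, y_pred, average="weighted"),
        # Per-class precision / recall / F1 as a nested dict.
        "per_class": classification_report(y_true, y_pred, output_dict=True, zero_division=0),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
    }
```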

### How to add a new model

## How to Add a New Model
Comment on lines +60 to +62
Contributor

medium

There is a duplicate section header here. Line 60 has ### How to add a new model and line 62 has ## How to Add a New Model. One of them should be removed to avoid confusion.

Suggested change
-### How to add a new model
-## How to Add a New Model
+## How to Add a New Model


To integrate a new VLM into the baseline framework, follow these steps:

### 1. Create Model Wrapper

Create a new file `models/<new_model>.py` with a class that inherits from `BaseVLM`:

```python
class NewModelVLM(BaseVLM):
    def load(self):
        # Load weights/processor, set device, eval mode
        pass

    def generate(self, video_path, prompt, video_cfg=None, gen_cfg=None):
        # Implement inference logic
        # Return VLMRawOutput
        pass

    # Usually no need to override predict()
```

### 2. Register the Model

Update `models/__init__.py`:

- Import your new class
- Add a case in the `load_model()` function for your model's `config["name"]`
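A sketch of what that registration could look like (the existing `load_model()` body and the `new_model` module path are assumptions, not shown in this PR):

```python
# models/__init__.py (sketch)
from .new_model import NewModelVLM  # hypothetical module holding the new wrapper

def load_model(config: dict):
    """Return the VLM wrapper selected by config['name']."""
    name = config["name"]
    if name == "your_model_name":   # matches model.name in the YAML config
        return NewModelVLM(config)
    # ... existing cases (e.g. ovis2) stay as they are ...
    raise ValueError(f"Unknown model name: {name}")
```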

### 3. Create Configuration

Add a config YAML file under `configs/<new_model>/...yaml` with at least the annotation description and prompt, plus the model configuration:

```yaml
model:
name: "your_model_name"
model_path: "HF_repo_id" # or local path
device: "cuda"
precision: "bf16"

```

### 4. Test the Integration

Run your existing runner with the new config:

```bash
poetry run python vlm_baseline/runners/run_prediction.py vlm_baseline/configs/<new_model>/your_config.yaml
```

**Note**: Downstream postprocessing automatically determines whether it's a classification or free-text task based on the configuration.