# configs/pretrain/vitl16.yaml

# Application selector and job size.
app: vjepa
# 16 nodes x 8 tasks per node = 128 tasks in total.
# NOTE(review): presumably one task per accelerator — confirm against the launcher.
nodes: 16
tasks_per_node: 8
# Input data settings.
data:
  dataset_type: VideoDataset
  # Placeholder paths: replace each entry with the real location of the CSV
  # index file it names (Kinetics-710, SSv2, HowTo100M) before running.
  datasets:
    - /your_path_to_kinetics710_csv_file_index.csv
    - /your_path_to_ssv2_csv_file_index.csv
    - /your_path_to_howto100m_csv_file_index.csv
  decode_one_clip: true
  # NOTE(review): presumably a per-task batch size rather than a global one — confirm.
  batch_size: 24
  num_clips: 1
  num_frames: 16
  tubelet_size: 2
  sampling_rate: 4
  # 224 / 16 = 14, so crop_size is an exact multiple of patch_size.
  crop_size: 224
  patch_size: 16
  pin_mem: true
  num_workers: 12
  filter_short_videos: false
  # Explicit null: no clip duration is set.
  clip_duration: null
# Data augmentation settings.
data_aug:
  auto_augment: false
  motion_shift: false
  # Two-element numeric pairs, written in flow style so their layout does not
  # depend on block-sequence indentation.
  # NOTE(review): presumably [min, max] bounds — confirm against the consumer.
  random_resize_aspect_ratio: [0.75, 1.35]
  random_resize_scale: [0.3, 1.0]
  reprob: 0.0
# Output location settings.
logging:
  # Placeholder: replace with a real absolute path. Per the placeholder text,
  # this is where logs and checkpoints are saved.
  folder: /your_absolute_file_path_for_saving_logs_and_checkpoints/
  write_tag: jepa
# Loss settings.
loss:
  loss_exp: 1.0
  # NOTE(review): a coefficient of 0.0 presumably switches the regularization
  # term off — confirm against the training code.
  reg_coeff: 0.0
# Two mask configurations. They differ only in num_blocks (8 vs 2) and
# spatial_scale (0.15 vs 0.7); every other field is the same in both.
mask:
  - aspect_ratio: [0.75, 1.5]
    num_blocks: 8
    spatial_scale: [0.15, 0.15]
    temporal_scale: [1.0, 1.0]
    max_temporal_keep: 1.0
    max_keep: null
  - aspect_ratio: [0.75, 1.5]
    num_blocks: 2
    spatial_scale: [0.7, 0.7]
    temporal_scale: [1.0, 1.0]
    max_temporal_keep: 1.0
    max_keep: null
# Run-level settings.
meta:
  # Checkpoint loading is switched off and no checkpoint to read is given.
  load_checkpoint: false
  read_checkpoint: null
  seed: 234
  # NOTE(review): the unit of eval_freq is not visible in this file — confirm.
  eval_freq: 100
  use_sdpa: true
  dtype: bfloat16
# Model settings.
model:
  model_name: vit_large
  # NOTE(review): the pred_* keys presumably size a separate predictor network
  # (depth and embedding width) — confirm against the model code.
  pred_depth: 12
  pred_embed_dim: 384
  uniform_power: true
  use_mask_tokens: true
  zero_init_mask_tokens: true
# Optimizer and schedule settings.
optimization:
  # NOTE(review): ipe is presumably "iterations per epoch" — confirm.
  ipe: 300
  ipe_scale: 1.25
  clip_grad: 10.0
  weight_decay: 0.04
  final_weight_decay: 0.4
  epochs: 300
  warmup: 40
  start_lr: 0.0002
  lr: 0.000625
  # Keep the decimal point and signed exponent: some YAML 1.1 loaders
  # (e.g. PyYAML) load exponent forms without a decimal point as strings.
  final_lr: 1.0e-06
  # Two-element numeric pair, written in flow style so its layout does not
  # depend on block-sequence indentation.
  # NOTE(review): presumably [start, end] EMA momentum values — confirm.
  ema: [0.998, 1.0]