diff --git a/roboverse_learn/il/README.md b/roboverse_learn/il/README.md
new file mode 100644
index 000000000..11be6f589
--- /dev/null
+++ b/roboverse_learn/il/README.md
@@ -0,0 +1,48 @@
+# RoboVerse Imitation Learning (IL) Policies
+
+## Example Usage
+
+Pick a policy folder and follow its README for setup and usage.
+
+Example:
+
+```bash
+# From the repo root
+cd roboverse_learn/il/dp   # or fm/, vita/ depending on the policy
+pip install -r requirements.txt
+cd ../../..
+
+# Run policy training and evaluation (example: diffusion policy, DiT backbone)
+bash roboverse_learn/il/il_run.sh --task_name_set close_box --algo_choose ddpm_dit
+```
+
+We keep each policy as self-contained as possible (code, dependencies, docs) and only share the minimum common abstractions.
+
+## Troubleshooting
+
+```bash
+# Fix potential package version issues
+bash roboverse_learn/il/il_setup.sh
+```
+
+## Supported Algorithms
+
+| Name | Policy | Backbone | Model Config | Ref |
+| --- | --- | --- | --- | --- |
+| `ddpm_dit` | Diffusion Policy (DDPM) | DiT | `model_config/ddpm_dit_model.yaml` | [1], [5] |
+| `fm_dit` | Flow Matching | DiT | `model_config/fm_dit_model.yaml` | [6], [5] |
+| `vita` | VITA Policy | MLP | `model_config/vita_model.yaml` | [7] |
+| `ddpm_unet` | Diffusion Policy (DDPM) | UNet | `model_config/ddpm_unet_model.yaml` | [1], [4] |
+| `ddim_unet` | Diffusion Policy (DDIM) | UNet | `model_config/ddim_unet_model.yaml` | [2], [4] |
+| `fm_unet` | Flow Matching | UNet | `model_config/fm_unet_model.yaml` | [6] |
+| `score_unet` | Score-Based Model | UNet | `model_config/score_model.yaml` | [3], [4] |
+
+### References
+
+1. Ho, Jonathan, Ajay Jain, and Pieter Abbeel. "Denoising Diffusion Probabilistic Models." (2020).  
+2. Song, Jiaming, Chenlin Meng, and Stefano Ermon. "Denoising Diffusion Implicit Models." (2021).  
+3. Song, Yang, et al. "Score-Based Generative Modeling through Stochastic Differential Equations." (2021).  
+4. Chi, Cheng, et al. "Diffusion Policy: Visuomotor Policy Learning via Action Diffusion." (2023).  
+5. Peebles, William, and Saining Xie. "Scalable Diffusion Models with Transformers." (2023).  
+6. Lipman, Yaron, et al. "Flow Matching for Generative Modeling." (2023).  
+7. Gao, Dechen, et al. "VITA: Vision-to-Action Flow Matching Policy." (2025).
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/mingpt/__init__.py b/roboverse_learn/il/__init__.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/mingpt/__init__.py
rename to roboverse_learn/il/__init__.py
diff --git a/roboverse_learn/il/act/utils.py b/roboverse_learn/il/act/utils.py
index 5d7022d1f..66e15b33b 100644
--- a/roboverse_learn/il/act/utils.py
+++ b/roboverse_learn/il/act/utils.py
@@ -6,7 +6,7 @@
 import h5py
 import json
 from torch.utils.data import TensorDataset, DataLoader
-from roboverse_learn.il.utils.common.replay_buffer import ReplayBuffer
+from roboverse_learn.il.utils.replay_buffer import ReplayBuffer
 
 import IPython
 e = IPython.embed
diff --git a/roboverse_learn/il/base/__init__.py b/roboverse_learn/il/base/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/dataset/base_dataset.py b/roboverse_learn/il/base/base_dataset.py
similarity index 94%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/dataset/base_dataset.py
rename to roboverse_learn/il/base/base_dataset.py
index 79ce1e528..68830efde 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/dataset/base_dataset.py
+++ b/roboverse_learn/il/base/base_dataset.py
@@ -2,7 +2,8 @@
 
 import torch
 import torch.nn
-from diffusion_policy.model.common.normalizer import LinearNormalizer
+
+from roboverse_learn.il.utils.normalizer import LinearNormalizer
 
 
 class BaseLowdimDataset(torch.utils.data.Dataset):
diff --git a/roboverse_learn/il/dp/base/base_eval_runner.py b/roboverse_learn/il/base/base_eval_runner.py
similarity index 99%
rename from roboverse_learn/il/dp/base/base_eval_runner.py
rename to roboverse_learn/il/base/base_eval_runner.py
index 3ffe8c714..9f2b24407 100644
--- a/roboverse_learn/il/dp/base/base_eval_runner.py
+++ b/roboverse_learn/il/base/base_eval_runner.py
@@ -1,4 +1,4 @@
-from dp.runner.base_policy import BasePolicyCfg
+from roboverse_learn.il.runner.base_policy import BasePolicyCfg
 
 try:
     from curobo.types.math import Pose
@@ -12,7 +12,7 @@
 import torch
 from loguru import logger as log
 from metasim.scenario.scenario import ScenarioCfg
-from roboverse_learn.il.utils.common.pytorch_util import dict_apply
+from roboverse_learn.il.utils.pytorch_util import dict_apply
 
 
 class BaseEvalRunner:
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/policy/base_image_policy.py b/roboverse_learn/il/base/base_image_policy.py
similarity index 81%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/policy/base_image_policy.py
rename to roboverse_learn/il/base/base_image_policy.py
index fa5a4751f..beafab037 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/policy/base_image_policy.py
+++ b/roboverse_learn/il/base/base_image_policy.py
@@ -1,8 +1,8 @@
 from typing import Dict
 
 import torch
-from diffusion_policy.model.common.module_attr_mixin import ModuleAttrMixin
-from diffusion_policy.model.common.normalizer import LinearNormalizer
+from roboverse_learn.il.utils.module_attr_mixin import ModuleAttrMixin
+from roboverse_learn.il.utils.normalizer import LinearNormalizer
 
 
 class BaseImagePolicy(ModuleAttrMixin):
diff --git a/roboverse_learn/il/dp/base/base_model.py b/roboverse_learn/il/base/base_model.py
similarity index 100%
rename from roboverse_learn/il/dp/base/base_model.py
rename to roboverse_learn/il/base/base_model.py
diff --git a/roboverse_learn/il/dp/base/base_runner.py b/roboverse_learn/il/base/base_runner.py
similarity index 100%
rename from roboverse_learn/il/dp/base/base_runner.py
rename to roboverse_learn/il/base/base_runner.py
diff --git a/roboverse_learn/il/configs/__init__.py b/roboverse_learn/il/configs/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/roboverse_learn/il/dp/configs/dataset_config/robot_image_dataset.yaml b/roboverse_learn/il/configs/dataset_config/robot_image_dataset.yaml
similarity index 72%
rename from roboverse_learn/il/dp/configs/dataset_config/robot_image_dataset.yaml
rename to roboverse_learn/il/configs/dataset_config/robot_image_dataset.yaml
index 1b15af977..6012c131c 100644
--- a/roboverse_learn/il/dp/configs/dataset_config/robot_image_dataset.yaml
+++ b/roboverse_learn/il/configs/dataset_config/robot_image_dataset.yaml
@@ -1,4 +1,4 @@
-_target_: dp.datasets.robot_image_dataset.RobotImageDataset
+_target_: roboverse_learn.il.datasets.robot_image_dataset.RobotImageDataset
 zarr_path: data_policy/useless.zarr
 horizon: ${horizon}
 pad_before: ${eval:'${n_obs_steps}-1'}
diff --git a/roboverse_learn/il/dp/configs/dp_runner.yaml b/roboverse_learn/il/configs/dp_runner.yaml
similarity index 90%
rename from roboverse_learn/il/dp/configs/dp_runner.yaml
rename to roboverse_learn/il/configs/dp_runner.yaml
index a5febd94d..d18623539 100644
--- a/roboverse_learn/il/dp/configs/dp_runner.yaml
+++ b/roboverse_learn/il/configs/dp_runner.yaml
@@ -1,13 +1,13 @@
 defaults:
   - _self_
   - dataset_config: robot_image_dataset
-  - model_config:  ${oc.env:algo_model,ddpm_model}      # diffusion_policy_model/fm_model/DDIM_model
+  - model_config:  ${oc.env:algo_model,ddpm_dit_model}
   - eval_config: diffusion_policy_eval
   - train_config: diffusion_policy_train
 
 task_name: placeholder
 name: robot_${task_name}
-_target_: dp.runner.dp_runner.DPRunner
+_target_: roboverse_learn.il.runner.dp_runner.DPRunner
 
 
 image_shape: &image_shape [3, 256, 256]
diff --git a/roboverse_learn/il/dp/configs/eval_config/diffusion_policy_eval.yaml b/roboverse_learn/il/configs/eval_config/diffusion_policy_eval.yaml
similarity index 86%
rename from roboverse_learn/il/dp/configs/eval_config/diffusion_policy_eval.yaml
rename to roboverse_learn/il/configs/eval_config/diffusion_policy_eval.yaml
index 25768449f..8633a210f 100644
--- a/roboverse_learn/il/dp/configs/eval_config/diffusion_policy_eval.yaml
+++ b/roboverse_learn/il/configs/eval_config/diffusion_policy_eval.yaml
@@ -1,5 +1,5 @@
 eval_args:
-  _target_: roboverse_learn.il.utils.common.eval_args.Args
+  _target_: roboverse_learn.il.utils.eval_args.Args
   # random:
     # _target_: metasim.cfg.randomization.RandomizationCfg
     # level: 0
diff --git a/roboverse_learn/il/dp/configs/model_config/ddim_unet_model.yaml b/roboverse_learn/il/configs/model_config/ddim_unet_model.yaml
similarity index 79%
rename from roboverse_learn/il/dp/configs/model_config/ddim_unet_model.yaml
rename to roboverse_learn/il/configs/model_config/ddim_unet_model.yaml
index ef0fd5da5..05286fd9b 100644
--- a/roboverse_learn/il/dp/configs/model_config/ddim_unet_model.yaml
+++ b/roboverse_learn/il/configs/model_config/ddim_unet_model.yaml
@@ -1,4 +1,4 @@
-_target_: dp.models.ddim_unet_image_policy.DiffusionUnetImagePolicy
+_target_: roboverse_learn.il.dp.policies.ddim_unet_image_policy.DiffusionUnetImagePolicy
 
 shape_meta: ${shape_meta}
 
@@ -13,10 +13,10 @@ noise_scheduler:
   prediction_type: epsilon # or sample
 
 obs_encoder:
-  _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder
+  _target_: roboverse_learn.il.dp.models.vision.multi_image_obs_encoder.MultiImageObsEncoder
   shape_meta: ${shape_meta}
   rgb_model:
-    _target_: diffusion_policy.model.vision.model_getter.get_resnet
+    _target_: roboverse_learn.il.dp.models.vision.model_getter.get_resnet
     name: resnet18
     weights: null
   resize_shape: null
diff --git a/roboverse_learn/il/dp/configs/model_config/ddpm_dit_model.yaml b/roboverse_learn/il/configs/model_config/ddpm_dit_model.yaml
similarity index 78%
rename from roboverse_learn/il/dp/configs/model_config/ddpm_dit_model.yaml
rename to roboverse_learn/il/configs/model_config/ddpm_dit_model.yaml
index f3806a9c4..bb453cb7c 100644
--- a/roboverse_learn/il/dp/configs/model_config/ddpm_dit_model.yaml
+++ b/roboverse_learn/il/configs/model_config/ddpm_dit_model.yaml
@@ -1,4 +1,4 @@
-_target_: dp.models.ddpm_dit_image_policy.DiffusionDiTImagePolicy
+_target_: roboverse_learn.il.dp.policies.ddpm_dit_image_policy.DiffusionDiTImagePolicy
 
 shape_meta: ${shape_meta}
 
@@ -13,10 +13,10 @@ noise_scheduler:
   prediction_type: epsilon # or sample
 
 obs_encoder:
-  _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder
+  _target_: roboverse_learn.il.dp.models.vision.multi_image_obs_encoder.MultiImageObsEncoder
   shape_meta: ${shape_meta}
   rgb_model:
-    _target_: diffusion_policy.model.vision.model_getter.get_resnet
+    _target_: roboverse_learn.il.dp.models.vision.model_getter.get_resnet
     name: resnet18
     weights: null
   resize_shape: null
diff --git a/roboverse_learn/il/dp/configs/model_config/ddpm_unet_model.yaml b/roboverse_learn/il/configs/model_config/ddpm_unet_model.yaml
similarity index 79%
rename from roboverse_learn/il/dp/configs/model_config/ddpm_unet_model.yaml
rename to roboverse_learn/il/configs/model_config/ddpm_unet_model.yaml
index 4d14c065c..575fd912b 100644
--- a/roboverse_learn/il/dp/configs/model_config/ddpm_unet_model.yaml
+++ b/roboverse_learn/il/configs/model_config/ddpm_unet_model.yaml
@@ -1,4 +1,4 @@
-_target_: dp.models.ddpm_unet_image_policy.DiffusionUnetImagePolicy
+_target_: roboverse_learn.il.dp.policies.ddpm_unet_image_policy.DiffusionUnetImagePolicy
 
 shape_meta: ${shape_meta}
 
@@ -13,10 +13,10 @@ noise_scheduler:
   prediction_type: epsilon # or sample
 
 obs_encoder:
-  _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder
+  _target_: roboverse_learn.il.dp.models.vision.multi_image_obs_encoder.MultiImageObsEncoder
   shape_meta: ${shape_meta}
   rgb_model:
-    _target_: diffusion_policy.model.vision.model_getter.get_resnet
+    _target_: roboverse_learn.il.dp.models.vision.model_getter.get_resnet
     name: resnet18
     weights: null
   resize_shape: null
diff --git a/roboverse_learn/il/dp/configs/model_config/fm_dit_model.yaml b/roboverse_learn/il/configs/model_config/fm_dit_model.yaml
similarity index 68%
rename from roboverse_learn/il/dp/configs/model_config/fm_dit_model.yaml
rename to roboverse_learn/il/configs/model_config/fm_dit_model.yaml
index aa2417783..4779783c2 100644
--- a/roboverse_learn/il/dp/configs/model_config/fm_dit_model.yaml
+++ b/roboverse_learn/il/configs/model_config/fm_dit_model.yaml
@@ -1,13 +1,13 @@
-_target_: dp.models.fm_dit_image_policy.FlowMatchingDiTImagePolicy
+_target_: roboverse_learn.il.fm.policies.fm_dit_image_policy.FlowMatchingDiTImagePolicy
 
 
 shape_meta: ${shape_meta}
 
 obs_encoder:
-  _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder
+  _target_: roboverse_learn.il.dp.models.vision.multi_image_obs_encoder.MultiImageObsEncoder
   shape_meta: ${shape_meta}
   rgb_model:
-    _target_: diffusion_policy.model.vision.model_getter.get_resnet
+    _target_: roboverse_learn.il.dp.models.vision.model_getter.get_resnet
     name: resnet18
     weights: null
   resize_shape: null
diff --git a/roboverse_learn/il/dp/configs/model_config/fm_unet_model.yaml b/roboverse_learn/il/configs/model_config/fm_unet_model.yaml
similarity index 68%
rename from roboverse_learn/il/dp/configs/model_config/fm_unet_model.yaml
rename to roboverse_learn/il/configs/model_config/fm_unet_model.yaml
index 61306a6de..41bb00e90 100644
--- a/roboverse_learn/il/dp/configs/model_config/fm_unet_model.yaml
+++ b/roboverse_learn/il/configs/model_config/fm_unet_model.yaml
@@ -1,13 +1,13 @@
-_target_: dp.models.fm_unet_image_policy.FlowMatchingUnetImagePolicy
+_target_: roboverse_learn.il.fm.policies.fm_unet_image_policy.FlowMatchingUnetImagePolicy
 
 
 shape_meta: ${shape_meta}
 
 obs_encoder:
-  _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder
+  _target_: roboverse_learn.il.dp.models.vision.multi_image_obs_encoder.MultiImageObsEncoder
   shape_meta: ${shape_meta}
   rgb_model:
-    _target_: diffusion_policy.model.vision.model_getter.get_resnet
+    _target_: roboverse_learn.il.dp.models.vision.model_getter.get_resnet
     name: resnet18
     weights: null
   resize_shape: null
diff --git a/roboverse_learn/il/dp/configs/model_config/score_model.yaml b/roboverse_learn/il/configs/model_config/score_model.yaml
similarity index 79%
rename from roboverse_learn/il/dp/configs/model_config/score_model.yaml
rename to roboverse_learn/il/configs/model_config/score_model.yaml
index 211bcac1b..60e21dcaa 100644
--- a/roboverse_learn/il/dp/configs/model_config/score_model.yaml
+++ b/roboverse_learn/il/configs/model_config/score_model.yaml
@@ -1,4 +1,4 @@
-_target_: dp.models.score_unet_image_policy.ScoreMatchingUnetImagePolicy
+_target_: roboverse_learn.il.dp.policies.score_unet_image_policy.ScoreMatchingUnetImagePolicy
 
 shape_meta: ${shape_meta}
 
@@ -13,10 +13,10 @@ noise_scheduler:
   prediction_type: epsilon # or sample
 
 obs_encoder:
-  _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder
+  _target_: roboverse_learn.il.dp.models.vision.multi_image_obs_encoder.MultiImageObsEncoder
   shape_meta: ${shape_meta}
   rgb_model:
-    _target_: diffusion_policy.model.vision.model_getter.get_resnet
+    _target_: roboverse_learn.il.dp.models.vision.model_getter.get_resnet
     name: resnet18
     weights: null
   resize_shape: null
diff --git a/roboverse_learn/il/dp/configs/model_config/vita_model.yaml b/roboverse_learn/il/configs/model_config/vita_model.yaml
similarity index 76%
rename from roboverse_learn/il/dp/configs/model_config/vita_model.yaml
rename to roboverse_learn/il/configs/model_config/vita_model.yaml
index e55c04258..378dc73ae 100644
--- a/roboverse_learn/il/dp/configs/model_config/vita_model.yaml
+++ b/roboverse_learn/il/configs/model_config/vita_model.yaml
@@ -1,13 +1,13 @@
-_target_: dp.models.vita_policy.VITAImagePolicy
+_target_: roboverse_learn.il.vita.policies.vita_policy.VITAImagePolicy
 
 
 shape_meta: ${shape_meta}
 
 obs_encoder:
-  _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder
+  _target_: roboverse_learn.il.dp.models.vision.multi_image_obs_encoder.MultiImageObsEncoder
   shape_meta: ${shape_meta}
   rgb_model:
-    _target_: diffusion_policy.model.vision.model_getter.get_resnet
+    _target_: roboverse_learn.il.dp.models.vision.model_getter.get_resnet
     name: resnet18
     weights: null
   resize_shape: null
@@ -31,7 +31,7 @@ latent_dim: 512
 
 # Flow matcher parameters
 flow_matcher:
-  _target_: diffusion_policy.common.flow_matchers.ExactOptimalTransportConditionalFlowMatcher
+  _target_: roboverse_learn.il.utils.flow_matchers.ExactOptimalTransportConditionalFlowMatcher
   sigma: 0.0
   num_sampling_steps: 6
 
diff --git a/roboverse_learn/il/dp/configs/train_config/diffusion_policy_train.yaml b/roboverse_learn/il/configs/train_config/diffusion_policy_train.yaml
similarity index 93%
rename from roboverse_learn/il/dp/configs/train_config/diffusion_policy_train.yaml
rename to roboverse_learn/il/configs/train_config/diffusion_policy_train.yaml
index 92e5426e0..edb213072 100644
--- a/roboverse_learn/il/dp/configs/train_config/diffusion_policy_train.yaml
+++ b/roboverse_learn/il/configs/train_config/diffusion_policy_train.yaml
@@ -46,7 +46,7 @@ val_dataloader:
   persistent_workers: False
 
 ema:
-  _target_: diffusion_policy.model.diffusion.ema_model.EMAModel
+  _target_: roboverse_learn.il.dp.models.diffusion.ema_model.EMAModel
   update_after_step: 0
   inv_gamma: 1.0
   power: 0.75
diff --git a/roboverse_learn/il/datasets/__init__.py b/roboverse_learn/il/datasets/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/roboverse_learn/il/dp/base/base_dataset.py b/roboverse_learn/il/datasets/base_dataset.py
similarity index 94%
rename from roboverse_learn/il/dp/base/base_dataset.py
rename to roboverse_learn/il/datasets/base_dataset.py
index e29862e47..e2d04a803 100644
--- a/roboverse_learn/il/dp/base/base_dataset.py
+++ b/roboverse_learn/il/datasets/base_dataset.py
@@ -2,8 +2,7 @@
 
 import torch
 import torch.nn
-
-from roboverse_learn.il.utils.common.normalizer import LinearNormalizer
+from roboverse_learn.il.utils.normalizer import LinearNormalizer
 
 
 class BaseLowdimDataset(torch.utils.data.Dataset):
diff --git a/roboverse_learn/il/dp/datasets/robot_image_dataset.py b/roboverse_learn/il/datasets/robot_image_dataset.py
similarity index 93%
rename from roboverse_learn/il/dp/datasets/robot_image_dataset.py
rename to roboverse_learn/il/datasets/robot_image_dataset.py
index 9d3596f39..f8be9376c 100644
--- a/roboverse_learn/il/dp/datasets/robot_image_dataset.py
+++ b/roboverse_learn/il/datasets/robot_image_dataset.py
@@ -4,18 +4,17 @@
 import numba
 import numpy as np
 import torch
-from termcolor import cprint
-
-from dp.base.base_dataset import BaseImageDataset
-from roboverse_learn.il.utils.common.normalize_util import get_image_range_normalizer
-from roboverse_learn.il.utils.common.normalizer import LinearNormalizer
-from roboverse_learn.il.utils.common.pytorch_util import dict_apply
-from roboverse_learn.il.utils.common.replay_buffer import ReplayBuffer
-from roboverse_learn.il.utils.common.sampler import (
+from roboverse_learn.il.utils.normalize_util import get_image_range_normalizer
+from roboverse_learn.il.utils.pytorch_util import dict_apply
+from roboverse_learn.il.utils.replay_buffer import ReplayBuffer
+from roboverse_learn.il.utils.sampler import (
     SequenceSampler,
     downsample_mask,
     get_val_mask,
 )
+from roboverse_learn.il.base.base_dataset import BaseImageDataset
+from roboverse_learn.il.utils.normalizer import LinearNormalizer
+from termcolor import cprint
 
 
 class RobotImageDataset(BaseImageDataset):
@@ -30,6 +29,7 @@ def __init__(
         batch_size=64,
         max_train_episodes=None,
     ):
+
         super().__init__()
 
         self.replay_buffer = ReplayBuffer.copy_from_path(
diff --git a/roboverse_learn/il/utils/diffusion_policy/.gitignore b/roboverse_learn/il/dp/.gitignore
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/.gitignore
rename to roboverse_learn/il/dp/.gitignore
diff --git a/roboverse_learn/il/dp/README.md b/roboverse_learn/il/dp/README.md
index 674bb722a..62ddb4734 100644
--- a/roboverse_learn/il/dp/README.md
+++ b/roboverse_learn/il/dp/README.md
@@ -1,15 +1,10 @@
-# Flow Matching and Diffusion Based IL Policies
+# Diffusion Policy
 
 ## 1. Install
 
 ```bash
-cd roboverse_learn/il/utils/diffusion_policy
-
-pip install -e .
-
-cd ../../../../
-
-pip install pandas wandb
+cd roboverse_learn/il/dp
+pip install -r requirements.txt
 ```
 
 Register for a Weights & Biases (wandb) account to obtain an API key.
@@ -39,25 +34,3 @@ eval_enable=False
 train_enable=False
 eval_enable=True
 ```
-
-## Supported Algorithms
-
-| Algorithm | Backbone | Model Config | Ref |
-| --- | --- | --- | --- |
-| Diffusion Policy (DDPM) | DiT | `model_config/ddpm_dit_model.yaml` | [1], [5] |
-| Flow Matching | DiT | `model_config/fm_dit_model.yaml` | [6], [5] |
-| VITA Policy | MLP | `model_config/vita_model.yaml` | [7] |
-| Diffusion Policy (DDPM) | UNet | `model_config/ddpm_model.yaml` | [1], [4] |
-| Diffusion Policy (DDIM) | UNet | `model_config/ddim_model.yaml` | [2], [4] |
-| Flow Matching | UNet | `model_config/fm_unet_model.yaml` | [6] |
-| Score-Based Model | UNet | `model_config/score_model.yaml` | [3], [4] |
-
-### References
-
-1. Ho, Jonathan, Ajay Jain, and Pieter Abbeel. "Denoising Diffusion Probabilistic Models." (2020).  
-2. Song, Jiaming, Chenlin Meng, and Stefano Ermon. "Denoising Diffusion Implicit Models." (2021).  
-3. Song, Yang, et al. "Score-Based Generative Modeling through Stochastic Differential Equations." (2021).  
-4. Chi, Cheng, et al. "Diffusion Policy: Diffusion Models for Robotic Manipulation." (2023).  
-5. Peebles, William, and Jun-Yan Zhu. "DiT: Diffusion Models with Transformers." (2023).  
-6. Lipman, Yaron, et al. "Flow Matching for Generative Modeling." (2023).  
-7. Gao, Dechen, et al. "VITA: Vision-to-Action Flow Matching Policy." (2025).
diff --git a/roboverse_learn/il/dp/__init__.py b/roboverse_learn/il/dp/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/action_ae/__init__.py b/roboverse_learn/il/dp/models/bet/action_ae/__init__.py
similarity index 96%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/action_ae/__init__.py
rename to roboverse_learn/il/dp/models/bet/action_ae/__init__.py
index bfaa76a40..6808f9035 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/action_ae/__init__.py
+++ b/roboverse_learn/il/dp/models/bet/action_ae/__init__.py
@@ -1,7 +1,7 @@
 import abc
 from typing import Optional, Union
 
-import diffusion_policy.model.bet.utils as utils
+import roboverse_learn.il.dp.models.bet.utils as utils
 import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/action_ae/discretizers/k_means.py b/roboverse_learn/il/dp/models/bet/action_ae/discretizers/k_means.py
similarity index 98%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/action_ae/discretizers/k_means.py
rename to roboverse_learn/il/dp/models/bet/action_ae/discretizers/k_means.py
index bae365596..e9a5effde 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/action_ae/discretizers/k_means.py
+++ b/roboverse_learn/il/dp/models/bet/action_ae/discretizers/k_means.py
@@ -3,7 +3,7 @@
 import numpy as np
 import torch
 import tqdm
-from diffusion_policy.model.common.dict_of_tensor_mixin import DictOfTensorMixin
+from roboverse_learn.il.utils.dict_of_tensor_mixin import DictOfTensorMixin
 
 
 class KMeansDiscretizer(DictOfTensorMixin):
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/latent_generators/latent_generator.py b/roboverse_learn/il/dp/models/bet/latent_generators/latent_generator.py
similarity index 97%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/latent_generators/latent_generator.py
rename to roboverse_learn/il/dp/models/bet/latent_generators/latent_generator.py
index 89120c660..36fbf4399 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/latent_generators/latent_generator.py
+++ b/roboverse_learn/il/dp/models/bet/latent_generators/latent_generator.py
@@ -1,7 +1,7 @@
 import abc
 from typing import Optional, Tuple
 
-import diffusion_policy.model.bet.utils as utils
+import roboverse_learn.il.dp.models.bet.utils as utils
 import torch
 
 
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/latent_generators/mingpt.py b/roboverse_learn/il/dp/models/bet/latent_generators/mingpt.py
similarity index 95%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/latent_generators/mingpt.py
rename to roboverse_learn/il/dp/models/bet/latent_generators/mingpt.py
index 242bebdea..55b6d7748 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/latent_generators/mingpt.py
+++ b/roboverse_learn/il/dp/models/bet/latent_generators/mingpt.py
@@ -1,13 +1,13 @@
 from typing import Optional, Tuple
 
-import diffusion_policy.model.bet.latent_generators.latent_generator as latent_generator
-import diffusion_policy.model.bet.libraries.mingpt.model as mingpt_model
-import diffusion_policy.model.bet.libraries.mingpt.trainer as mingpt_trainer
+import roboverse_learn.il.dp.models.bet.latent_generators.latent_generator as latent_generator
+import roboverse_learn.il.dp.models.bet.libraries.mingpt.model as mingpt_model
+import roboverse_learn.il.dp.models.bet.libraries.mingpt.trainer as mingpt_trainer
 import einops
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from diffusion_policy.model.bet.libraries.loss_fn import FocalLoss, soft_cross_entropy
+from roboverse_learn.il.dp.models.bet.libraries.loss_fn import FocalLoss, soft_cross_entropy
 
 
 class MinGPT(latent_generator.AbstractLatentGenerator):
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/latent_generators/transformer.py b/roboverse_learn/il/dp/models/bet/latent_generators/transformer.py
similarity index 93%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/latent_generators/transformer.py
rename to roboverse_learn/il/dp/models/bet/latent_generators/transformer.py
index 96c093374..b9baf4f48 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/latent_generators/transformer.py
+++ b/roboverse_learn/il/dp/models/bet/latent_generators/transformer.py
@@ -1,12 +1,12 @@
 from typing import Optional, Tuple
 
-import diffusion_policy.model.bet.latent_generators.latent_generator as latent_generator
+import roboverse_learn.il.dp.models.bet.latent_generators.latent_generator as latent_generator
 import einops
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from diffusion_policy.model.bet.libraries.loss_fn import FocalLoss, soft_cross_entropy
-from diffusion_policy.model.diffusion.transformer_for_diffusion import (
+from roboverse_learn.il.dp.models.bet.libraries.loss_fn import FocalLoss, soft_cross_entropy
+from roboverse_learn.il.dp.models.diffusion.transformer_for_diffusion import (
     TransformerForDiffusion,
 )
 
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/loss_fn.py b/roboverse_learn/il/dp/models/bet/libraries/loss_fn.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/loss_fn.py
rename to roboverse_learn/il/dp/models/bet/libraries/loss_fn.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/mingpt/LICENSE b/roboverse_learn/il/dp/models/bet/libraries/mingpt/LICENSE
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/mingpt/LICENSE
rename to roboverse_learn/il/dp/models/bet/libraries/mingpt/LICENSE
diff --git a/roboverse_learn/il/dp/models/bet/libraries/mingpt/__init__.py b/roboverse_learn/il/dp/models/bet/libraries/mingpt/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/mingpt/model.py b/roboverse_learn/il/dp/models/bet/libraries/mingpt/model.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/mingpt/model.py
rename to roboverse_learn/il/dp/models/bet/libraries/mingpt/model.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/mingpt/trainer.py b/roboverse_learn/il/dp/models/bet/libraries/mingpt/trainer.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/mingpt/trainer.py
rename to roboverse_learn/il/dp/models/bet/libraries/mingpt/trainer.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/mingpt/utils.py b/roboverse_learn/il/dp/models/bet/libraries/mingpt/utils.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/libraries/mingpt/utils.py
rename to roboverse_learn/il/dp/models/bet/libraries/mingpt/utils.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/utils.py b/roboverse_learn/il/dp/models/bet/utils.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/bet/utils.py
rename to roboverse_learn/il/dp/models/bet/utils.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/action_ae.py b/roboverse_learn/il/dp/models/diffusion/action_ae.py
similarity index 98%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/action_ae.py
rename to roboverse_learn/il/dp/models/diffusion/action_ae.py
index 1672e1438..9b2262bb7 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/action_ae.py
+++ b/roboverse_learn/il/dp/models/diffusion/action_ae.py
@@ -1,6 +1,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
-from diffusion_policy.model.diffusion.layers import Mlp
+from roboverse_learn.il.dp.models.diffusion.layers import Mlp
 
 
 def weights_init_encoder(m):
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/conditional_unet1d.py b/roboverse_learn/il/dp/models/diffusion/conditional_unet1d.py
similarity index 98%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/conditional_unet1d.py
rename to roboverse_learn/il/dp/models/diffusion/conditional_unet1d.py
index 630c6d2f5..8c5228110 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/conditional_unet1d.py
+++ b/roboverse_learn/il/dp/models/diffusion/conditional_unet1d.py
@@ -4,12 +4,12 @@
 import einops
 import torch
 import torch.nn as nn
-from diffusion_policy.model.diffusion.conv1d_components import (
+from roboverse_learn.il.dp.models.diffusion.conv1d_components import (
     Conv1dBlock,
     Downsample1d,
     Upsample1d,
 )
-from diffusion_policy.model.diffusion.positional_embedding import SinusoidalPosEmb
+from roboverse_learn.il.dp.models.diffusion.positional_embedding import SinusoidalPosEmb
 from einops.layers.torch import Rearrange
 
 logger = logging.getLogger(__name__)
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/conv1d_components.py b/roboverse_learn/il/dp/models/diffusion/conv1d_components.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/conv1d_components.py
rename to roboverse_learn/il/dp/models/diffusion/conv1d_components.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/ema_model.py b/roboverse_learn/il/dp/models/diffusion/ema_model.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/ema_model.py
rename to roboverse_learn/il/dp/models/diffusion/ema_model.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/flow_net.py b/roboverse_learn/il/dp/models/diffusion/flow_net.py
similarity index 98%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/flow_net.py
rename to roboverse_learn/il/dp/models/diffusion/flow_net.py
index d3447406a..1f0b446b6 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/flow_net.py
+++ b/roboverse_learn/il/dp/models/diffusion/flow_net.py
@@ -2,8 +2,8 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from diffusion_policy.model.diffusion.positional_embedding import RotaryPosEmb, SinusoidalPosEmb
-from diffusion_policy.model.diffusion.layers import Mlp
+from roboverse_learn.il.dp.models.diffusion.positional_embedding import RotaryPosEmb, SinusoidalPosEmb
+from roboverse_learn.il.dp.models.diffusion.layers import Mlp
 
 
 class Attention(nn.Module):
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/layers.py b/roboverse_learn/il/dp/models/diffusion/layers.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/layers.py
rename to roboverse_learn/il/dp/models/diffusion/layers.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/mask_generator.py b/roboverse_learn/il/dp/models/diffusion/mask_generator.py
similarity index 99%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/mask_generator.py
rename to roboverse_learn/il/dp/models/diffusion/mask_generator.py
index e6c8d6a3c..314cc18d6 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/mask_generator.py
+++ b/roboverse_learn/il/dp/models/diffusion/mask_generator.py
@@ -1,7 +1,7 @@
 from typing import Optional, Sequence
 
 import torch
-from diffusion_policy.model.common.module_attr_mixin import ModuleAttrMixin
+from roboverse_learn.il.utils.module_attr_mixin import ModuleAttrMixin
 from torch import nn
 
 
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/positional_embedding.py b/roboverse_learn/il/dp/models/diffusion/positional_embedding.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/positional_embedding.py
rename to roboverse_learn/il/dp/models/diffusion/positional_embedding.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/transformer_for_diffusion.py b/roboverse_learn/il/dp/models/diffusion/transformer_for_diffusion.py
similarity index 98%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/transformer_for_diffusion.py
rename to roboverse_learn/il/dp/models/diffusion/transformer_for_diffusion.py
index 5e7e89634..29c164a79 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/diffusion/transformer_for_diffusion.py
+++ b/roboverse_learn/il/dp/models/diffusion/transformer_for_diffusion.py
@@ -3,8 +3,8 @@
 
 import torch
 import torch.nn as nn
-from diffusion_policy.model.common.module_attr_mixin import ModuleAttrMixin
-from diffusion_policy.model.diffusion.positional_embedding import SinusoidalPosEmb
+from roboverse_learn.il.utils.module_attr_mixin import ModuleAttrMixin
+from roboverse_learn.il.dp.models.diffusion.positional_embedding import SinusoidalPosEmb
 
 logger = logging.getLogger(__name__)
 
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/vision/crop_randomizer.py b/roboverse_learn/il/dp/models/vision/crop_randomizer.py
similarity index 99%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/vision/crop_randomizer.py
rename to roboverse_learn/il/dp/models/vision/crop_randomizer.py
index 218e88dde..0817216aa 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/vision/crop_randomizer.py
+++ b/roboverse_learn/il/dp/models/vision/crop_randomizer.py
@@ -1,4 +1,4 @@
-import diffusion_policy.model.common.tensor_util as tu
+import roboverse_learn.il.utils.tensor_util as tu
 import torch
 import torch.nn as nn
 import torchvision.transforms.functional as ttf
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/vision/model_getter.py b/roboverse_learn/il/dp/models/vision/model_getter.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/vision/model_getter.py
rename to roboverse_learn/il/dp/models/vision/model_getter.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/vision/multi_image_obs_encoder.py b/roboverse_learn/il/dp/models/vision/multi_image_obs_encoder.py
similarity index 96%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/vision/multi_image_obs_encoder.py
rename to roboverse_learn/il/dp/models/vision/multi_image_obs_encoder.py
index 0214d673b..912518709 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/vision/multi_image_obs_encoder.py
+++ b/roboverse_learn/il/dp/models/vision/multi_image_obs_encoder.py
@@ -4,9 +4,9 @@
 import torch
 import torch.nn as nn
 import torchvision
-from diffusion_policy.common.pytorch_util import dict_apply, replace_submodules
-from diffusion_policy.model.common.module_attr_mixin import ModuleAttrMixin
-from diffusion_policy.model.vision.crop_randomizer import CropRandomizer
+from roboverse_learn.il.utils.pytorch_util import dict_apply, replace_submodules
+from roboverse_learn.il.utils.module_attr_mixin import ModuleAttrMixin
+from roboverse_learn.il.dp.models.vision.crop_randomizer import CropRandomizer
 
 
 class MultiImageObsEncoder(ModuleAttrMixin):
diff --git a/roboverse_learn/il/dp/models/ddim_unet_image_policy.py b/roboverse_learn/il/dp/policies/ddim_unet_image_policy.py
similarity index 95%
rename from roboverse_learn/il/dp/models/ddim_unet_image_policy.py
rename to roboverse_learn/il/dp/policies/ddim_unet_image_policy.py
index f195e8ad8..69efd3822 100644
--- a/roboverse_learn/il/dp/models/ddim_unet_image_policy.py
+++ b/roboverse_learn/il/dp/policies/ddim_unet_image_policy.py
@@ -4,16 +4,16 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from diffusers.schedulers.scheduling_ddim import DDIMScheduler
-from diffusion_policy.model.diffusion.conditional_unet1d import ConditionalUnet1D
-from diffusion_policy.model.diffusion.mask_generator import LowdimMaskGenerator
-from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
+from roboverse_learn.il.dp.models.diffusion.conditional_unet1d import ConditionalUnet1D
+from roboverse_learn.il.dp.models.diffusion.mask_generator import LowdimMaskGenerator
+from roboverse_learn.il.dp.models.vision.multi_image_obs_encoder import MultiImageObsEncoder
 from einops import rearrange, reduce
 from loguru import logger as log
 
-from roboverse_learn.il.utils.common.module_attr_mixin import ModuleAttrMixin
-from roboverse_learn.il.utils.common.normalizer import LinearNormalizer
-from roboverse_learn.il.utils.common.pytorch_util import dict_apply
-from diffusion_policy.policy.base_image_policy import BaseImagePolicy
+from roboverse_learn.il.utils.module_attr_mixin import ModuleAttrMixin
+from roboverse_learn.il.utils.normalizer import LinearNormalizer
+from roboverse_learn.il.utils.pytorch_util import dict_apply
+from roboverse_learn.il.base.base_image_policy import BaseImagePolicy
 
 
 class BaseImagePolicy(ModuleAttrMixin):
diff --git a/roboverse_learn/il/dp/models/ddpm_dit_image_policy.py b/roboverse_learn/il/dp/policies/ddpm_dit_image_policy.py
similarity index 89%
rename from roboverse_learn/il/dp/models/ddpm_dit_image_policy.py
rename to roboverse_learn/il/dp/policies/ddpm_dit_image_policy.py
index 07f777d3b..f533931b0 100644
--- a/roboverse_learn/il/dp/models/ddpm_dit_image_policy.py
+++ b/roboverse_learn/il/dp/policies/ddpm_dit_image_policy.py
@@ -3,11 +3,11 @@
 from typing import Any, Dict, Mapping, Optional
 
 from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
-from diffusion_policy.model.diffusion.flow_net import FlowTransformer
-from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
+from roboverse_learn.il.dp.models.diffusion.flow_net import FlowTransformer
+from roboverse_learn.il.dp.models.vision.multi_image_obs_encoder import MultiImageObsEncoder
 import torch
 
-from roboverse_learn.il.dp.models.ddpm_image_policy import DiffusionDenoisingImagePolicy
+from roboverse_learn.il.dp.policies.ddpm_image_policy import DiffusionDenoisingImagePolicy
 
 
 class DiffusionDiTImagePolicy(DiffusionDenoisingImagePolicy):
diff --git a/roboverse_learn/il/dp/models/ddpm_image_policy.py b/roboverse_learn/il/dp/policies/ddpm_image_policy.py
similarity index 96%
rename from roboverse_learn/il/dp/models/ddpm_image_policy.py
rename to roboverse_learn/il/dp/policies/ddpm_image_policy.py
index 159139f81..ccfdf5868 100644
--- a/roboverse_learn/il/dp/models/ddpm_image_policy.py
+++ b/roboverse_learn/il/dp/policies/ddpm_image_policy.py
@@ -5,13 +5,13 @@
 import torch
 import torch.nn.functional as F
 from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
-from diffusion_policy.model.diffusion.mask_generator import LowdimMaskGenerator
-from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
+from roboverse_learn.il.dp.models.diffusion.mask_generator import LowdimMaskGenerator
+from roboverse_learn.il.dp.models.vision.multi_image_obs_encoder import MultiImageObsEncoder
 from einops import reduce
 
-from roboverse_learn.il.utils.common.normalizer import LinearNormalizer
-from roboverse_learn.il.utils.common.pytorch_util import dict_apply
-from diffusion_policy.policy.base_image_policy import BaseImagePolicy
+from roboverse_learn.il.utils.normalizer import LinearNormalizer
+from roboverse_learn.il.utils.pytorch_util import dict_apply
+from roboverse_learn.il.base.base_image_policy import BaseImagePolicy
 
 
 class DiffusionDenoisingImagePolicy(BaseImagePolicy):
diff --git a/roboverse_learn/il/dp/models/ddpm_unet_image_policy.py b/roboverse_learn/il/dp/policies/ddpm_unet_image_policy.py
similarity index 88%
rename from roboverse_learn/il/dp/models/ddpm_unet_image_policy.py
rename to roboverse_learn/il/dp/policies/ddpm_unet_image_policy.py
index b09a5c1f7..e8e5ddb91 100644
--- a/roboverse_learn/il/dp/models/ddpm_unet_image_policy.py
+++ b/roboverse_learn/il/dp/policies/ddpm_unet_image_policy.py
@@ -3,11 +3,11 @@
 from typing import Any, Dict, Mapping, Optional, Sequence
 
 from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
-from diffusion_policy.model.diffusion.conditional_unet1d import ConditionalUnet1D
-from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
+from roboverse_learn.il.dp.models.diffusion.conditional_unet1d import ConditionalUnet1D
+from roboverse_learn.il.dp.models.vision.multi_image_obs_encoder import MultiImageObsEncoder
 import torch
 
-from roboverse_learn.il.dp.models.ddpm_image_policy import DiffusionDenoisingImagePolicy
+from roboverse_learn.il.dp.policies.ddpm_image_policy import DiffusionDenoisingImagePolicy
 
 
 class DiffusionUnetImagePolicy(DiffusionDenoisingImagePolicy):
diff --git a/roboverse_learn/il/dp/models/score_unet_image_policy.py b/roboverse_learn/il/dp/policies/score_unet_image_policy.py
similarity index 94%
rename from roboverse_learn/il/dp/models/score_unet_image_policy.py
rename to roboverse_learn/il/dp/policies/score_unet_image_policy.py
index a00ffa242..e6e787553 100644
--- a/roboverse_learn/il/dp/models/score_unet_image_policy.py
+++ b/roboverse_learn/il/dp/policies/score_unet_image_policy.py
@@ -4,15 +4,15 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
-from diffusion_policy.model.diffusion.conditional_unet1d import ConditionalUnet1D
-from diffusion_policy.model.diffusion.mask_generator import LowdimMaskGenerator
-from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
+from roboverse_learn.il.dp.models.diffusion.conditional_unet1d import ConditionalUnet1D
+from roboverse_learn.il.dp.models.diffusion.mask_generator import LowdimMaskGenerator
+from roboverse_learn.il.dp.models.vision.multi_image_obs_encoder import MultiImageObsEncoder
 from einops import rearrange, reduce
 from loguru import logger as log
 
-from roboverse_learn.il.utils.common.normalizer import LinearNormalizer
-from roboverse_learn.il.utils.common.pytorch_util import dict_apply
-from diffusion_policy.policy.base_image_policy import BaseImagePolicy
+from roboverse_learn.il.utils.normalizer import LinearNormalizer
+from roboverse_learn.il.utils.pytorch_util import dict_apply
+from roboverse_learn.il.base.base_image_policy import BaseImagePolicy
 
 
 class ScoreMatchingUnetImagePolicy(BaseImagePolicy):
diff --git a/roboverse_learn/il/dp/requirements.txt b/roboverse_learn/il/dp/requirements.txt
new file mode 100644
index 000000000..f881747b6
--- /dev/null
+++ b/roboverse_learn/il/dp/requirements.txt
@@ -0,0 +1,18 @@
+zarr==2.12.0
+ipdb
+gpustat
+omegaconf
+hydra-core==1.2.0
+dill==0.3.5.1
+einops==0.4.1
+diffusers
+numba
+moviepy
+imageio
+av
+matplotlib
+termcolor
+huggingface_hub
+pillow
+pandas
+wandb
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_memory_queue.py b/roboverse_learn/il/dp/shared_memory/shared_memory_queue.py
similarity index 97%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_memory_queue.py
rename to roboverse_learn/il/dp/shared_memory/shared_memory_queue.py
index b6d609f30..b9196c205 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_memory_queue.py
+++ b/roboverse_learn/il/dp/shared_memory/shared_memory_queue.py
@@ -4,11 +4,11 @@
 from typing import Dict, List, Union
 
 import numpy as np
-from diffusion_policy.shared_memory.shared_memory_util import (
+from roboverse_learn.il.dp.shared_memory.shared_memory_util import (
     ArraySpec,
     SharedAtomicCounter,
 )
-from diffusion_policy.shared_memory.shared_ndarray import SharedNDArray
+from roboverse_learn.il.dp.shared_memory.shared_ndarray import SharedNDArray
 
 
 class SharedMemoryQueue:
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_memory_ring_buffer.py b/roboverse_learn/il/dp/shared_memory/shared_memory_ring_buffer.py
similarity index 98%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_memory_ring_buffer.py
rename to roboverse_learn/il/dp/shared_memory/shared_memory_ring_buffer.py
index 09c9fc20f..5fceea8e9 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_memory_ring_buffer.py
+++ b/roboverse_learn/il/dp/shared_memory/shared_memory_ring_buffer.py
@@ -5,11 +5,11 @@
 from typing import Dict, List, Union
 
 import numpy as np
-from diffusion_policy.shared_memory.shared_memory_util import (
+from roboverse_learn.il.dp.shared_memory.shared_memory_util import (
     ArraySpec,
     SharedAtomicCounter,
 )
-from diffusion_policy.shared_memory.shared_ndarray import SharedNDArray
+from roboverse_learn.il.dp.shared_memory.shared_ndarray import SharedNDArray
 
 
 class SharedMemoryRingBuffer:
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_memory_util.py b/roboverse_learn/il/dp/shared_memory/shared_memory_util.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_memory_util.py
rename to roboverse_learn/il/dp/shared_memory/shared_memory_util.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_ndarray.py b/roboverse_learn/il/dp/shared_memory/shared_ndarray.py
similarity index 98%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_ndarray.py
rename to roboverse_learn/il/dp/shared_memory/shared_ndarray.py
index f741cbe19..be9266a1f 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/shared_memory/shared_ndarray.py
+++ b/roboverse_learn/il/dp/shared_memory/shared_ndarray.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 import numpy.typing as npt
-from diffusion_policy.common.nested_dict_util import nested_dict_check, nested_dict_map
+from roboverse_learn.il.utils.nested_dict_util import nested_dict_check, nested_dict_map
 
 SharedMemoryLike = Union[str, SharedMemory]  # shared memory or name of shared memory
 SharedT = TypeVar("SharedT", bound=np.generic)
diff --git a/roboverse_learn/il/eval_runner/__init__.py b/roboverse_learn/il/eval_runner/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/roboverse_learn/il/dp/eval_runner/dp_eval_runner.py b/roboverse_learn/il/eval_runner/dp_eval_runner.py
similarity index 95%
rename from roboverse_learn/il/dp/eval_runner/dp_eval_runner.py
rename to roboverse_learn/il/eval_runner/dp_eval_runner.py
index 36cf14250..8b71e6288 100644
--- a/roboverse_learn/il/dp/eval_runner/dp_eval_runner.py
+++ b/roboverse_learn/il/eval_runner/dp_eval_runner.py
@@ -4,9 +4,9 @@
 import hydra
 import numpy as np
 import torch
-from dp.runner.base_policy import DiffusionPolicyCfg
-from dp.base.base_eval_runner import BaseEvalRunner
-from dp.runner.dp_runner import DPRunner
+from roboverse_learn.il.runner.base_policy import DiffusionPolicyCfg
+from roboverse_learn.il.base.base_eval_runner import BaseEvalRunner
+from roboverse_learn.il.runner.dp_runner import DPRunner
 
 
 class DPEvalRunner(BaseEvalRunner):
diff --git a/roboverse_learn/il/fm/README.md b/roboverse_learn/il/fm/README.md
new file mode 100644
index 000000000..264b4bbe3
--- /dev/null
+++ b/roboverse_learn/il/fm/README.md
@@ -0,0 +1,36 @@
+# Flow Matching Policies (IL)
+
+Flow Matching variants (UNet and DiT) live here; they reuse the network and vision-encoder modules under `il/dp/models/` and the shared IL runners under `il/runner/`.
+
+## Install
+
+```bash
+cd roboverse_learn/il/fm
+pip install -r requirements.txt
+```
+
+Create a Weights & Biases account to obtain an API key for logging.
+
+## Collect and process data
+
+```bash
+./roboverse_learn/il/collect_demo.sh
+```
+
+## Train and eval
+
+Use the shared driver and point it at a Flow Matching model:
+
+```bash
+# Choose one: fm_dit_model (DiT backbone) or fm_unet_model (UNet backbone)
+export algo_model="fm_dit_model"
+
+./roboverse_learn/il/dp/dp_run.sh
+```
+
+Inside `dp_run.sh` you can toggle `train_enable` / `eval_enable`, set task names, seeds, GPU id, and checkpoint paths for evaluation.
+
+## References
+
+- Yaron Lipman et al., "Flow Matching for Generative Modeling." (2023).
+- William Peebles and Saining Xie, "Scalable Diffusion Models with Transformers" (DiT). (2023).
diff --git a/roboverse_learn/il/dp/models/fm_dit_image_policy.py b/roboverse_learn/il/fm/policies/fm_dit_image_policy.py
similarity index 95%
rename from roboverse_learn/il/dp/models/fm_dit_image_policy.py
rename to roboverse_learn/il/fm/policies/fm_dit_image_policy.py
index 555be1624..36325c582 100644
--- a/roboverse_learn/il/dp/models/fm_dit_image_policy.py
+++ b/roboverse_learn/il/fm/policies/fm_dit_image_policy.py
@@ -2,14 +2,14 @@
 
 import torch
 import torch.nn.functional as F
-from diffusion_policy.model.diffusion.flow_net import FlowTransformer
-from diffusion_policy.model.diffusion.mask_generator import LowdimMaskGenerator
-from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
 from einops import reduce
 
-from roboverse_learn.il.utils.common.normalizer import LinearNormalizer
-from roboverse_learn.il.utils.common.pytorch_util import dict_apply
-from diffusion_policy.policy.base_image_policy import BaseImagePolicy
+from roboverse_learn.il.dp.models.diffusion.flow_net import FlowTransformer
+from roboverse_learn.il.dp.models.diffusion.mask_generator import LowdimMaskGenerator
+from roboverse_learn.il.dp.models.vision.multi_image_obs_encoder import MultiImageObsEncoder
+from roboverse_learn.il.utils.normalizer import LinearNormalizer
+from roboverse_learn.il.utils.pytorch_util import dict_apply
+from roboverse_learn.il.base.base_image_policy import BaseImagePolicy
 
 
 class FlowMatchingDiTImagePolicy(BaseImagePolicy):
diff --git a/roboverse_learn/il/dp/models/fm_unet_image_policy.py b/roboverse_learn/il/fm/policies/fm_unet_image_policy.py
similarity index 95%
rename from roboverse_learn/il/dp/models/fm_unet_image_policy.py
rename to roboverse_learn/il/fm/policies/fm_unet_image_policy.py
index 7fff8d436..5134064dc 100644
--- a/roboverse_learn/il/dp/models/fm_unet_image_policy.py
+++ b/roboverse_learn/il/fm/policies/fm_unet_image_policy.py
@@ -2,14 +2,14 @@
 
 import torch
 import torch.nn.functional as F
-from diffusion_policy.model.diffusion.conditional_unet1d import ConditionalUnet1D
-from diffusion_policy.model.diffusion.mask_generator import LowdimMaskGenerator
-from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
 from einops import reduce
 
-from roboverse_learn.il.utils.common.normalizer import LinearNormalizer
-from roboverse_learn.il.utils.common.pytorch_util import dict_apply
-from diffusion_policy.policy.base_image_policy import BaseImagePolicy
+from roboverse_learn.il.dp.models.diffusion.conditional_unet1d import ConditionalUnet1D
+from roboverse_learn.il.dp.models.diffusion.mask_generator import LowdimMaskGenerator
+from roboverse_learn.il.dp.models.vision.multi_image_obs_encoder import MultiImageObsEncoder
+from roboverse_learn.il.utils.normalizer import LinearNormalizer
+from roboverse_learn.il.utils.pytorch_util import dict_apply
+from roboverse_learn.il.base.base_image_policy import BaseImagePolicy
 
 
 class FlowMatchingUnetImagePolicy(BaseImagePolicy):
diff --git a/roboverse_learn/il/fm/requirements.txt b/roboverse_learn/il/fm/requirements.txt
new file mode 100644
index 000000000..fb7e62adf
--- /dev/null
+++ b/roboverse_learn/il/fm/requirements.txt
@@ -0,0 +1,19 @@
+zarr==2.12.0
+ipdb
+gpustat
+omegaconf
+hydra-core==1.2.0
+dill==0.3.5.1
+einops==0.4.1
+diffusers
+numba
+moviepy
+imageio
+av
+matplotlib
+termcolor
+huggingface_hub
+pillow
+pandas
+wandb
+torchcfm
diff --git a/roboverse_learn/il/il_run.sh b/roboverse_learn/il/il_run.sh
index 475e7b772..76e4eb8d8 100644
--- a/roboverse_learn/il/il_run.sh
+++ b/roboverse_learn/il/il_run.sh
@@ -1,12 +1,27 @@
 #!/bin/bash
-# Try： bash roboverse_learn/il/il_run.sh --task_name_set close_box --algo_choose dp_DDPM --demo_num 100 --sim_set mujoco
+# Usage: bash roboverse_learn/il/il_run.sh --task_name_set close_box --algo_choose ddpm_dit --demo_num 100 --sim_set mujoco
 
-task_name_set="close_box" # Tasks, opts: close_box, stack_cube pick_cube
-algo_choose="dp_DDPM"     # IL algorithm, opts: act, dp_DDPM, dp_DDIM, dp_FM_UNet, dp_FM_DiT, dp_Score, dp_VITA
-sim_set="mujoco"          # Simulator, opts: mujoco, isaacsim
-demo_num=100              # Number of demonstration to collect, train, and eval
+task_name_set="close_box" # Tasks, e.g., close_box, stack_cube, pick_cube
+algo_choose="ddpm_dit"    # IL algorithm, opts: ddpm_unet, ddpm_dit, ddim_unet, fm_unet, fm_dit, vita, act, score
+sim_set="mujoco"          # Simulator, e.g., mujoco, isaacsim
+demo_num=90              # Number of demonstrations to collect, train, and eval
 
-# parse parameters
+# Training/eval control
+train_enable=True
+eval_enable=False
+
+# Training parameters
+level=0
+num_epochs=100
+seed=42
+gpu=0
+obs_space=joint_pos
+act_space=joint_pos
+delta_ee=0
+eval_num_envs=1
+eval_max_step=300
+
+# Parse parameters
 while [[ $# -gt 0 ]]; do
     case "$1" in
         --task_name_set)
@@ -25,14 +40,31 @@ while [[ $# -gt 0 ]]; do
             demo_num="$2"
             shift 2
             ;;
+        --train_enable)
+            train_enable="$2"
+            shift 2
+            ;;
+        --eval_enable)
+            eval_enable="$2"
+            shift 2
+            ;;
+        --num_epochs)
+            num_epochs="$2"
+            shift 2
+            ;;
+        --gpu)
+            gpu="$2"
+            shift 2
+            ;;
         *)
-            echo "Unknown parameter: $1，optional parameter：--task_name_set --algo_choose --sim_set --demo_num"
+            echo "Unknown parameter: $1"
+            echo "Optional parameters: --task_name_set --algo_choose --sim_set --demo_num --train_enable --eval_enable --num_epochs --gpu"
             exit 1
             ;;
     esac
 done
 
-# 1. collect_demo
+# Collect demo
 echo "=== Running collect_demo.sh ==="
 sed -i "s/^task_name_set=.*/task_name_set=$task_name_set/" ./roboverse_learn/il/collect_demo.sh
 sed -i "s/^sim_set=.*/sim_set=$sim_set/" ./roboverse_learn/il/collect_demo.sh
@@ -40,73 +72,97 @@ sed -i "s/^num_demo_success=.*/num_demo_success=$demo_num/" ./roboverse_learn/il
 sed -i "s/^expert_data_num=.*/expert_data_num=$demo_num/" ./roboverse_learn/il/collect_demo.sh
 bash ./roboverse_learn/il/collect_demo.sh
 
-# 2. il algorithm
+# Map algo_choose to model config
 case "$algo_choose" in
-    "dp_DDPM")
-        echo "=== Running dp_run.sh ==="
-        sed -i "s/^task_name_set=.*/task_name_set=$task_name_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^sim_set=.*/sim_set=$sim_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^expert_data_num=.*/expert_data_num=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^eval_ckpt_name=.*/eval_ckpt_name=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^algo_choose=.*/algo_choose=0/" ./roboverse_learn/il/dp/dp_run.sh
-        bash ./roboverse_learn/il/dp/dp_run.sh
+    "ddpm_unet")
+        algo_model="ddpm_unet_model"
+        config_name="dp_runner"
+        main_script="./roboverse_learn/il/train.py"
+        output_dir="DP"
         ;;
-    "dp_DDIM")
-        echo "=== Running dp_run.sh ==="
-        sed -i "s/^task_name_set=.*/task_name_set=$task_name_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^sim_set=.*/sim_set=$sim_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^expert_data_num=.*/expert_data_num=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^eval_ckpt_name=.*/eval_ckpt_name=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^algo_choose=.*/algo_choose=1/" ./roboverse_learn/il/dp/dp_run.sh
-        bash ./roboverse_learn/il/dp/dp_run.sh
+    "ddpm_dit")
+        algo_model="ddpm_dit_model"
+        config_name="dp_runner"
+        main_script="./roboverse_learn/il/train.py"
+        output_dir="DP"
         ;;
-    "dp_FM_UNet")
-        echo "=== Running dp_run.sh ==="
-        sed -i "s/^task_name_set=.*/task_name_set=$task_name_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^sim_set=.*/sim_set=$sim_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^expert_data_num=.*/expert_data_num=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^eval_ckpt_name=.*/eval_ckpt_name=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^algo_choose=.*/algo_choose=2/" ./roboverse_learn/il/dp/dp_run.sh
-        bash ./roboverse_learn/il/dp/dp_run.sh
+    "ddim_unet")
+        algo_model="ddim_unet_model"
+        config_name="dp_runner"
+        main_script="./roboverse_learn/il/train.py"
+        output_dir="DP"
         ;;
-    "dp_FM_DiT")
-        echo "=== Running dp_run.sh ==="
-        sed -i "s/^task_name_set=.*/task_name_set=$task_name_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^sim_set=.*/sim_set=$sim_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^expert_data_num=.*/expert_data_num=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^eval_ckpt_name=.*/eval_ckpt_name=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^algo_choose=.*/algo_choose=3/" ./roboverse_learn/il/dp/dp_run.sh
-        bash ./roboverse_learn/il/dp/dp_run.sh
+    "fm_unet")
+        algo_model="fm_unet_model"
+        config_name="dp_runner"
+        main_script="./roboverse_learn/il/train.py"
+        output_dir="FM"
         ;;
-    "dp_Score")
-        echo "=== Running dp_run.sh ==="
-        sed -i "s/^task_name_set=.*/task_name_set=$task_name_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^sim_set=.*/sim_set=$sim_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^expert_data_num=.*/expert_data_num=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^eval_ckpt_name=.*/eval_ckpt_name=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^algo_choose=.*/algo_choose=4/" ./roboverse_learn/il/dp/dp_run.sh
-        bash ./roboverse_learn/il/dp/dp_run.sh
+    "fm_dit")
+        algo_model="fm_dit_model"
+        config_name="dp_runner"
+        main_script="./roboverse_learn/il/train.py"
+        output_dir="FM"
         ;;
-    "dp_VITA")
-        echo "=== Running dp_run.sh ==="
-        sed -i "s/^task_name_set=.*/task_name_set=$task_name_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^sim_set=.*/sim_set=$sim_set/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^expert_data_num=.*/expert_data_num=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^eval_ckpt_name=.*/eval_ckpt_name=$demo_num/" ./roboverse_learn/il/dp/dp_run.sh
-        sed -i "s/^algo_choose=.*/algo_choose=5/" ./roboverse_learn/il/dp/dp_run.sh
-        bash ./roboverse_learn/il/dp/dp_run.sh
+    "score")
+        algo_model="score_model"
+        config_name="dp_runner"
+        main_script="./roboverse_learn/il/train.py"
+        output_dir="DP"
+        ;;
+    "vita")
+        algo_model="vita_model"
+        config_name="dp_runner"
+        main_script="./roboverse_learn/il/train.py"
+        output_dir="VITA"
         ;;
     "act")
-        echo "=== Running act_run.sh ==="
+        echo "=== Running ACT training ==="
         sed -i "s/^task_name_set=.*/task_name_set=$task_name_set/" ./roboverse_learn/il/act/act_run.sh
         sed -i "s/^sim_set=.*/sim_set=$sim_set/" ./roboverse_learn/il/act/act_run.sh
         sed -i "s/^expert_data_num=.*/expert_data_num=$demo_num/" ./roboverse_learn/il/act/act_run.sh
         bash ./roboverse_learn/il/act/act_run.sh
+        echo "=== Completed all data collection, training, and evaluation ==="
+        exit 0
         ;;
     *)
-        echo "Unavailable chose: $algo_choose, optional options: act, dp_DDPM, dp_DDIM, dp_FM_UNet, dp_FM_DiT, dp_Score, dp_VITA"
+        echo "Unsupported algorithm: $algo_choose"
+        echo "Available options: act, ddpm_unet, ddpm_dit, ddim_unet, fm_unet, fm_dit, score, vita"
         exit 1
         ;;
 esac
 
+# Run training/evaluation for DP/FM/VITA policies
+echo "=== Running ${algo_choose} (${algo_model}) ==="
+echo "Selected model: $algo_model"
+
+eval_ckpt_name=$demo_num
+eval_path="./info/outputs/${output_dir}/${task_name_set}/checkpoints/${eval_ckpt_name}.ckpt"
+
+echo "Checkpoint path: $eval_path"
+
+extra="obs:${obs_space}_act:${act_space}"
+if [ "${delta_ee}" = 1 ]; then
+  extra="${extra}_delta"
+fi
+
+export algo_model
+python ${main_script} --config-name=${config_name}.yaml \
+task_name=${task_name_set} \
+"dataset_config.zarr_path=./data_policy/${task_name_set}FrankaL${level}_${extra}_${demo_num}.zarr" \
+train_config.training_params.seed=${seed} \
+train_config.training_params.num_epochs=${num_epochs} \
+train_config.training_params.device=${gpu} \
+eval_config.policy_runner.obs.obs_type=${obs_space} \
+eval_config.policy_runner.action.action_type=${act_space} \
+eval_config.policy_runner.action.delta=${delta_ee} \
+eval_config.eval_args.task=${task_name_set} \
+eval_config.eval_args.max_step=${eval_max_step} \
+eval_config.eval_args.num_envs=${eval_num_envs} \
+eval_config.eval_args.sim=${sim_set} \
++eval_config.eval_args.max_demo=${demo_num} \
+train_enable=${train_enable} \
+eval_enable=${eval_enable} \
+eval_path=${eval_path}
+
 echo "=== Completed all data collection, training, and evaluation ==="
diff --git a/roboverse_learn/il/il_setup.sh b/roboverse_learn/il/il_setup.sh
index 098ddc73d..f87f9aaa8 100644
--- a/roboverse_learn/il/il_setup.sh
+++ b/roboverse_learn/il/il_setup.sh
@@ -1,27 +1,10 @@
 #!/bin/bash
 
-# Install diffusion_policy
-echo "Install diffusion_policy..."
-cd ./roboverse_learn/il/utils/diffusion_policy || { echo "diffusion_policy do not exit"; exit 1; }
-pip install -e .
-
-# Install act
-echo "Install act..."
-cd ../../../../
-cd roboverse_learn/il/act/detr || { echo "detr do not exit"; exit 1; }
-pip install -e .
-
 # Install additional dependencies
 echo "Install additional dependencies..."
-cd ../../../../../
-pip install pandas wandb
-
 # Fix .zarr issue
 pip install zarr==2.16.1 blosc==1.11.1
 pip install numcodecs==0.11.0
 
 # Fix hydra issue
 pip install --upgrade hydra-core
-
-# dp-VITA additional dependency
-pip install torchcfm
diff --git a/roboverse_learn/il/runner/__init__.py b/roboverse_learn/il/runner/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/roboverse_learn/il/dp/runner/base_policy.py b/roboverse_learn/il/runner/base_policy.py
similarity index 100%
rename from roboverse_learn/il/dp/runner/base_policy.py
rename to roboverse_learn/il/runner/base_policy.py
diff --git a/roboverse_learn/il/dp/runner/dp_runner.py b/roboverse_learn/il/runner/dp_runner.py
similarity index 98%
rename from roboverse_learn/il/dp/runner/dp_runner.py
rename to roboverse_learn/il/runner/dp_runner.py
index 68a4f0b84..b1997caff 100644
--- a/roboverse_learn/il/dp/runner/dp_runner.py
+++ b/roboverse_learn/il/runner/dp_runner.py
@@ -14,20 +14,20 @@
 import torch
 import tqdm
 import wandb
-from diffusion_policy.model.diffusion.ema_model import EMAModel
+from roboverse_learn.il.dp.models.diffusion.ema_model import EMAModel
 from loguru import logger as log
 from metasim.scenario.scenario import ScenarioCfg
 from metasim.scenario.cameras import PinholeCameraCfg
 from metasim.constants import SimType
 from metasim.utils.demo_util import get_traj
 from metasim.utils.setup_util import get_robot
-from dp.base.base_eval_runner import BaseEvalRunner
-from dp.base.base_runner import BaseRunner
-from roboverse_learn.il.utils.common.eval_args import Args
-from roboverse_learn.il.utils.common.eval_runner_getter import get_runner
-from roboverse_learn.il.utils.common.json_logger import JsonLogger
-from roboverse_learn.il.utils.common.lr_scheduler import get_scheduler
-from roboverse_learn.il.utils.common.pytorch_util import dict_apply, optimizer_to
+from roboverse_learn.il.base.base_eval_runner import BaseEvalRunner
+from roboverse_learn.il.base.base_runner import BaseRunner
+from roboverse_learn.il.utils.eval_args import Args
+from roboverse_learn.il.utils.eval_runner_getter import get_runner
+from roboverse_learn.il.utils.json_logger import JsonLogger
+from roboverse_learn.il.utils.lr_scheduler import get_scheduler
+from roboverse_learn.il.utils.pytorch_util import dict_apply, optimizer_to
 from torch.utils.data import DataLoader
 
 from metasim.task.registry import get_task_class
diff --git a/roboverse_learn/il/dp/main.py b/roboverse_learn/il/train.py
similarity index 91%
rename from roboverse_learn/il/dp/main.py
rename to roboverse_learn/il/train.py
index b1adf2706..6306cc9e5 100644
--- a/roboverse_learn/il/dp/main.py
+++ b/roboverse_learn/il/train.py
@@ -8,7 +8,7 @@
 here = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(here)
 sys.path.insert(0, project_root)
-from dp.base.base_runner import BaseRunner
+from roboverse_learn.il.base.base_runner import BaseRunner
 
 abs_config_path = str(pathlib.Path(__file__).resolve().parent.joinpath("configs").absolute())
 OmegaConf.register_new_resolver("eval", eval, replace=True)
diff --git a/roboverse_learn/il/utils/common/checkpoint_util.py b/roboverse_learn/il/utils/checkpoint_util.py
similarity index 100%
rename from roboverse_learn/il/utils/common/checkpoint_util.py
rename to roboverse_learn/il/utils/checkpoint_util.py
diff --git a/roboverse_learn/il/utils/common/normalize_util.py b/roboverse_learn/il/utils/common/normalize_util.py
deleted file mode 100644
index ba8e32097..000000000
--- a/roboverse_learn/il/utils/common/normalize_util.py
+++ /dev/null
@@ -1,198 +0,0 @@
-import numpy as np
-from diffusion_policy.model.common.normalizer import SingleFieldLinearNormalizer
-
-
-from roboverse_learn.il.utils.common.pytorch_util import (
-    dict_apply_reduce,
-    dict_apply_split,
-)
-
-
-def get_range_normalizer_from_stat(stat, output_max=1, output_min=-1, range_eps=1e-7):
-    # -1, 1 normalization
-    input_max = stat["max"]
-    input_min = stat["min"]
-    input_range = input_max - input_min
-    ignore_dim = input_range < range_eps
-    input_range[ignore_dim] = output_max - output_min
-    scale = (output_max - output_min) / input_range
-    offset = output_min - scale * input_min
-    offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim]
-
-    return SingleFieldLinearNormalizer.create_manual(scale=scale, offset=offset, input_stats_dict=stat)
-
-
-def get_image_range_normalizer():
-    scale = np.array([2], dtype=np.float32)
-    offset = np.array([-1], dtype=np.float32)
-    stat = {
-        "min": np.array([0], dtype=np.float32),
-        "max": np.array([1], dtype=np.float32),
-        "mean": np.array([0.5], dtype=np.float32),
-        "std": np.array([np.sqrt(1 / 12)], dtype=np.float32),
-    }
-    return SingleFieldLinearNormalizer.create_manual(scale=scale, offset=offset, input_stats_dict=stat)
-
-
-def get_identity_normalizer_from_stat(stat):
-    scale = np.ones_like(stat["min"])
-    offset = np.zeros_like(stat["min"])
-    return SingleFieldLinearNormalizer.create_manual(scale=scale, offset=offset, input_stats_dict=stat)
-
-
-def robomimic_abs_action_normalizer_from_stat(stat, rotation_transformer):
-    result = dict_apply_split(stat, lambda x: {"pos": x[..., :3], "rot": x[..., 3:6], "gripper": x[..., 6:]})
-
-    def get_pos_param_info(stat, output_max=1, output_min=-1, range_eps=1e-7):
-        # -1, 1 normalization
-        input_max = stat["max"]
-        input_min = stat["min"]
-        input_range = input_max - input_min
-        ignore_dim = input_range < range_eps
-        input_range[ignore_dim] = output_max - output_min
-        scale = (output_max - output_min) / input_range
-        offset = output_min - scale * input_min
-        offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim]
-
-        return {"scale": scale, "offset": offset}, stat
-
-    def get_rot_param_info(stat):
-        example = rotation_transformer.forward(stat["mean"])
-        scale = np.ones_like(example)
-        offset = np.zeros_like(example)
-        info = {
-            "max": np.ones_like(example),
-            "min": np.full_like(example, -1),
-            "mean": np.zeros_like(example),
-            "std": np.ones_like(example),
-        }
-        return {"scale": scale, "offset": offset}, info
-
-    def get_gripper_param_info(stat):
-        example = stat["max"]
-        scale = np.ones_like(example)
-        offset = np.zeros_like(example)
-        info = {
-            "max": np.ones_like(example),
-            "min": np.full_like(example, -1),
-            "mean": np.zeros_like(example),
-            "std": np.ones_like(example),
-        }
-        return {"scale": scale, "offset": offset}, info
-
-    pos_param, pos_info = get_pos_param_info(result["pos"])
-    rot_param, rot_info = get_rot_param_info(result["rot"])
-    gripper_param, gripper_info = get_gripper_param_info(result["gripper"])
-
-    param = dict_apply_reduce([pos_param, rot_param, gripper_param], lambda x: np.concatenate(x, axis=-1))
-    info = dict_apply_reduce([pos_info, rot_info, gripper_info], lambda x: np.concatenate(x, axis=-1))
-
-    return SingleFieldLinearNormalizer.create_manual(
-        scale=param["scale"], offset=param["offset"], input_stats_dict=info
-    )
-
-
-def robomimic_abs_action_only_normalizer_from_stat(stat):
-    result = dict_apply_split(stat, lambda x: {"pos": x[..., :3], "other": x[..., 3:]})
-
-    def get_pos_param_info(stat, output_max=1, output_min=-1, range_eps=1e-7):
-        # -1, 1 normalization
-        input_max = stat["max"]
-        input_min = stat["min"]
-        input_range = input_max - input_min
-        ignore_dim = input_range < range_eps
-        input_range[ignore_dim] = output_max - output_min
-        scale = (output_max - output_min) / input_range
-        offset = output_min - scale * input_min
-        offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim]
-
-        return {"scale": scale, "offset": offset}, stat
-
-    def get_other_param_info(stat):
-        example = stat["max"]
-        scale = np.ones_like(example)
-        offset = np.zeros_like(example)
-        info = {
-            "max": np.ones_like(example),
-            "min": np.full_like(example, -1),
-            "mean": np.zeros_like(example),
-            "std": np.ones_like(example),
-        }
-        return {"scale": scale, "offset": offset}, info
-
-    pos_param, pos_info = get_pos_param_info(result["pos"])
-    other_param, other_info = get_other_param_info(result["other"])
-
-    param = dict_apply_reduce([pos_param, other_param], lambda x: np.concatenate(x, axis=-1))
-    info = dict_apply_reduce([pos_info, other_info], lambda x: np.concatenate(x, axis=-1))
-
-    return SingleFieldLinearNormalizer.create_manual(
-        scale=param["scale"], offset=param["offset"], input_stats_dict=info
-    )
-
-
-def robomimic_abs_action_only_dual_arm_normalizer_from_stat(stat):
-    Da = stat["max"].shape[-1]
-    Dah = Da // 2
-    result = dict_apply_split(
-        stat,
-        lambda x: {
-            "pos0": x[..., :3],
-            "other0": x[..., 3:Dah],
-            "pos1": x[..., Dah : Dah + 3],
-            "other1": x[..., Dah + 3 :],
-        },
-    )
-
-    def get_pos_param_info(stat, output_max=1, output_min=-1, range_eps=1e-7):
-        # -1, 1 normalization
-        input_max = stat["max"]
-        input_min = stat["min"]
-        input_range = input_max - input_min
-        ignore_dim = input_range < range_eps
-        input_range[ignore_dim] = output_max - output_min
-        scale = (output_max - output_min) / input_range
-        offset = output_min - scale * input_min
-        offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim]
-
-        return {"scale": scale, "offset": offset}, stat
-
-    def get_other_param_info(stat):
-        example = stat["max"]
-        scale = np.ones_like(example)
-        offset = np.zeros_like(example)
-        info = {
-            "max": np.ones_like(example),
-            "min": np.full_like(example, -1),
-            "mean": np.zeros_like(example),
-            "std": np.ones_like(example),
-        }
-        return {"scale": scale, "offset": offset}, info
-
-    pos0_param, pos0_info = get_pos_param_info(result["pos0"])
-    pos1_param, pos1_info = get_pos_param_info(result["pos1"])
-    other0_param, other0_info = get_other_param_info(result["other0"])
-    other1_param, other1_info = get_other_param_info(result["other1"])
-
-    param = dict_apply_reduce(
-        [pos0_param, other0_param, pos1_param, other1_param],
-        lambda x: np.concatenate(x, axis=-1),
-    )
-    info = dict_apply_reduce(
-        [pos0_info, other0_info, pos1_info, other1_info],
-        lambda x: np.concatenate(x, axis=-1),
-    )
-
-    return SingleFieldLinearNormalizer.create_manual(
-        scale=param["scale"], offset=param["offset"], input_stats_dict=info
-    )
-
-
-def array_to_stats(arr: np.ndarray):
-    stat = {
-        "min": np.min(arr, axis=0),
-        "max": np.max(arr, axis=0),
-        "mean": np.mean(arr, axis=0),
-        "std": np.std(arr, axis=0),
-    }
-    return stat
diff --git a/roboverse_learn/il/utils/common/normalizer.py b/roboverse_learn/il/utils/common/normalizer.py
deleted file mode 100644
index 62f45b4ef..000000000
--- a/roboverse_learn/il/utils/common/normalizer.py
+++ /dev/null
@@ -1,368 +0,0 @@
-import unittest
-from typing import Dict, Union
-
-import numpy as np
-import torch
-import torch.nn as nn
-import zarr
-from roboverse_learn.il.utils.common.pytorch_util import dict_apply
-from roboverse_learn.il.utils.common.dict_of_tensor_mixin import DictOfTensorMixin
-
-
-class LinearNormalizer(DictOfTensorMixin):
-    avaliable_modes = ["limits", "gaussian"]
-
-    @torch.no_grad()
-    def fit(
-        self,
-        data: Union[Dict, torch.Tensor, np.ndarray, zarr.Array],
-        last_n_dims=1,
-        dtype=torch.float32,
-        mode="limits",
-        output_max=1.0,
-        output_min=-1.0,
-        range_eps=1e-4,
-        fit_offset=True,
-    ):
-        if isinstance(data, dict):
-            for key, value in data.items():
-                self.params_dict[key] = _fit(
-                    value,
-                    last_n_dims=last_n_dims,
-                    dtype=dtype,
-                    mode=mode,
-                    output_max=output_max,
-                    output_min=output_min,
-                    range_eps=range_eps,
-                    fit_offset=fit_offset,
-                )
-        else:
-            self.params_dict["_default"] = _fit(
-                data,
-                last_n_dims=last_n_dims,
-                dtype=dtype,
-                mode=mode,
-                output_max=output_max,
-                output_min=output_min,
-                range_eps=range_eps,
-                fit_offset=fit_offset,
-            )
-
-    def __call__(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return self.normalize(x)
-
-    def __getitem__(self, key: str):
-        return SingleFieldLinearNormalizer(self.params_dict[key])
-
-    def __setitem__(self, key: str, value: "SingleFieldLinearNormalizer"):
-        self.params_dict[key] = value.params_dict
-
-    def _normalize_impl(self, x, forward=True):
-        if isinstance(x, dict):
-            result = dict()
-            for key, value in x.items():
-                params = self.params_dict[key]
-                try:
-                    result[key] = _normalize(value, params, forward=forward)
-                except:
-                    import pdb
-
-                    pdb.set_trace()
-            return result
-        else:
-            if "_default" not in self.params_dict:
-                raise RuntimeError("Not initialized")
-            params = self.params_dict["_default"]
-            return _normalize(x, params, forward=forward)
-
-    def normalize(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return self._normalize_impl(x, forward=True)
-
-    def unnormalize(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return self._normalize_impl(x, forward=False)
-
-    def get_input_stats(self) -> Dict:
-        if len(self.params_dict) == 0:
-            raise RuntimeError("Not initialized")
-        if len(self.params_dict) == 1 and "_default" in self.params_dict:
-            return self.params_dict["_default"]["input_stats"]
-
-        result = dict()
-        for key, value in self.params_dict.items():
-            if key != "_default":
-                result[key] = value["input_stats"]
-        return result
-
-    def get_output_stats(self, key="_default"):
-        input_stats = self.get_input_stats()
-        if "min" in input_stats:
-            # no dict
-            return dict_apply(input_stats, self.normalize)
-
-        result = dict()
-        for key, group in input_stats.items():
-            this_dict = dict()
-            for name, value in group.items():
-                this_dict[name] = self.normalize({key: value})[key]
-            result[key] = this_dict
-        return result
-
-
-class SingleFieldLinearNormalizer(DictOfTensorMixin):
-    avaliable_modes = ["limits", "gaussian"]
-
-    @torch.no_grad()
-    def fit(
-        self,
-        data: Union[torch.Tensor, np.ndarray, zarr.Array],
-        last_n_dims=1,
-        dtype=torch.float32,
-        mode="limits",
-        output_max=1.0,
-        output_min=-1.0,
-        range_eps=1e-4,
-        fit_offset=True,
-    ):
-        self.params_dict = _fit(
-            data,
-            last_n_dims=last_n_dims,
-            dtype=dtype,
-            mode=mode,
-            output_max=output_max,
-            output_min=output_min,
-            range_eps=range_eps,
-            fit_offset=fit_offset,
-        )
-
-    @classmethod
-    def create_fit(cls, data: Union[torch.Tensor, np.ndarray, zarr.Array], **kwargs):
-        obj = cls()
-        obj.fit(data, **kwargs)
-        return obj
-
-    @classmethod
-    def create_manual(
-        cls,
-        scale: Union[torch.Tensor, np.ndarray],
-        offset: Union[torch.Tensor, np.ndarray],
-        input_stats_dict: Dict[str, Union[torch.Tensor, np.ndarray]],
-    ):
-        def to_tensor(x):
-            if not isinstance(x, torch.Tensor):
-                x = torch.from_numpy(x)
-            x = x.flatten()
-            return x
-
-        # check
-        for x in [offset] + list(input_stats_dict.values()):
-            assert x.shape == scale.shape
-            assert x.dtype == scale.dtype
-
-        params_dict = nn.ParameterDict({
-            "scale": to_tensor(scale),
-            "offset": to_tensor(offset),
-            "input_stats": nn.ParameterDict(dict_apply(input_stats_dict, to_tensor)),
-        })
-        return cls(params_dict)
-
-    @classmethod
-    def create_identity(cls, dtype=torch.float32):
-        scale = torch.tensor([1], dtype=dtype)
-        offset = torch.tensor([0], dtype=dtype)
-        input_stats_dict = {
-            "min": torch.tensor([-1], dtype=dtype),
-            "max": torch.tensor([1], dtype=dtype),
-            "mean": torch.tensor([0], dtype=dtype),
-            "std": torch.tensor([1], dtype=dtype),
-        }
-        return cls.create_manual(scale, offset, input_stats_dict)
-
-    def normalize(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return _normalize(x, self.params_dict, forward=True)
-
-    def unnormalize(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return _normalize(x, self.params_dict, forward=False)
-
-    def get_input_stats(self):
-        return self.params_dict["input_stats"]
-
-    def get_output_stats(self):
-        return dict_apply(self.params_dict["input_stats"], self.normalize)
-
-    def __call__(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
-        return self.normalize(x)
-
-
-def _fit(
-    data: Union[torch.Tensor, np.ndarray, zarr.Array],
-    last_n_dims=1,
-    dtype=torch.float32,
-    mode="limits",
-    output_max=1.0,
-    output_min=-1.0,
-    range_eps=1e-4,
-    fit_offset=True,
-):
-    assert mode in ["limits", "gaussian"]
-    assert last_n_dims >= 0
-    assert output_max > output_min
-
-    # convert data to torch and type
-    if isinstance(data, zarr.Array):
-        data = data[:]
-    if isinstance(data, np.ndarray):
-        data = torch.from_numpy(data)
-    if dtype is not None:
-        data = data.type(dtype)
-
-    # convert shape
-    dim = 1
-    if last_n_dims > 0:
-        dim = np.prod(data.shape[-last_n_dims:])
-    data = data.reshape(-1, dim)
-
-    # compute input stats min max mean std
-    input_min, _ = data.min(axis=0)
-    input_max, _ = data.max(axis=0)
-    input_mean = data.mean(axis=0)
-    input_std = data.std(axis=0)
-
-    # compute scale and offset
-    if mode == "limits":
-        if fit_offset:
-            # unit scale
-            input_range = input_max - input_min
-            ignore_dim = input_range < range_eps
-            input_range[ignore_dim] = output_max - output_min
-            scale = (output_max - output_min) / input_range
-            offset = output_min - scale * input_min
-            offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim]
-            # ignore dims scaled to mean of output max and min
-        else:
-            # use this when data is pre-zero-centered.
-            assert output_max > 0
-            assert output_min < 0
-            # unit abs
-            output_abs = min(abs(output_min), abs(output_max))
-            input_abs = torch.maximum(torch.abs(input_min), torch.abs(input_max))
-            ignore_dim = input_abs < range_eps
-            input_abs[ignore_dim] = output_abs
-            # don't scale constant channels
-            scale = output_abs / input_abs
-            offset = torch.zeros_like(input_mean)
-    elif mode == "gaussian":
-        ignore_dim = input_std < range_eps
-        scale = input_std.clone()
-        scale[ignore_dim] = 1
-        scale = 1 / scale
-
-        if fit_offset:
-            offset = -input_mean * scale
-        else:
-            offset = torch.zeros_like(input_mean)
-
-    # save
-    this_params = nn.ParameterDict({
-        "scale": scale,
-        "offset": offset,
-        "input_stats": nn.ParameterDict({
-            "min": input_min,
-            "max": input_max,
-            "mean": input_mean,
-            "std": input_std,
-        }),
-    })
-    for p in this_params.parameters():
-        p.requires_grad_(False)
-    return this_params
-
-
-def _normalize(x, params, forward=True):
-    assert "scale" in params
-    if isinstance(x, np.ndarray):
-        x = torch.from_numpy(x)
-    scale = params["scale"]
-    offset = params["offset"]
-    x = x.to(device=scale.device, dtype=scale.dtype)
-    src_shape = x.shape
-    x = x.reshape(-1, scale.shape[0])
-    if forward:
-        x = x * scale + offset
-    else:
-        x = (x - offset) / scale
-    x = x.reshape(src_shape)
-    return x
-
-
-def test():
-    data = torch.zeros((100, 10, 9, 2)).uniform_()
-    data[..., 0, 0] = 0
-
-    normalizer = SingleFieldLinearNormalizer()
-    normalizer.fit(data, mode="limits", last_n_dims=2)
-    datan = normalizer.normalize(data)
-    assert datan.shape == data.shape
-    assert np.allclose(datan.max(), 1.0)
-    assert np.allclose(datan.min(), -1.0)
-    dataun = normalizer.unnormalize(datan)
-    assert torch.allclose(data, dataun, atol=1e-7)
-
-    input_stats = normalizer.get_input_stats()
-    output_stats = normalizer.get_output_stats()
-
-    normalizer = SingleFieldLinearNormalizer()
-    normalizer.fit(data, mode="limits", last_n_dims=1, fit_offset=False)
-    datan = normalizer.normalize(data)
-    assert datan.shape == data.shape
-    assert np.allclose(datan.max(), 1.0, atol=1e-3)
-    assert np.allclose(datan.min(), 0.0, atol=1e-3)
-    dataun = normalizer.unnormalize(datan)
-    assert torch.allclose(data, dataun, atol=1e-7)
-
-    data = torch.zeros((100, 10, 9, 2)).uniform_()
-    normalizer = SingleFieldLinearNormalizer()
-    normalizer.fit(data, mode="gaussian", last_n_dims=0)
-    datan = normalizer.normalize(data)
-    assert datan.shape == data.shape
-    assert np.allclose(datan.mean(), 0.0, atol=1e-3)
-    assert np.allclose(datan.std(), 1.0, atol=1e-3)
-    dataun = normalizer.unnormalize(datan)
-    assert torch.allclose(data, dataun, atol=1e-7)
-
-    # dict
-    data = torch.zeros((100, 10, 9, 2)).uniform_()
-    data[..., 0, 0] = 0
-
-    normalizer = LinearNormalizer()
-    normalizer.fit(data, mode="limits", last_n_dims=2)
-    datan = normalizer.normalize(data)
-    assert datan.shape == data.shape
-    assert np.allclose(datan.max(), 1.0)
-    assert np.allclose(datan.min(), -1.0)
-    dataun = normalizer.unnormalize(datan)
-    assert torch.allclose(data, dataun, atol=1e-7)
-
-    input_stats = normalizer.get_input_stats()
-    output_stats = normalizer.get_output_stats()
-
-    data = {
-        "obs": torch.zeros((1000, 128, 9, 2)).uniform_() * 512,
-        "action": torch.zeros((1000, 128, 2)).uniform_() * 512,
-    }
-    normalizer = LinearNormalizer()
-    normalizer.fit(data)
-    datan = normalizer.normalize(data)
-    dataun = normalizer.unnormalize(datan)
-    for key in data:
-        assert torch.allclose(data[key], dataun[key], atol=1e-4)
-
-    input_stats = normalizer.get_input_stats()
-    output_stats = normalizer.get_output_stats()
-
-    state_dict = normalizer.state_dict()
-    n = LinearNormalizer()
-    n.load_state_dict(state_dict)
-    datan = n.normalize(data)
-    dataun = n.unnormalize(datan)
-    for key in data:
-        assert torch.allclose(data[key], dataun[key], atol=1e-4)
diff --git a/roboverse_learn/il/utils/common/replay_buffer.py b/roboverse_learn/il/utils/common/replay_buffer.py
deleted file mode 100644
index c832cd578..000000000
--- a/roboverse_learn/il/utils/common/replay_buffer.py
+++ /dev/null
@@ -1,622 +0,0 @@
-import math
-import numbers
-import os
-from functools import cached_property
-from typing import Dict, Optional, Union
-
-import numcodecs
-import numpy as np
-import zarr
-
-
-def check_chunks_compatible(chunks: tuple, shape: tuple):
-    assert len(shape) == len(chunks)
-    for c in chunks:
-        assert isinstance(c, numbers.Integral)
-        assert c > 0
-
-
-def rechunk_recompress_array(group, name, chunks=None, chunk_length=None, compressor=None, tmp_key="_temp"):
-    old_arr = group[name]
-    if chunks is None:
-        if chunk_length is not None:
-            chunks = (chunk_length,) + old_arr.chunks[1:]
-        else:
-            chunks = old_arr.chunks
-    check_chunks_compatible(chunks, old_arr.shape)
-
-    if compressor is None:
-        compressor = old_arr.compressor
-
-    if (chunks == old_arr.chunks) and (compressor == old_arr.compressor):
-        # no change
-        return old_arr
-
-    # rechunk recompress
-    group.move(name, tmp_key)
-    old_arr = group[tmp_key]
-    n_copied, n_skipped, n_bytes_copied = zarr.copy(
-        source=old_arr,
-        dest=group,
-        name=name,
-        chunks=chunks,
-        compressor=compressor,
-    )
-    del group[tmp_key]
-    arr = group[name]
-    return arr
-
-
-def get_optimal_chunks(shape, dtype, target_chunk_bytes=2e6, max_chunk_length=None):
-    """
-    Common shapes
-    T,D
-    T,N,D
-    T,H,W,C
-    T,N,H,W,C
-    """
-    itemsize = np.dtype(dtype).itemsize
-    # reversed
-    rshape = list(shape[::-1])
-    if max_chunk_length is not None:
-        rshape[-1] = int(max_chunk_length)
-    split_idx = len(shape) - 1
-    for i in range(len(shape) - 1):
-        this_chunk_bytes = itemsize * np.prod(rshape[:i])
-        next_chunk_bytes = itemsize * np.prod(rshape[: i + 1])
-        if this_chunk_bytes <= target_chunk_bytes and next_chunk_bytes > target_chunk_bytes:
-            split_idx = i
-
-    rchunks = rshape[:split_idx]
-    item_chunk_bytes = itemsize * np.prod(rshape[:split_idx])
-    this_max_chunk_length = rshape[split_idx]
-    next_chunk_length = min(this_max_chunk_length, math.ceil(target_chunk_bytes / item_chunk_bytes))
-    rchunks.append(next_chunk_length)
-    len_diff = len(shape) - len(rchunks)
-    rchunks.extend([1] * len_diff)
-    chunks = tuple(rchunks[::-1])
-    # print(np.prod(chunks) * itemsize / target_chunk_bytes)
-    return chunks
-
-
-class ReplayBuffer:
-    """
-    Zarr-based temporal datastructure.
-    Assumes first dimension to be time. Only chunk in time dimension.
-    """
-
-    def __init__(self, root: Union[zarr.Group, Dict[str, dict]]):
-        """
-        Dummy constructor. Use copy_from* and create_from* class methods instead.
-        """
-        assert "data" in root
-        assert "meta" in root
-        assert "episode_ends" in root["meta"]
-        for key, value in root["data"].items():
-            assert value.shape[0] == root["meta"]["episode_ends"][-1]
-        self.root = root
-
-    # ============= create constructors ===============
-    @classmethod
-    def create_empty_zarr(cls, storage=None, root=None):
-        if root is None:
-            if storage is None:
-                storage = zarr.MemoryStore()
-            root = zarr.group(store=storage)
-        data = root.require_group("data", overwrite=False)
-        meta = root.require_group("meta", overwrite=False)
-        if "episode_ends" not in meta:
-            episode_ends = meta.zeros(
-                "episode_ends",
-                shape=(0,),
-                dtype=np.int64,
-                compressor=None,
-                overwrite=False,
-            )
-        return cls(root=root)
-
-    @classmethod
-    def create_empty_numpy(cls):
-        root = {
-            "data": dict(),
-            "meta": {"episode_ends": np.zeros((0,), dtype=np.int64)},
-        }
-        return cls(root=root)
-
-    @classmethod
-    def create_from_group(cls, group, **kwargs):
-        if "data" not in group:
-            # create from stratch
-            buffer = cls.create_empty_zarr(root=group, **kwargs)
-        else:
-            # already exist
-            buffer = cls(root=group, **kwargs)
-        return buffer
-
-    @classmethod
-    def create_from_path(cls, zarr_path, mode="r", **kwargs):
-        """
-        Open a on-disk zarr directly (for dataset larger than memory).
-        Slower.
-        """
-        group = zarr.open(os.path.expanduser(zarr_path), mode)
-        return cls.create_from_group(group, **kwargs)
-
-    # ============= copy constructors ===============
-    @classmethod
-    def copy_from_store(
-        cls,
-        src_store,
-        store=None,
-        keys=None,
-        chunks: Dict[str, tuple] = dict(),
-        compressors: Union[dict, str, numcodecs.abc.Codec] = dict(),
-        if_exists="replace",
-        **kwargs,
-    ):
-        """
-        Load to memory.
-        """
-        src_root = zarr.group(src_store)
-        root = None
-        if store is None:
-            # numpy backend
-            meta = dict()
-            for key, value in src_root["meta"].items():
-                if len(value.shape) == 0:
-                    meta[key] = np.array(value)
-                else:
-                    meta[key] = value[:]
-
-            if keys is None:
-                keys = src_root["data"].keys()
-            data = dict()
-            for key in keys:
-                arr = src_root["data"][key]
-                data[key] = arr[:]
-
-            root = {"meta": meta, "data": data}
-        else:
-            root = zarr.group(store=store)
-            # copy without recompression
-            n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                source=src_store,
-                dest=store,
-                source_path="/meta",
-                dest_path="/meta",
-                if_exists=if_exists,
-            )
-            data_group = root.create_group("data", overwrite=True)
-            if keys is None:
-                keys = src_root["data"].keys()
-            for key in keys:
-                value = src_root["data"][key]
-                cks = cls._resolve_array_chunks(chunks=chunks, key=key, array=value)
-                cpr = cls._resolve_array_compressor(compressors=compressors, key=key, array=value)
-                if cks == value.chunks and cpr == value.compressor:
-                    # copy without recompression
-                    this_path = "/data/" + key
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                        source=src_store,
-                        dest=store,
-                        source_path=this_path,
-                        dest_path=this_path,
-                        if_exists=if_exists,
-                    )
-                else:
-                    # copy with recompression
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy(
-                        source=value,
-                        dest=data_group,
-                        name=key,
-                        chunks=cks,
-                        compressor=cpr,
-                        if_exists=if_exists,
-                    )
-        buffer = cls(root=root)
-        return buffer
-
-    @classmethod
-    def copy_from_path(
-        cls,
-        zarr_path,
-        backend=None,
-        store=None,
-        keys=None,
-        chunks: Dict[str, tuple] = dict(),
-        compressors: Union[dict, str, numcodecs.abc.Codec] = dict(),
-        if_exists="replace",
-        **kwargs,
-    ):
-        """
-        Copy a on-disk zarr to in-memory compressed.
-        Recommended
-        """
-        if backend == "numpy":
-            print("backend argument is deprecated!")
-            store = None
-
-        group = zarr.open(store=os.path.expanduser(zarr_path), mode="r")
-        return cls.copy_from_store(
-            src_store=group.store,
-            store=store,
-            keys=keys,
-            chunks=chunks,
-            compressors=compressors,
-            if_exists=if_exists,
-            **kwargs,
-        )
-
-    # ============= save methods ===============
-    def save_to_store(
-        self,
-        store,
-        chunks: Optional[Dict[str, tuple]] = dict(),
-        compressors: Union[str, numcodecs.abc.Codec, dict] = dict(),
-        if_exists="replace",
-        **kwargs,
-    ):
-
-        root = zarr.group(store)
-        if self.backend == "zarr":
-            # recompression free copy
-            n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                source=self.root.store,
-                dest=store,
-                source_path="/meta",
-                dest_path="/meta",
-                if_exists=if_exists,
-            )
-        else:
-            meta_group = root.create_group("meta", overwrite=True)
-            # save meta, no chunking
-            for key, value in self.root["meta"].items():
-                _ = meta_group.array(name=key, data=value, shape=value.shape, chunks=value.shape)
-
-        # save data, chunk
-        data_group = root.create_group("data", overwrite=True)
-        for key, value in self.root["data"].items():
-            cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value)
-            cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value)
-            if isinstance(value, zarr.Array):
-                if cks == value.chunks and cpr == value.compressor:
-                    # copy without recompression
-                    this_path = "/data/" + key
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy_store(
-                        source=self.root.store,
-                        dest=store,
-                        source_path=this_path,
-                        dest_path=this_path,
-                        if_exists=if_exists,
-                    )
-                else:
-                    # copy with recompression
-                    n_copied, n_skipped, n_bytes_copied = zarr.copy(
-                        source=value,
-                        dest=data_group,
-                        name=key,
-                        chunks=cks,
-                        compressor=cpr,
-                        if_exists=if_exists,
-                    )
-            else:
-                # numpy
-                _ = data_group.array(name=key, data=value, chunks=cks, compressor=cpr)
-        return store
-
-    def save_to_path(
-        self,
-        zarr_path,
-        chunks: Optional[Dict[str, tuple]] = dict(),
-        compressors: Union[str, numcodecs.abc.Codec, dict] = dict(),
-        if_exists="replace",
-        **kwargs,
-    ):
-        store = zarr.DirectoryStore(os.path.expanduser(zarr_path))
-        return self.save_to_store(store, chunks=chunks, compressors=compressors, if_exists=if_exists, **kwargs)
-
-    @staticmethod
-    def resolve_compressor(compressor="default"):
-        if compressor == "default":
-            compressor = numcodecs.Blosc(cname="lz4", clevel=5, shuffle=numcodecs.Blosc.NOSHUFFLE)
-        elif compressor == "disk":
-            compressor = numcodecs.Blosc("zstd", clevel=5, shuffle=numcodecs.Blosc.BITSHUFFLE)
-        return compressor
-
-    @classmethod
-    def _resolve_array_compressor(cls, compressors: Union[dict, str, numcodecs.abc.Codec], key, array):
-        # allows compressor to be explicitly set to None
-        cpr = "nil"
-        if isinstance(compressors, dict):
-            if key in compressors:
-                cpr = cls.resolve_compressor(compressors[key])
-            elif isinstance(array, zarr.Array):
-                cpr = array.compressor
-        else:
-            cpr = cls.resolve_compressor(compressors)
-        # backup default
-        if cpr == "nil":
-            cpr = cls.resolve_compressor("default")
-        return cpr
-
-    @classmethod
-    def _resolve_array_chunks(cls, chunks: Union[dict, tuple], key, array):
-        cks = None
-        if isinstance(chunks, dict):
-            if key in chunks:
-                cks = chunks[key]
-            elif isinstance(array, zarr.Array):
-                cks = array.chunks
-        elif isinstance(chunks, tuple):
-            cks = chunks
-        else:
-            raise TypeError(f"Unsupported chunks type {type(chunks)}")
-        # backup default
-        if cks is None:
-            cks = get_optimal_chunks(shape=array.shape, dtype=array.dtype)
-        # check
-        check_chunks_compatible(chunks=cks, shape=array.shape)
-        return cks
-
-    # ============= properties =================
-    @cached_property
-    def data(self):
-        return self.root["data"]
-
-    @cached_property
-    def meta(self):
-        return self.root["meta"]
-
-    def update_meta(self, data):
-        # sanitize data
-        np_data = dict()
-        for key, value in data.items():
-            if isinstance(value, np.ndarray):
-                np_data[key] = value
-            else:
-                arr = np.array(value)
-                if arr.dtype == object:
-                    raise TypeError(f"Invalid value type {type(value)}")
-                np_data[key] = arr
-
-        meta_group = self.meta
-        if self.backend == "zarr":
-            for key, value in np_data.items():
-                _ = meta_group.array(
-                    name=key,
-                    data=value,
-                    shape=value.shape,
-                    chunks=value.shape,
-                    overwrite=True,
-                )
-        else:
-            meta_group.update(np_data)
-
-        return meta_group
-
-    @property
-    def episode_ends(self):
-        return self.meta["episode_ends"]
-
-    def get_episode_idxs(self):
-        import numba
-
-        numba.jit(nopython=True)
-
-        def _get_episode_idxs(episode_ends):
-            result = np.zeros((episode_ends[-1],), dtype=np.int64)
-            for i in range(len(episode_ends)):
-                start = 0
-                if i > 0:
-                    start = episode_ends[i - 1]
-                end = episode_ends[i]
-                for idx in range(start, end):
-                    result[idx] = i
-            return result
-
-        return _get_episode_idxs(self.episode_ends)
-
-    @property
-    def backend(self):
-        backend = "numpy"
-        if isinstance(self.root, zarr.Group):
-            backend = "zarr"
-        return backend
-
-    # =========== dict-like API ==============
-    def __repr__(self) -> str:
-        if self.backend == "zarr":
-            return str(self.root.tree())
-        else:
-            return super().__repr__()
-
-    def keys(self):
-        return self.data.keys()
-
-    def values(self):
-        return self.data.values()
-
-    def items(self):
-        return self.data.items()
-
-    def __getitem__(self, key):
-        return self.data[key]
-
-    def __contains__(self, key):
-        return key in self.data
-
-    # =========== our API ==============
-    @property
-    def n_steps(self):
-        if len(self.episode_ends) == 0:
-            return 0
-        return self.episode_ends[-1]
-
-    @property
-    def n_episodes(self):
-        return len(self.episode_ends)
-
-    @property
-    def chunk_size(self):
-        if self.backend == "zarr":
-            return next(iter(self.data.arrays()))[-1].chunks[0]
-        return None
-
-    @property
-    def episode_lengths(self):
-        ends = self.episode_ends[:]
-        ends = np.insert(ends, 0, 0)
-        lengths = np.diff(ends)
-        return lengths
-
-    def add_episode(
-        self,
-        data: Dict[str, np.ndarray],
-        chunks: Optional[Dict[str, tuple]] = dict(),
-        compressors: Union[str, numcodecs.abc.Codec, dict] = dict(),
-    ):
-        assert len(data) > 0
-        is_zarr = self.backend == "zarr"
-
-        curr_len = self.n_steps
-        episode_length = None
-        for key, value in data.items():
-            assert len(value.shape) >= 1
-            if episode_length is None:
-                episode_length = len(value)
-            else:
-                assert episode_length == len(value)
-        new_len = curr_len + episode_length
-
-        for key, value in data.items():
-            new_shape = (new_len,) + value.shape[1:]
-            # create array
-            if key not in self.data:
-                if is_zarr:
-                    cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value)
-                    cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value)
-                    arr = self.data.zeros(
-                        name=key,
-                        shape=new_shape,
-                        chunks=cks,
-                        dtype=value.dtype,
-                        compressor=cpr,
-                    )
-                else:
-                    # copy data to prevent modify
-                    arr = np.zeros(shape=new_shape, dtype=value.dtype)
-                    self.data[key] = arr
-            else:
-                arr = self.data[key]
-                assert value.shape[1:] == arr.shape[1:]
-                # same method for both zarr and numpy
-                if is_zarr:
-                    arr.resize(new_shape)
-                else:
-                    arr.resize(new_shape, refcheck=False)
-            # copy data
-            arr[-value.shape[0] :] = value
-
-        # append to episode ends
-        episode_ends = self.episode_ends
-        if is_zarr:
-            episode_ends.resize(episode_ends.shape[0] + 1)
-        else:
-            episode_ends.resize(episode_ends.shape[0] + 1, refcheck=False)
-        episode_ends[-1] = new_len
-
-        # rechunk
-        if is_zarr:
-            if episode_ends.chunks[0] < episode_ends.shape[0]:
-                rechunk_recompress_array(
-                    self.meta,
-                    "episode_ends",
-                    chunk_length=int(episode_ends.shape[0] * 1.5),
-                )
-
-    def drop_episode(self):
-        is_zarr = self.backend == "zarr"
-        episode_ends = self.episode_ends[:].copy()
-        assert len(episode_ends) > 0
-        start_idx = 0
-        if len(episode_ends) > 1:
-            start_idx = episode_ends[-2]
-        for key, value in self.data.items():
-            new_shape = (start_idx,) + value.shape[1:]
-            if is_zarr:
-                value.resize(new_shape)
-            else:
-                value.resize(new_shape, refcheck=False)
-        if is_zarr:
-            self.episode_ends.resize(len(episode_ends) - 1)
-        else:
-            self.episode_ends.resize(len(episode_ends) - 1, refcheck=False)
-
-    def pop_episode(self):
-        assert self.n_episodes > 0
-        episode = self.get_episode(self.n_episodes - 1, copy=True)
-        self.drop_episode()
-        return episode
-
-    def extend(self, data):
-        self.add_episode(data)
-
-    def get_episode(self, idx, copy=False):
-        idx = list(range(len(self.episode_ends)))[idx]
-        start_idx = 0
-        if idx > 0:
-            start_idx = self.episode_ends[idx - 1]
-        end_idx = self.episode_ends[idx]
-        result = self.get_steps_slice(start_idx, end_idx, copy=copy)
-        return result
-
-    def get_episode_slice(self, idx):
-        start_idx = 0
-        if idx > 0:
-            start_idx = self.episode_ends[idx - 1]
-        end_idx = self.episode_ends[idx]
-        return slice(start_idx, end_idx)
-
-    def get_steps_slice(self, start, stop, step=None, copy=False):
-        _slice = slice(start, stop, step)
-
-        result = dict()
-        for key, value in self.data.items():
-            x = value[_slice]
-            if copy and isinstance(value, np.ndarray):
-                x = x.copy()
-            result[key] = x
-        return result
-
-    # =========== chunking =============
-    def get_chunks(self) -> dict:
-        assert self.backend == "zarr"
-        chunks = dict()
-        for key, value in self.data.items():
-            chunks[key] = value.chunks
-        return chunks
-
-    def set_chunks(self, chunks: dict):
-        assert self.backend == "zarr"
-        for key, value in chunks.items():
-            if key in self.data:
-                arr = self.data[key]
-                if value != arr.chunks:
-                    check_chunks_compatible(chunks=value, shape=arr.shape)
-                    rechunk_recompress_array(self.data, key, chunks=value)
-
-    def get_compressors(self) -> dict:
-        assert self.backend == "zarr"
-        compressors = dict()
-        for key, value in self.data.items():
-            compressors[key] = value.compressor
-        return compressors
-
-    def set_compressors(self, compressors: dict):
-        assert self.backend == "zarr"
-        for key, value in compressors.items():
-            if key in self.data:
-                arr = self.data[key]
-                compressor = self.resolve_compressor(value)
-                if compressor != arr.compressor:
-                    rechunk_recompress_array(self.data, key, compressor=compressor)
diff --git a/roboverse_learn/il/utils/common/sampler.py b/roboverse_learn/il/utils/common/sampler.py
deleted file mode 100644
index 5f58624bd..000000000
--- a/roboverse_learn/il/utils/common/sampler.py
+++ /dev/null
@@ -1,166 +0,0 @@
-from __future__ import annotations
-
-import numba
-import numpy as np
-
-from roboverse_learn.il.utils.common.replay_buffer import ReplayBuffer
-
-
-@numba.jit(nopython=True)
-def create_indices(
-    episode_ends: np.ndarray,
-    sequence_length: int,
-    episode_mask: np.ndarray,
-    pad_before: int = 0,
-    pad_after: int = 0,
-    debug: bool = True,
-) -> np.ndarray:
-    pad_before = min(max(pad_before, 0), sequence_length - 1)
-    pad_after = min(max(pad_after, 0), sequence_length - 1)
-
-    indices = list()
-    for i in range(len(episode_ends)):
-        if not episode_mask[i]:
-            # skip episode
-            continue
-        start_idx = 0
-        if i > 0:
-            start_idx = episode_ends[i - 1]
-        end_idx = episode_ends[i]
-        episode_length = end_idx - start_idx
-
-        min_start = -pad_before
-        max_start = episode_length - sequence_length + pad_after
-
-        # range stops one idx before end
-        for idx in range(min_start, max_start + 1):
-            buffer_start_idx = max(idx, 0) + start_idx
-            buffer_end_idx = min(idx + sequence_length, episode_length) + start_idx
-            start_offset = buffer_start_idx - (idx + start_idx)
-            end_offset = (idx + sequence_length + start_idx) - buffer_end_idx
-            sample_start_idx = 0 + start_offset
-            sample_end_idx = sequence_length - end_offset
-            if debug:
-                assert start_offset >= 0
-                assert end_offset >= 0
-                assert (sample_end_idx - sample_start_idx) == (buffer_end_idx - buffer_start_idx)
-            indices.append([buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx])
-    indices = np.array(indices)
-    return indices
-
-
-def get_val_mask(n_episodes, val_ratio, seed=0):
-    val_mask = np.zeros(n_episodes, dtype=bool)
-    if val_ratio <= 0:
-        return val_mask
-
-    # have at least 1 episode for validation, and at least 1 episode for train
-    n_val = min(max(1, round(n_episodes * val_ratio)), n_episodes - 1)
-    rng = np.random.default_rng(seed=seed)
-    # val_idxs = rng.choice(n_episodes, size=n_val, replace=False)
-    val_idxs = -1
-    val_mask[val_idxs] = True
-    return val_mask
-
-
-def downsample_mask(mask, max_n, seed=0):
-    # subsample training data
-    train_mask = mask
-    if (max_n is not None) and (np.sum(train_mask) > max_n):
-        n_train = int(max_n)
-        curr_train_idxs = np.nonzero(train_mask)[0]
-        rng = np.random.default_rng(seed=seed)
-        train_idxs_idx = rng.choice(len(curr_train_idxs), size=n_train, replace=False)
-        train_idxs = curr_train_idxs[train_idxs_idx]
-        train_mask = np.zeros_like(train_mask)
-        train_mask[train_idxs] = True
-        assert np.sum(train_mask) == n_train
-    return train_mask
-
-
-class SequenceSampler:
-    def __init__(
-        self,
-        replay_buffer: ReplayBuffer,
-        sequence_length: int,
-        pad_before: int = 0,
-        pad_after: int = 0,
-        keys=None,
-        key_first_k=None,
-        episode_mask: np.ndarray | None = None,
-    ):
-        """
-        key_first_k: dict str: int
-            Only take first k data from these keys (to improve perf)
-        """
-
-        super().__init__()
-        if key_first_k is None:
-            key_first_k = dict()
-        assert sequence_length >= 1
-        if keys is None:
-            keys = list(replay_buffer.keys())
-
-        episode_ends = replay_buffer.episode_ends[:]
-        if episode_mask is None:
-            episode_mask = np.ones(episode_ends.shape, dtype=bool)
-
-        if np.any(episode_mask):
-            indices = create_indices(
-                episode_ends,
-                sequence_length=sequence_length,
-                pad_before=pad_before,
-                pad_after=pad_after,
-                episode_mask=episode_mask,
-            )
-        else:
-            indices = np.zeros((0, 4), dtype=np.int64)
-
-        # (buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx)
-        self.indices = indices
-        self.keys = list(keys)  # prevent OmegaConf list performance problem
-        self.sequence_length = sequence_length
-        self.replay_buffer = replay_buffer
-        self.key_first_k = key_first_k
-
-    def __len__(self):
-        return len(self.indices)
-
-    def sample_sequence(self, idx):
-        buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx = self.indices[idx]
-        result = dict()
-        for key in self.keys:
-            input_arr = self.replay_buffer[key]
-            # performance optimization, avoid small allocation if possible
-            if key not in self.key_first_k:
-                sample = input_arr[buffer_start_idx:buffer_end_idx]
-            else:
-                # performance optimization, only load used obs steps
-                n_data = buffer_end_idx - buffer_start_idx
-                k_data = min(self.key_first_k[key], n_data)
-                # fill value with Nan to catch bugs
-                # the non-loaded region should never be used
-                sample = np.full(
-                    (n_data,) + input_arr.shape[1:],
-                    fill_value=np.nan,
-                    dtype=input_arr.dtype,
-                )
-                try:
-                    sample[:k_data] = input_arr[buffer_start_idx : buffer_start_idx + k_data]
-                except Exception as e:
-                    import pdb
-
-                    pdb.set_trace()
-            data = sample
-            if (sample_start_idx > 0) or (sample_end_idx < self.sequence_length):
-                data = np.zeros(
-                    shape=(self.sequence_length,) + input_arr.shape[1:],
-                    dtype=input_arr.dtype,
-                )
-                if sample_start_idx > 0:
-                    data[:sample_start_idx] = sample[0]
-                if sample_end_idx < self.sequence_length:
-                    data[sample_end_idx:] = sample[-1]
-                data[sample_start_idx:sample_end_idx] = sample
-            result[key] = data
-        return result
diff --git a/roboverse_learn/il/utils/common/cv2_util.py b/roboverse_learn/il/utils/cv2_util.py
similarity index 100%
rename from roboverse_learn/il/utils/common/cv2_util.py
rename to roboverse_learn/il/utils/cv2_util.py
diff --git a/roboverse_learn/il/utils/common/dict_of_tensor_mixin.py b/roboverse_learn/il/utils/dict_of_tensor_mixin.py
similarity index 100%
rename from roboverse_learn/il/utils/common/dict_of_tensor_mixin.py
rename to roboverse_learn/il/utils/dict_of_tensor_mixin.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/__init__.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/__init__.py
deleted file mode 100644
index 7394cce83..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .workspace.robotworkspace import RobotWorkspace
-from .workspace.base_workspace import BaseWorkspace
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/checkpoint_util.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/checkpoint_util.py
deleted file mode 100644
index 049b6ceb2..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/checkpoint_util.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import os
-from typing import Dict, Optional
-
-
-class TopKCheckpointManager:
-    def __init__(
-        self,
-        save_dir,
-        monitor_key: str,
-        mode="min",
-        k=1,
-        format_str="epoch={epoch:03d}-train_loss={train_loss:.3f}.ckpt",
-    ):
-        assert mode in ["max", "min"]
-        assert k >= 0
-
-        self.save_dir = save_dir
-        self.monitor_key = monitor_key
-        self.mode = mode
-        self.k = k
-        self.format_str = format_str
-        self.path_value_map = dict()
-
-    def get_ckpt_path(self, data: Dict[str, float]) -> Optional[str]:
-        if self.k == 0:
-            return None
-
-        value = data[self.monitor_key]
-        ckpt_path = os.path.join(self.save_dir, self.format_str.format(**data))
-
-        if len(self.path_value_map) < self.k:
-            # under-capacity
-            self.path_value_map[ckpt_path] = value
-            return ckpt_path
-
-        # at capacity
-        sorted_map = sorted(self.path_value_map.items(), key=lambda x: x[1])
-        min_path, min_value = sorted_map[0]
-        max_path, max_value = sorted_map[-1]
-
-        delete_path = None
-        if self.mode == "max":
-            if value > min_value:
-                delete_path = min_path
-        else:
-            if value < max_value:
-                delete_path = max_path
-
-        if delete_path is None:
-            return None
-        else:
-            del self.path_value_map[delete_path]
-            self.path_value_map[ckpt_path] = value
-
-            if not os.path.exists(self.save_dir):
-                os.mkdir(self.save_dir)
-
-            if os.path.exists(delete_path):
-                os.remove(delete_path)
-            return ckpt_path
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/cv2_util.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/cv2_util.py
deleted file mode 100644
index c6c9e6446..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/cv2_util.py
+++ /dev/null
@@ -1,151 +0,0 @@
-import math
-from typing import Tuple
-
-import cv2
-import numpy as np
-
-
-def draw_reticle(img, u, v, label_color):
-    """
-    Draws a reticle (cross-hair) on the image at the given position on top of
-    the original image.
-    @param img (In/Out) uint8 3 channel image
-    @param u X coordinate (width)
-    @param v Y coordinate (height)
-    @param label_color tuple of 3 ints for RGB color used for drawing.
-    """
-    # Cast to int.
-    u = int(u)
-    v = int(v)
-
-    white = (255, 255, 255)
-    cv2.circle(img, (u, v), 10, label_color, 1)
-    cv2.circle(img, (u, v), 11, white, 1)
-    cv2.circle(img, (u, v), 12, label_color, 1)
-    cv2.line(img, (u, v + 1), (u, v + 3), white, 1)
-    cv2.line(img, (u + 1, v), (u + 3, v), white, 1)
-    cv2.line(img, (u, v - 1), (u, v - 3), white, 1)
-    cv2.line(img, (u - 1, v), (u - 3, v), white, 1)
-
-
-def draw_text(
-    img,
-    *,
-    text,
-    uv_top_left,
-    color=(255, 255, 255),
-    fontScale=0.5,
-    thickness=1,
-    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
-    outline_color=(0, 0, 0),
-    line_spacing=1.5,
-):
-    """
-    Draws multiline with an outline.
-    """
-    assert isinstance(text, str)
-
-    uv_top_left = np.array(uv_top_left, dtype=float)
-    assert uv_top_left.shape == (2,)
-
-    for line in text.splitlines():
-        (w, h), _ = cv2.getTextSize(
-            text=line,
-            fontFace=fontFace,
-            fontScale=fontScale,
-            thickness=thickness,
-        )
-        uv_bottom_left_i = uv_top_left + [0, h]
-        org = tuple(uv_bottom_left_i.astype(int))
-
-        if outline_color is not None:
-            cv2.putText(
-                img,
-                text=line,
-                org=org,
-                fontFace=fontFace,
-                fontScale=fontScale,
-                color=outline_color,
-                thickness=thickness * 3,
-                lineType=cv2.LINE_AA,
-            )
-        cv2.putText(
-            img,
-            text=line,
-            org=org,
-            fontFace=fontFace,
-            fontScale=fontScale,
-            color=color,
-            thickness=thickness,
-            lineType=cv2.LINE_AA,
-        )
-
-        uv_top_left += [0, h * line_spacing]
-
-
-def get_image_transform(
-    input_res: Tuple[int, int] = (1280, 720),
-    output_res: Tuple[int, int] = (640, 480),
-    bgr_to_rgb: bool = False,
-):
-
-    iw, ih = input_res
-    ow, oh = output_res
-    rw, rh = None, None
-    interp_method = cv2.INTER_AREA
-
-    if (iw / ih) >= (ow / oh):
-        # input is wider
-        rh = oh
-        rw = math.ceil(rh / ih * iw)
-        if oh > ih:
-            interp_method = cv2.INTER_LINEAR
-    else:
-        rw = ow
-        rh = math.ceil(rw / iw * ih)
-        if ow > iw:
-            interp_method = cv2.INTER_LINEAR
-
-    w_slice_start = (rw - ow) // 2
-    w_slice = slice(w_slice_start, w_slice_start + ow)
-    h_slice_start = (rh - oh) // 2
-    h_slice = slice(h_slice_start, h_slice_start + oh)
-    c_slice = slice(None)
-    if bgr_to_rgb:
-        c_slice = slice(None, None, -1)
-
-    def transform(img: np.ndarray):
-        assert img.shape == ((ih, iw, 3))
-        # resize
-        img = cv2.resize(img, (rw, rh), interpolation=interp_method)
-        # crop
-        img = img[h_slice, w_slice, c_slice]
-        return img
-
-    return transform
-
-
-def optimal_row_cols(n_cameras, in_wh_ratio, max_resolution=(1920, 1080)):
-    out_w, out_h = max_resolution
-    out_wh_ratio = out_w / out_h
-
-    n_rows = np.arange(n_cameras, dtype=np.int64) + 1
-    n_cols = np.ceil(n_cameras / n_rows).astype(np.int64)
-    cat_wh_ratio = in_wh_ratio * (n_cols / n_rows)
-    ratio_diff = np.abs(out_wh_ratio - cat_wh_ratio)
-    best_idx = np.argmin(ratio_diff)
-    best_n_row = n_rows[best_idx]
-    best_n_col = n_cols[best_idx]
-    best_cat_wh_ratio = cat_wh_ratio[best_idx]
-
-    rw, rh = None, None
-    if best_cat_wh_ratio >= out_wh_ratio:
-        # cat is wider
-        rw = math.floor(out_w / best_n_col)
-        rh = math.floor(rw / in_wh_ratio)
-    else:
-        rh = math.floor(out_h / best_n_row)
-        rw = math.floor(rh * in_wh_ratio)
-
-    # crop_resolution = (rw, rh)
-    return rw, rh, best_n_col, best_n_row
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/env_util.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/env_util.py
deleted file mode 100644
index 30622fac6..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/env_util.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import cv2
-import numpy as np
-
-
-def render_env_video(env, states, actions=None):
-    observations = states
-    imgs = list()
-    for i in range(len(observations)):
-        state = observations[i]
-        env.set_state(state)
-        if i == 0:
-            env.set_state(state)
-        img = env.render()
-        # draw action
-        if actions is not None:
-            action = actions[i]
-            coord = (action / 512 * 96).astype(np.int32)
-            cv2.drawMarker(
-                img,
-                coord,
-                color=(255, 0, 0),
-                markerType=cv2.MARKER_CROSS,
-                markerSize=8,
-                thickness=1,
-            )
-        imgs.append(img)
-    imgs = np.array(imgs)
-    return imgs
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/json_logger.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/json_logger.py
deleted file mode 100644
index 9bd7f2973..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/json_logger.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import copy
-import json
-import numbers
-import os
-from typing import Any, Callable, Optional, Sequence
-
-import pandas as pd
-
-
-def read_json_log(path: str, required_keys: Sequence[str] = tuple(), **kwargs) -> pd.DataFrame:
-    """
-    Read json-per-line file, with potentially incomplete lines.
-    kwargs passed to pd.read_json
-    """
-    lines = list()
-    with open(path, "r") as f:
-        while True:
-            # one json per line
-            line = f.readline()
-            if len(line) == 0:
-                # EOF
-                break
-            elif not line.endswith("\n"):
-                # incomplete line
-                break
-            is_relevant = False
-            for k in required_keys:
-                if k in line:
-                    is_relevant = True
-                    break
-            if is_relevant:
-                lines.append(line)
-    if len(lines) < 1:
-        return pd.DataFrame()
-    json_buf = f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
-    df = pd.read_json(json_buf, **kwargs)
-    return df
-
-
-class JsonLogger:
-    def __init__(self, path: str, filter_fn: Optional[Callable[[str, Any], bool]] = None):
-        if filter_fn is None:
-            filter_fn = lambda k, v: isinstance(v, numbers.Number)
-
-        # default to append mode
-        self.path = path
-        self.filter_fn = filter_fn
-        self.file = None
-        self.last_log = None
-
-    def start(self):
-        # use line buffering
-        try:
-            self.file = file = open(self.path, "r+", buffering=1)
-        except FileNotFoundError:
-            self.file = file = open(self.path, "w+", buffering=1)
-
-        # Move the pointer (similar to a cursor in a text editor) to the end of the file
-        pos = file.seek(0, os.SEEK_END)
-
-        # Read each character in the file one at a time from the last
-        # character going backwards, searching for a newline character
-        # If we find a new line, exit the search
-        while pos > 0 and file.read(1) != "\n":
-            pos -= 1
-            file.seek(pos, os.SEEK_SET)
-        # now the file pointer is at one past the last '\n'
-        # and pos is at the last '\n'.
-        last_line_end = file.tell()
-
-        # find the start of second last line
-        pos = max(0, pos - 1)
-        file.seek(pos, os.SEEK_SET)
-        while pos > 0 and file.read(1) != "\n":
-            pos -= 1
-            file.seek(pos, os.SEEK_SET)
-        # now the file pointer is at one past the second last '\n'
-        last_line_start = file.tell()
-
-        if last_line_start < last_line_end:
-            # has last line of json
-            last_line = file.readline()
-            self.last_log = json.loads(last_line)
-
-        # remove the last incomplete line
-        file.seek(last_line_end)
-        file.truncate()
-
-    def stop(self):
-        self.file.close()
-        self.file = None
-
-    def __enter__(self):
-        self.start()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.stop()
-
-    def log(self, data: dict):
-        filtered_data = dict(filter(lambda x: self.filter_fn(*x), data.items()))
-        # save current as last log
-        self.last_log = filtered_data
-        for k, v in filtered_data.items():
-            if isinstance(v, numbers.Integral):
-                filtered_data[k] = int(v)
-            elif isinstance(v, numbers.Number):
-                filtered_data[k] = float(v)
-        buf = json.dumps(filtered_data)
-        # ensure one line per json
-        buf = buf.replace("\n", "") + "\n"
-        self.file.write(buf)
-
-    def get_last_log(self):
-        return copy.deepcopy(self.last_log)
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/nested_dict_util.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/nested_dict_util.py
deleted file mode 100644
index 013bd0bd8..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/nested_dict_util.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import functools
-
-
-def nested_dict_map(f, x):
-    """
-    Map f over all leaf of nested dict x
-    """
-
-    if not isinstance(x, dict):
-        return f(x)
-    y = dict()
-    for key, value in x.items():
-        y[key] = nested_dict_map(f, value)
-    return y
-
-
-def nested_dict_reduce(f, x):
-    """
-    Map f over all values of nested dict x, and reduce to a single value
-    """
-    if not isinstance(x, dict):
-        return x
-
-    reduced_values = list()
-    for value in x.values():
-        reduced_values.append(nested_dict_reduce(f, value))
-    y = functools.reduce(f, reduced_values)
-    return y
-
-
-def nested_dict_check(f, x):
-    bool_dict = nested_dict_map(f, x)
-    result = nested_dict_reduce(lambda x, y: x and y, bool_dict)
-    return result
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pose_trajectory_interpolator.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pose_trajectory_interpolator.py
deleted file mode 100644
index e87bc3d98..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pose_trajectory_interpolator.py
+++ /dev/null
@@ -1,208 +0,0 @@
-import numbers
-from typing import Union
-
-import numpy as np
-import scipy.interpolate as si
-import scipy.spatial.transform as st
-
-
-def rotation_distance(a: st.Rotation, b: st.Rotation) -> float:
-    return (b * a.inv()).magnitude()
-
-
-def pose_distance(start_pose, end_pose):
-    start_pose = np.array(start_pose)
-    end_pose = np.array(end_pose)
-    start_pos = start_pose[:3]
-    end_pos = end_pose[:3]
-    start_rot = st.Rotation.from_rotvec(start_pose[3:])
-    end_rot = st.Rotation.from_rotvec(end_pose[3:])
-    pos_dist = np.linalg.norm(end_pos - start_pos)
-    rot_dist = rotation_distance(start_rot, end_rot)
-    return pos_dist, rot_dist
-
-
-class PoseTrajectoryInterpolator:
-    def __init__(self, times: np.ndarray, poses: np.ndarray):
-        assert len(times) >= 1
-        assert len(poses) == len(times)
-        if not isinstance(times, np.ndarray):
-            times = np.array(times)
-        if not isinstance(poses, np.ndarray):
-            poses = np.array(poses)
-
-        if len(times) == 1:
-            # special treatment for single step interpolation
-            self.single_step = True
-            self._times = times
-            self._poses = poses
-        else:
-            self.single_step = False
-            assert np.all(times[1:] >= times[:-1])
-
-            pos = poses[:, :3]
-            rot = st.Rotation.from_rotvec(poses[:, 3:])
-
-            self.pos_interp = si.interp1d(times, pos, axis=0, assume_sorted=True)
-            self.rot_interp = st.Slerp(times, rot)
-
-    @property
-    def times(self) -> np.ndarray:
-        if self.single_step:
-            return self._times
-        else:
-            return self.pos_interp.x
-
-    @property
-    def poses(self) -> np.ndarray:
-        if self.single_step:
-            return self._poses
-        else:
-            n = len(self.times)
-            poses = np.zeros((n, 6))
-            poses[:, :3] = self.pos_interp.y
-            poses[:, 3:] = self.rot_interp(self.times).as_rotvec()
-            return poses
-
-    def trim(self, start_t: float, end_t: float) -> "PoseTrajectoryInterpolator":
-        assert start_t <= end_t
-        times = self.times
-        should_keep = (start_t < times) & (times < end_t)
-        keep_times = times[should_keep]
-        all_times = np.concatenate([[start_t], keep_times, [end_t]])
-        # remove duplicates, Slerp requires strictly increasing x
-        all_times = np.unique(all_times)
-        # interpolate
-        all_poses = self(all_times)
-        return PoseTrajectoryInterpolator(times=all_times, poses=all_poses)
-
-    def drive_to_waypoint(
-        self, pose, time, curr_time, max_pos_speed=np.inf, max_rot_speed=np.inf
-    ) -> "PoseTrajectoryInterpolator":
-        assert max_pos_speed > 0
-        assert max_rot_speed > 0
-        time = max(time, curr_time)
-
-        curr_pose = self(curr_time)
-        pos_dist, rot_dist = pose_distance(curr_pose, pose)
-        pos_min_duration = pos_dist / max_pos_speed
-        rot_min_duration = rot_dist / max_rot_speed
-        duration = time - curr_time
-        duration = max(duration, max(pos_min_duration, rot_min_duration))
-        assert duration >= 0
-        last_waypoint_time = curr_time + duration
-
-        # insert new pose
-        trimmed_interp = self.trim(curr_time, curr_time)
-        times = np.append(trimmed_interp.times, [last_waypoint_time], axis=0)
-        poses = np.append(trimmed_interp.poses, [pose], axis=0)
-
-        # create new interpolator
-        final_interp = PoseTrajectoryInterpolator(times, poses)
-        return final_interp
-
-    def schedule_waypoint(
-        self,
-        pose,
-        time,
-        max_pos_speed=np.inf,
-        max_rot_speed=np.inf,
-        curr_time=None,
-        last_waypoint_time=None,
-    ) -> "PoseTrajectoryInterpolator":
-        assert max_pos_speed > 0
-        assert max_rot_speed > 0
-        if last_waypoint_time is not None:
-            assert curr_time is not None
-
-        # trim current interpolator to between curr_time and last_waypoint_time
-        start_time = self.times[0]
-        end_time = self.times[-1]
-        assert start_time <= end_time
-
-        if curr_time is not None:
-            if time <= curr_time:
-                # if insert time is earlier than current time
-                # no effect should be done to the interpolator
-                return self
-            # now, curr_time < time
-            start_time = max(curr_time, start_time)
-
-            if last_waypoint_time is not None:
-                # if last_waypoint_time is earlier than start_time
-                # use start_time
-                if time <= last_waypoint_time:
-                    end_time = curr_time
-                else:
-                    end_time = max(last_waypoint_time, curr_time)
-            else:
-                end_time = curr_time
-
-        end_time = min(end_time, time)
-        start_time = min(start_time, end_time)
-        # end time should be the latest of all times except time
-        # after this we can assume order (proven by zhenjia, due to the 2 min operations)
-
-        # Constraints:
-        # start_time <= end_time <= time (proven by zhenjia)
-        # curr_time <= start_time (proven by zhenjia)
-        # curr_time <= time (proven by zhenjia)
-
-        # time can't change
-        # last_waypoint_time can't change
-        # curr_time can't change
-        assert start_time <= end_time
-        assert end_time <= time
-        if last_waypoint_time is not None:
-            if time <= last_waypoint_time:
-                assert end_time == curr_time
-            else:
-                assert end_time == max(last_waypoint_time, curr_time)
-
-        if curr_time is not None:
-            assert curr_time <= start_time
-            assert curr_time <= time
-
-        trimmed_interp = self.trim(start_time, end_time)
-        # after this, all waypoints in trimmed_interp is within start_time and end_time
-        # and is earlier than time
-
-        # determine speed
-        duration = time - end_time
-        end_pose = trimmed_interp(end_time)
-        pos_dist, rot_dist = pose_distance(pose, end_pose)
-        pos_min_duration = pos_dist / max_pos_speed
-        rot_min_duration = rot_dist / max_rot_speed
-        duration = max(duration, max(pos_min_duration, rot_min_duration))
-        assert duration >= 0
-        last_waypoint_time = end_time + duration
-
-        # insert new pose
-        times = np.append(trimmed_interp.times, [last_waypoint_time], axis=0)
-        poses = np.append(trimmed_interp.poses, [pose], axis=0)
-
-        # create new interpolator
-        final_interp = PoseTrajectoryInterpolator(times, poses)
-        return final_interp
-
-    def __call__(self, t: Union[numbers.Number, np.ndarray]) -> np.ndarray:
-        is_single = False
-        if isinstance(t, numbers.Number):
-            is_single = True
-            t = np.array([t])
-
-        pose = np.zeros((len(t), 6))
-        if self.single_step:
-            pose[:] = self._poses[0]
-        else:
-            start_time = self.times[0]
-            end_time = self.times[-1]
-            t = np.clip(t, start_time, end_time)
-
-            pose = np.zeros((len(t), 6))
-            pose[:, :3] = self.pos_interp(t)
-            pose[:, 3:] = self.rot_interp(t).as_rotvec()
-
-        if is_single:
-            pose = pose[0]
-        return pose
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/precise_sleep.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/precise_sleep.py
deleted file mode 100644
index 04d783ee7..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/precise_sleep.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import time
-
-
-def precise_sleep(dt: float, slack_time: float = 0.001, time_func=time.monotonic):
-    """
-    Use hybrid of time.sleep and spinning to minimize jitter.
-    Sleep dt - slack_time seconds first, then spin for the rest.
-    """
-    t_start = time_func()
-    if dt > slack_time:
-        time.sleep(dt - slack_time)
-    t_end = t_start + dt
-    while time_func() < t_end:
-        pass
-    return
-
-
-def precise_wait(t_end: float, slack_time: float = 0.001, time_func=time.monotonic):
-    t_start = time_func()
-    t_wait = t_end - t_start
-    if t_wait > 0:
-        t_sleep = t_wait - slack_time
-        if t_sleep > 0:
-            time.sleep(t_sleep)
-        while time_func() < t_end:
-            pass
-    return
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pymunk_override.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pymunk_override.py
deleted file mode 100644
index 4b9dd3b45..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pymunk_override.py
+++ /dev/null
@@ -1,244 +0,0 @@
-# ----------------------------------------------------------------------------
-# pymunk
-# Copyright (c) 2007-2016 Victor Blomqvist
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-# ----------------------------------------------------------------------------
-
-"""This submodule contains helper functions to help with quick prototyping
-using pymunk together with pygame.
-
-Intended to help with debugging and prototyping, not for actual production use
-in a full application. The methods contained in this module is opinionated
-about your coordinate system and not in any way optimized.
-"""
-
-__docformat__ = "reStructuredText"
-
-__all__ = [
-    "DrawOptions",
-    "get_mouse_pos",
-    "to_pygame",
-    "from_pygame",
-    "lighten",
-    "positive_y_is_up",
-]
-
-from typing import List, Sequence, Tuple
-
-import numpy as np
-import pygame
-import pymunk
-from pymunk.space_debug_draw_options import SpaceDebugColor
-from pymunk.vec2d import Vec2d
-
-positive_y_is_up: bool = False
-"""Make increasing values of y point upwards.
-
-When True::
-
-    y
-    ^
-    |      . (3, 3)
-    |
-    |   . (2, 2)
-    |
-    +------ > x
-
-When False::
-
-    +------ > x
-    |
-    |   . (2, 2)
-    |
-    |      . (3, 3)
-    v
-    y
-
-"""
-
-
-class DrawOptions(pymunk.SpaceDebugDrawOptions):
-    def __init__(self, surface: pygame.Surface) -> None:
-        """Draw a pymunk.Space on a pygame.Surface object.
-
-        Typical usage::
-
-        >>> import pymunk
-        >>> surface = pygame.Surface((10,10))
-        >>> space = pymunk.Space()
-        >>> options = pymunk.pygame_util.DrawOptions(surface)
-        >>> space.debug_draw(options)
-
-        You can control the color of a shape by setting shape.color to the color
-        you want it drawn in::
-
-        >>> c = pymunk.Circle(None, 10)
-        >>> c.color = pygame.Color("pink")
-
-        See pygame_util.demo.py for a full example
-
-        Since pygame uses a coordinate system where y points down (in contrast
-        to many other cases), you either have to make the physics simulation
-        with Pymunk also behave in that way, or flip everything when you draw.
-
-        The easiest is probably to just make the simulation behave the same
-        way as Pygame does. In that way all coordinates used are in the same
-        orientation and easy to reason about::
-
-        >>> space = pymunk.Space()
-        >>> space.gravity = (0, -1000)
-        >>> body = pymunk.Body()
-        >>> body.position = (0, 0) # will be positioned in the top left corner
-        >>> space.debug_draw(options)
-
-        To flip the drawing its possible to set the module property
-        :py:data:`positive_y_is_up` to True. Then the pygame drawing will flip
-        the simulation upside down before drawing::
-
-        >>> positive_y_is_up = True
-        >>> body = pymunk.Body()
-        >>> body.position = (0, 0)
-        >>> # Body will be position in bottom left corner
-
-        :Parameters:
-                surface : pygame.Surface
-                    Surface that the objects will be drawn on
-        """
-        self.surface = surface
-        super(DrawOptions, self).__init__()
-
-    def draw_circle(
-        self,
-        pos: Vec2d,
-        angle: float,
-        radius: float,
-        outline_color: SpaceDebugColor,
-        fill_color: SpaceDebugColor,
-    ) -> None:
-        p = to_pygame(pos, self.surface)
-
-        pygame.draw.circle(self.surface, fill_color.as_int(), p, round(radius), 0)
-        pygame.draw.circle(self.surface, light_color(fill_color).as_int(), p, round(radius - 4), 0)
-
-        circle_edge = pos + Vec2d(radius, 0).rotated(angle)
-        p2 = to_pygame(circle_edge, self.surface)
-        line_r = 2 if radius > 20 else 1
-        # pygame.draw.lines(self.surface, outline_color.as_int(), False, [p, p2], line_r)
-
-    def draw_segment(self, a: Vec2d, b: Vec2d, color: SpaceDebugColor) -> None:
-        p1 = to_pygame(a, self.surface)
-        p2 = to_pygame(b, self.surface)
-
-        pygame.draw.aalines(self.surface, color.as_int(), False, [p1, p2])
-
-    def draw_fat_segment(
-        self,
-        a: Tuple[float, float],
-        b: Tuple[float, float],
-        radius: float,
-        outline_color: SpaceDebugColor,
-        fill_color: SpaceDebugColor,
-    ) -> None:
-        p1 = to_pygame(a, self.surface)
-        p2 = to_pygame(b, self.surface)
-
-        r = round(max(1, radius * 2))
-        pygame.draw.lines(self.surface, fill_color.as_int(), False, [p1, p2], r)
-        if r > 2:
-            orthog = [abs(p2[1] - p1[1]), abs(p2[0] - p1[0])]
-            if orthog[0] == 0 and orthog[1] == 0:
-                return
-            scale = radius / (orthog[0] * orthog[0] + orthog[1] * orthog[1]) ** 0.5
-            orthog[0] = round(orthog[0] * scale)
-            orthog[1] = round(orthog[1] * scale)
-            points = [
-                (p1[0] - orthog[0], p1[1] - orthog[1]),
-                (p1[0] + orthog[0], p1[1] + orthog[1]),
-                (p2[0] + orthog[0], p2[1] + orthog[1]),
-                (p2[0] - orthog[0], p2[1] - orthog[1]),
-            ]
-            pygame.draw.polygon(self.surface, fill_color.as_int(), points)
-            pygame.draw.circle(
-                self.surface,
-                fill_color.as_int(),
-                (round(p1[0]), round(p1[1])),
-                round(radius),
-            )
-            pygame.draw.circle(
-                self.surface,
-                fill_color.as_int(),
-                (round(p2[0]), round(p2[1])),
-                round(radius),
-            )
-
-    def draw_polygon(
-        self,
-        verts: Sequence[Tuple[float, float]],
-        radius: float,
-        outline_color: SpaceDebugColor,
-        fill_color: SpaceDebugColor,
-    ) -> None:
-        ps = [to_pygame(v, self.surface) for v in verts]
-        ps += [ps[0]]
-
-        radius = 2
-        pygame.draw.polygon(self.surface, light_color(fill_color).as_int(), ps)
-
-        if radius > 0:
-            for i in range(len(verts)):
-                a = verts[i]
-                b = verts[(i + 1) % len(verts)]
-                self.draw_fat_segment(a, b, radius, fill_color, fill_color)
-
-    def draw_dot(self, size: float, pos: Tuple[float, float], color: SpaceDebugColor) -> None:
-        p = to_pygame(pos, self.surface)
-        pygame.draw.circle(self.surface, color.as_int(), p, round(size), 0)
-
-
-def get_mouse_pos(surface: pygame.Surface) -> Tuple[int, int]:
-    """Get position of the mouse pointer in pymunk coordinates."""
-    p = pygame.mouse.get_pos()
-    return from_pygame(p, surface)
-
-
-def to_pygame(p: Tuple[float, float], surface: pygame.Surface) -> Tuple[int, int]:
-    """Convenience method to convert pymunk coordinates to pygame surface
-    local coordinates.
-
-    Note that in case positive_y_is_up is False, this function won't actually do
-    anything except converting the point to integers.
-    """
-    if positive_y_is_up:
-        return round(p[0]), surface.get_height() - round(p[1])
-    else:
-        return round(p[0]), round(p[1])
-
-
-def from_pygame(p: Tuple[float, float], surface: pygame.Surface) -> Tuple[int, int]:
-    """Convenience method to convert pygame surface local coordinates to
-    pymunk coordinates
-    """
-    return to_pygame(p, surface)
-
-
-def light_color(color: SpaceDebugColor):
-    color = np.minimum(1.2 * np.float32([color.r, color.g, color.b, color.a]), np.float32([255]))
-    color = SpaceDebugColor(r=color[0], g=color[1], b=color[2], a=color[3])
-    return color
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pymunk_util.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pymunk_util.py
deleted file mode 100644
index 9fb2b5d6e..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pymunk_util.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import numpy as np
-import pygame
-import pymunk
-import pymunk.pygame_util
-
-COLLTYPE_DEFAULT = 0
-COLLTYPE_MOUSE = 1
-COLLTYPE_BALL = 2
-
-
-def get_body_type(static=False):
-    body_type = pymunk.Body.DYNAMIC
-    if static:
-        body_type = pymunk.Body.STATIC
-    return body_type
-
-
-def create_rectangle(space, pos_x, pos_y, width, height, density=3, static=False):
-    body = pymunk.Body(body_type=get_body_type(static))
-    body.position = (pos_x, pos_y)
-    shape = pymunk.Poly.create_box(body, (width, height))
-    shape.density = density
-    space.add(body, shape)
-    return body, shape
-
-
-def create_rectangle_bb(space, left, bottom, right, top, **kwargs):
-    pos_x = (left + right) / 2
-    pos_y = (top + bottom) / 2
-    height = top - bottom
-    width = right - left
-    return create_rectangle(space, pos_x, pos_y, width, height, **kwargs)
-
-
-def create_circle(space, pos_x, pos_y, radius, density=3, static=False):
-    body = pymunk.Body(body_type=get_body_type(static))
-    body.position = (pos_x, pos_y)
-    shape = pymunk.Circle(body, radius=radius)
-    shape.density = density
-    shape.collision_type = COLLTYPE_BALL
-    space.add(body, shape)
-    return body, shape
-
-
-def get_body_state(body):
-    state = np.zeros(6, dtype=np.float32)
-    state[:2] = body.position
-    state[2] = body.angle
-    state[3:5] = body.velocity
-    state[5] = body.angular_velocity
-    return state
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pytorch_util.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pytorch_util.py
deleted file mode 100644
index fc64fa564..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/pytorch_util.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import collections
-from typing import Callable, Dict, List
-
-import torch
-import torch.nn as nn
-
-
-def dict_apply(x: Dict[str, torch.Tensor], func: Callable[[torch.Tensor], torch.Tensor]) -> Dict[str, torch.Tensor]:
-    result = dict()
-    for key, value in x.items():
-        if isinstance(value, dict):
-            result[key] = dict_apply(value, func)
-        else:
-            result[key] = func(value)
-    return result
-
-
-def pad_remaining_dims(x, target):
-    assert x.shape == target.shape[: len(x.shape)]
-    return x.reshape(x.shape + (1,) * (len(target.shape) - len(x.shape)))
-
-
-def dict_apply_split(
-    x: Dict[str, torch.Tensor],
-    split_func: Callable[[torch.Tensor], Dict[str, torch.Tensor]],
-) -> Dict[str, torch.Tensor]:
-    results = collections.defaultdict(dict)
-    for key, value in x.items():
-        result = split_func(value)
-        for k, v in result.items():
-            results[k][key] = v
-    return results
-
-
-def dict_apply_reduce(
-    x: List[Dict[str, torch.Tensor]],
-    reduce_func: Callable[[List[torch.Tensor]], torch.Tensor],
-) -> Dict[str, torch.Tensor]:
-    result = dict()
-    for key in x[0].keys():
-        result[key] = reduce_func([x_[key] for x_ in x])
-    return result
-
-
-def replace_submodules(
-    root_module: nn.Module,
-    predicate: Callable[[nn.Module], bool],
-    func: Callable[[nn.Module], nn.Module],
-) -> nn.Module:
-    """
-    predicate: Return true if the module is to be replaced.
-    func: Return new module to use.
-    """
-    if predicate(root_module):
-        return func(root_module)
-
-    bn_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)]
-    for *parent, k in bn_list:
-        parent_module = root_module
-        if len(parent) > 0:
-            parent_module = root_module.get_submodule(".".join(parent))
-        if isinstance(parent_module, nn.Sequential):
-            src_module = parent_module[int(k)]
-        else:
-            src_module = getattr(parent_module, k)
-        tgt_module = func(src_module)
-        if isinstance(parent_module, nn.Sequential):
-            parent_module[int(k)] = tgt_module
-        else:
-            setattr(parent_module, k, tgt_module)
-    # verify that all BN are replaced
-    bn_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)]
-    assert len(bn_list) == 0
-    return root_module
-
-
-def optimizer_to(optimizer, device):
-    for state in optimizer.state.values():
-        for k, v in state.items():
-            if isinstance(v, torch.Tensor):
-                state[k] = v.to(device=device)
-    return optimizer
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/robomimic_config_util.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/robomimic_config_util.py
deleted file mode 100644
index b992b15aa..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/robomimic_config_util.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import robomimic.scripts.generate_paper_configs as gpc
-from omegaconf import OmegaConf
-from robomimic.config import config_factory
-from robomimic.scripts.generate_paper_configs import (
-    modify_config_for_dataset,
-    modify_config_for_default_image_exp,
-    modify_config_for_default_low_dim_exp,
-)
-
-
-def get_robomimic_config(algo_name="bc_rnn", hdf5_type="low_dim", task_name="square", dataset_type="ph"):
-    base_dataset_dir = "/tmp/null"
-    filter_key = None
-
-    # decide whether to use low-dim or image training defaults
-    modifier_for_obs = modify_config_for_default_image_exp
-    if hdf5_type in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
-        modifier_for_obs = modify_config_for_default_low_dim_exp
-
-    algo_config_name = "bc" if algo_name == "bc_rnn" else algo_name
-    config = config_factory(algo_name=algo_config_name)
-    # turn into default config for observation modalities (e.g.: low-dim or rgb)
-    config = modifier_for_obs(config)
-    # add in config based on the dataset
-    config = modify_config_for_dataset(
-        config=config,
-        task_name=task_name,
-        dataset_type=dataset_type,
-        hdf5_type=hdf5_type,
-        base_dataset_dir=base_dataset_dir,
-        filter_key=filter_key,
-    )
-    # add in algo hypers based on dataset
-    algo_config_modifier = getattr(gpc, f"modify_{algo_name}_config_for_dataset")
-    config = algo_config_modifier(
-        config=config,
-        task_name=task_name,
-        dataset_type=dataset_type,
-        hdf5_type=hdf5_type,
-    )
-    return config
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/robomimic_util.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/robomimic_util.py
deleted file mode 100644
index 652afb8bf..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/robomimic_util.py
+++ /dev/null
@@ -1,169 +0,0 @@
-import copy
-
-import h5py
-import numpy as np
-import robomimic.utils.env_utils as EnvUtils
-import robomimic.utils.file_utils as FileUtils
-import robomimic.utils.obs_utils as ObsUtils
-from robomimic.config import config_factory
-from scipy.spatial.transform import Rotation
-
-
-class RobomimicAbsoluteActionConverter:
-    def __init__(self, dataset_path, algo_name="bc"):
-        # default BC config
-        config = config_factory(algo_name=algo_name)
-
-        # read config to set up metadata for observation modalities (e.g. detecting rgb observations)
-        # must ran before create dataset
-        ObsUtils.initialize_obs_utils_with_config(config)
-
-        env_meta = FileUtils.get_env_metadata_from_dataset(dataset_path)
-        abs_env_meta = copy.deepcopy(env_meta)
-        abs_env_meta["env_kwargs"]["controller_configs"]["control_delta"] = False
-
-        env = EnvUtils.create_env_from_metadata(
-            env_meta=env_meta,
-            render=False,
-            render_offscreen=False,
-            use_image_obs=False,
-        )
-        assert len(env.env.robots) in (1, 2)
-
-        abs_env = EnvUtils.create_env_from_metadata(
-            env_meta=abs_env_meta,
-            render=False,
-            render_offscreen=False,
-            use_image_obs=False,
-        )
-        assert not abs_env.env.robots[0].controller.use_delta
-
-        self.env = env
-        self.abs_env = abs_env
-        self.file = h5py.File(dataset_path, "r")
-
-    def __len__(self):
-        return len(self.file["data"])
-
-    def convert_actions(self, states: np.ndarray, actions: np.ndarray) -> np.ndarray:
-        """
-        Given state and delta action sequence
-        generate equivalent goal position and orientation for each step
-        keep the original gripper action intact.
-        """
-        # in case of multi robot
-        # reshape (N,14) to (N,2,7)
-        # or (N,7) to (N,1,7)
-        stacked_actions = actions.reshape(*actions.shape[:-1], -1, 7)
-
-        env = self.env
-        # generate abs actions
-        action_goal_pos = np.zeros(stacked_actions.shape[:-1] + (3,), dtype=stacked_actions.dtype)
-        action_goal_ori = np.zeros(stacked_actions.shape[:-1] + (3,), dtype=stacked_actions.dtype)
-        action_gripper = stacked_actions[..., [-1]]
-        for i in range(len(states)):
-            _ = env.reset_to({"states": states[i]})
-
-            # taken from robot_env.py L#454
-            for idx, robot in enumerate(env.env.robots):
-                # run controller goal generator
-                robot.control(stacked_actions[i, idx], policy_step=True)
-
-                # read pos and ori from robots
-                controller = robot.controller
-                action_goal_pos[i, idx] = controller.goal_pos
-                action_goal_ori[i, idx] = Rotation.from_matrix(controller.goal_ori).as_rotvec()
-
-        stacked_abs_actions = np.concatenate([action_goal_pos, action_goal_ori, action_gripper], axis=-1)
-        abs_actions = stacked_abs_actions.reshape(actions.shape)
-        return abs_actions
-
-    def convert_idx(self, idx):
-        file = self.file
-        demo = file[f"data/demo_{idx}"]
-        # input
-        states = demo["states"][:]
-        actions = demo["actions"][:]
-
-        # generate abs actions
-        abs_actions = self.convert_actions(states, actions)
-        return abs_actions
-
-    def convert_and_eval_idx(self, idx):
-        env = self.env
-        abs_env = self.abs_env
-        file = self.file
-        # first step have high error for some reason, not representative
-        eval_skip_steps = 1
-
-        demo = file[f"data/demo_{idx}"]
-        # input
-        states = demo["states"][:]
-        actions = demo["actions"][:]
-
-        # generate abs actions
-        abs_actions = self.convert_actions(states, actions)
-
-        # verify
-        robot0_eef_pos = demo["obs"]["robot0_eef_pos"][:]
-        robot0_eef_quat = demo["obs"]["robot0_eef_quat"][:]
-
-        delta_error_info = self.evaluate_rollout_error(
-            env,
-            states,
-            actions,
-            robot0_eef_pos,
-            robot0_eef_quat,
-            metric_skip_steps=eval_skip_steps,
-        )
-        abs_error_info = self.evaluate_rollout_error(
-            abs_env,
-            states,
-            abs_actions,
-            robot0_eef_pos,
-            robot0_eef_quat,
-            metric_skip_steps=eval_skip_steps,
-        )
-
-        info = {"delta_max_error": delta_error_info, "abs_max_error": abs_error_info}
-        return abs_actions, info
-
-    @staticmethod
-    def evaluate_rollout_error(env, states, actions, robot0_eef_pos, robot0_eef_quat, metric_skip_steps=1):
-        # first step have high error for some reason, not representative
-
-        # evaluate abs actions
-        rollout_next_states = list()
-        rollout_next_eef_pos = list()
-        rollout_next_eef_quat = list()
-        obs = env.reset_to({"states": states[0]})
-        for i in range(len(states)):
-            obs = env.reset_to({"states": states[i]})
-            obs, reward, done, info = env.step(actions[i])
-            obs = env.get_observation()
-            rollout_next_states.append(env.get_state()["states"])
-            rollout_next_eef_pos.append(obs["robot0_eef_pos"])
-            rollout_next_eef_quat.append(obs["robot0_eef_quat"])
-        rollout_next_states = np.array(rollout_next_states)
-        rollout_next_eef_pos = np.array(rollout_next_eef_pos)
-        rollout_next_eef_quat = np.array(rollout_next_eef_quat)
-
-        next_state_diff = states[1:] - rollout_next_states[:-1]
-        max_next_state_diff = np.max(np.abs(next_state_diff[metric_skip_steps:]))
-
-        next_eef_pos_diff = robot0_eef_pos[1:] - rollout_next_eef_pos[:-1]
-        next_eef_pos_dist = np.linalg.norm(next_eef_pos_diff, axis=-1)
-        max_next_eef_pos_dist = next_eef_pos_dist[metric_skip_steps:].max()
-
-        next_eef_rot_diff = (
-            Rotation.from_quat(robot0_eef_quat[1:]) * Rotation.from_quat(rollout_next_eef_quat[:-1]).inv()
-        )
-        next_eef_rot_dist = next_eef_rot_diff.magnitude()
-        max_next_eef_rot_dist = next_eef_rot_dist[metric_skip_steps:].max()
-
-        info = {
-            "state": max_next_state_diff,
-            "pos": max_next_eef_pos_dist,
-            "rot": max_next_eef_rot_dist,
-        }
-        return info
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/timestamp_accumulator.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/timestamp_accumulator.py
deleted file mode 100644
index 7934c71d3..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/timestamp_accumulator.py
+++ /dev/null
@@ -1,219 +0,0 @@
-import math
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-
-
-def get_accumulate_timestamp_idxs(
-    timestamps: List[float],
-    start_time: float,
-    dt: float,
-    eps: float = 1e-5,
-    next_global_idx: Optional[int] = 0,
-    allow_negative=False,
-) -> Tuple[List[int], List[int], int]:
-    """
-    For each dt window, choose the first timestamp in the window.
-    Assumes timestamps sorted. One timestamp might be chosen multiple times due to dropped frames.
-    next_global_idx should start at 0 normally, and then use the returned next_global_idx.
-    However, when overwiting previous values are desired, set last_global_idx to None.
-
-    Returns:
-    local_idxs: which index in the given timestamps array to chose from
-    global_idxs: the global index of each chosen timestamp
-    next_global_idx: used for next call.
-    """
-    local_idxs = list()
-    global_idxs = list()
-    for local_idx, ts in enumerate(timestamps):
-        # add eps * dt to timestamps so that when ts == start_time + k * dt
-        # is always recorded as kth element (avoiding floating point errors)
-        global_idx = math.floor((ts - start_time) / dt + eps)
-        if (not allow_negative) and (global_idx < 0):
-            continue
-        if next_global_idx is None:
-            next_global_idx = global_idx
-
-        n_repeats = max(0, global_idx - next_global_idx + 1)
-        for i in range(n_repeats):
-            local_idxs.append(local_idx)
-            global_idxs.append(next_global_idx + i)
-        next_global_idx += n_repeats
-    return local_idxs, global_idxs, next_global_idx
-
-
-def align_timestamps(
-    timestamps: List[float],
-    target_global_idxs: List[int],
-    start_time: float,
-    dt: float,
-    eps: float = 1e-5,
-):
-    if isinstance(target_global_idxs, np.ndarray):
-        target_global_idxs = target_global_idxs.tolist()
-    assert len(target_global_idxs) > 0
-
-    local_idxs, global_idxs, _ = get_accumulate_timestamp_idxs(
-        timestamps=timestamps,
-        start_time=start_time,
-        dt=dt,
-        eps=eps,
-        next_global_idx=target_global_idxs[0],
-        allow_negative=True,
-    )
-    if len(global_idxs) > len(target_global_idxs):
-        # if more steps available, truncate
-        global_idxs = global_idxs[: len(target_global_idxs)]
-        local_idxs = local_idxs[: len(target_global_idxs)]
-
-    if len(global_idxs) == 0:
-        import pdb
-
-        pdb.set_trace()
-
-    for i in range(len(target_global_idxs) - len(global_idxs)):
-        # if missing, repeat
-        local_idxs.append(len(timestamps) - 1)
-        global_idxs.append(global_idxs[-1] + 1)
-    assert global_idxs == target_global_idxs
-    assert len(local_idxs) == len(global_idxs)
-    return local_idxs
-
-
-class TimestampObsAccumulator:
-    def __init__(self, start_time: float, dt: float, eps: float = 1e-5):
-        self.start_time = start_time
-        self.dt = dt
-        self.eps = eps
-        self.obs_buffer = dict()
-        self.timestamp_buffer = None
-        self.next_global_idx = 0
-
-    def __len__(self):
-        return self.next_global_idx
-
-    @property
-    def data(self):
-        if self.timestamp_buffer is None:
-            return dict()
-        result = dict()
-        for key, value in self.obs_buffer.items():
-            result[key] = value[: len(self)]
-        return result
-
-    @property
-    def actual_timestamps(self):
-        if self.timestamp_buffer is None:
-            return np.array([])
-        return self.timestamp_buffer[: len(self)]
-
-    @property
-    def timestamps(self):
-        if self.timestamp_buffer is None:
-            return np.array([])
-        return self.start_time + np.arange(len(self)) * self.dt
-
-    def put(self, data: Dict[str, np.ndarray], timestamps: np.ndarray):
-        """
-        data:
-            key: T,*
-        """
-
-        local_idxs, global_idxs, self.next_global_idx = get_accumulate_timestamp_idxs(
-            timestamps=timestamps,
-            start_time=self.start_time,
-            dt=self.dt,
-            eps=self.eps,
-            next_global_idx=self.next_global_idx,
-        )
-
-        if len(global_idxs) > 0:
-            if self.timestamp_buffer is None:
-                # first allocation
-                self.obs_buffer = dict()
-                for key, value in data.items():
-                    self.obs_buffer[key] = np.zeros_like(value)
-                self.timestamp_buffer = np.zeros((len(timestamps),), dtype=np.float64)
-
-            this_max_size = global_idxs[-1] + 1
-            if this_max_size > len(self.timestamp_buffer):
-                # reallocate
-                new_size = max(this_max_size, len(self.timestamp_buffer) * 2)
-                for key in list(self.obs_buffer.keys()):
-                    new_shape = (new_size,) + self.obs_buffer[key].shape[1:]
-                    self.obs_buffer[key] = np.resize(self.obs_buffer[key], new_shape)
-                self.timestamp_buffer = np.resize(self.timestamp_buffer, (new_size))
-
-            # write data
-            for key, value in self.obs_buffer.items():
-                value[global_idxs] = data[key][local_idxs]
-            self.timestamp_buffer[global_idxs] = timestamps[local_idxs]
-
-
-class TimestampActionAccumulator:
-    def __init__(self, start_time: float, dt: float, eps: float = 1e-5):
-        """
-        Different from Obs accumulator, the action accumulator
-        allows overwriting previous values.
-        """
-        self.start_time = start_time
-        self.dt = dt
-        self.eps = eps
-        self.action_buffer = None
-        self.timestamp_buffer = None
-        self.size = 0
-
-    def __len__(self):
-        return self.size
-
-    @property
-    def actions(self):
-        if self.action_buffer is None:
-            return np.array([])
-        return self.action_buffer[: len(self)]
-
-    @property
-    def actual_timestamps(self):
-        if self.timestamp_buffer is None:
-            return np.array([])
-        return self.timestamp_buffer[: len(self)]
-
-    @property
-    def timestamps(self):
-        if self.timestamp_buffer is None:
-            return np.array([])
-        return self.start_time + np.arange(len(self)) * self.dt
-
-    def put(self, actions: np.ndarray, timestamps: np.ndarray):
-        """
-        Note: timestamps is the time when the action will be issued,
-        not when the action will be completed (target_timestamp)
-        """
-
-        local_idxs, global_idxs, _ = get_accumulate_timestamp_idxs(
-            timestamps=timestamps,
-            start_time=self.start_time,
-            dt=self.dt,
-            eps=self.eps,
-            # allows overwriting previous actions
-            next_global_idx=None,
-        )
-
-        if len(global_idxs) > 0:
-            if self.timestamp_buffer is None:
-                # first allocation
-                self.action_buffer = np.zeros_like(actions)
-                self.timestamp_buffer = np.zeros((len(actions),), dtype=np.float64)
-
-            this_max_size = global_idxs[-1] + 1
-            if this_max_size > len(self.timestamp_buffer):
-                # reallocate
-                new_size = max(this_max_size, len(self.timestamp_buffer) * 2)
-                new_shape = (new_size,) + self.action_buffer.shape[1:]
-                self.action_buffer = np.resize(self.action_buffer, new_shape)
-                self.timestamp_buffer = np.resize(self.timestamp_buffer, (new_size,))
-
-            # potentially rewrite old data (as expected)
-            self.action_buffer[global_idxs] = actions[local_idxs]
-            self.timestamp_buffer[global_idxs] = timestamps[local_idxs]
-            self.size = max(self.size, this_max_size)
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/config/robot_dp.yaml b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/config/robot_dp.yaml
deleted file mode 100644
index 11aa2e5ca..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/config/robot_dp.yaml
+++ /dev/null
@@ -1,164 +0,0 @@
-defaults:
-  - _self_
-  - task: default_task
-
-name: robot_${task.name}
-_target_: diffusion_policy.workspace.robotworkspace.RobotWorkspace
-
-task_name: ${task.name}
-shape_meta: ${task.shape_meta}
-exp_name: "default"
-
-horizon: 8
-n_obs_steps: 3
-n_action_steps: 4
-n_latency_steps: 0
-dataset_obs_steps: ${n_obs_steps}
-past_action_visible: False
-keypoint_visible_rate: 1.0
-obs_as_global_cond: True
-
-policy_runner:
-  action:
-    action_type: "ee"
-    ee_cfg:
-      rotation_rep: "quaternion"
-      gripper_rep: "q_pos"
-    delta: False
-  obs:
-    obs_type: "ee"
-    ee_cfg:
-      rotation_rep: "quaternion"
-      gripper_rep: "q_pos"
-policy:
-  _target_: diffusion_policy.policy.diffusion_unet_image_policy.DiffusionUnetImagePolicy
-
-  shape_meta: ${shape_meta}
-
-  noise_scheduler:
-    _target_: diffusers.schedulers.scheduling_ddpm.DDPMScheduler
-    num_train_timesteps: 100
-    beta_start: 0.0001
-    beta_end: 0.02
-    beta_schedule: squaredcos_cap_v2
-    variance_type: fixed_small # Yilun's paper uses fixed_small_log instead, but easy to cause Nan
-    clip_sample: True # required when predict_epsilon=False
-    prediction_type: epsilon # or sample
-
-  obs_encoder:
-    _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder
-    shape_meta: ${shape_meta}
-    rgb_model:
-      _target_: diffusion_policy.model.vision.model_getter.get_resnet
-      name: resnet18
-      weights: null
-    resize_shape: null
-    crop_shape: null
-    # constant center crop
-    random_crop: True
-    use_group_norm: True
-    share_rgb_model: False
-    imagenet_norm: True
-
-  horizon: ${horizon}
-  n_action_steps: ${eval:'${n_action_steps}+${n_latency_steps}'}
-  n_obs_steps: ${n_obs_steps}
-  num_inference_steps: 100
-  obs_as_global_cond: ${obs_as_global_cond}
-  # crop_shape: null
-  diffusion_step_embed_dim: 128
-  # down_dims: [512, 1024, 2048]
-  down_dims: [256, 512, 1024]
-  kernel_size: 5
-  n_groups: 8
-  cond_predict_scale: True
-
-  # scheduler.step params
-  # predict_epsilon: True
-
-ema:
-  _target_: diffusion_policy.model.diffusion.ema_model.EMAModel
-  update_after_step: 0
-  inv_gamma: 1.0
-  power: 0.75
-  min_value: 0.0
-  max_value: 0.9999
-
-dataloader:
-  batch_size: 32
-  num_workers: 0
-  shuffle: True
-  pin_memory: True
-  persistent_workers: False
-
-val_dataloader:
-  batch_size: 32
-  num_workers: 0
-  shuffle: False
-  pin_memory: True
-  persistent_workers: False
-
-optimizer:
-  _target_: torch.optim.AdamW
-  lr: 1.0e-4
-  betas: [0.95, 0.999]
-  eps: 1.0e-8
-  weight_decay: 1.0e-6
-
-training:
-  device: "cuda:0"
-  seed: 42
-  debug: False
-  resume: True
-  # optimization
-  lr_scheduler: cosine
-  lr_warmup_steps: 500
-  num_epochs: 1000
-  gradient_accumulate_every: 1
-  # EMA destroys performance when used with BatchNorm
-  # replace BatchNorm with GroupNorm.
-  use_ema: True
-  freeze_encoder: False
-  # training loop control
-  # in epochs
-  rollout_every: 50
-  checkpoint_every: 50
-  val_every: 5
-  sample_every: 5
-  # steps per epoch
-  max_train_steps: 250
-  max_val_steps: 250
-  # misc
-  tqdm_interval_sec: 1.0
-
-logging:
-  project: RoboVerse_DP
-  resume: True
-  mode: online
-  name: ${now:%Y.%m.%d-%H.%M.%S}_${task_name}
-  tags: ${now:%Y.%m.%d-%H.%M.%S}_${task_name}
-  id: null
-  group: null
-
-checkpoint:
-  topk:
-    monitor_key: test_mean_score
-    mode: max
-    k: 5
-    format_str: 'epoch={epoch:04d}-test_mean_score={test_mean_score:.3f}.ckpt'
-  save_last_ckpt: True
-  save_last_snapshot: False
-  save_root_dir: info/outputs/DP/${now:%Y.%m.%d}/${now:%H.%M.%S}_${task_name}
-
-multi_run:
-  run_dir: info/outputs/DP/${now:%Y.%m.%d}/${now:%H.%M.%S}_${task_name}
-  wandb_name_base: ${now:%Y.%m.%d-%H.%M.%S}_${task_name}
-
-hydra:
-  job:
-    override_dirname: ${name}
-  run:
-    dir: info/outputs/DP/${now:%Y.%m.%d}/${now:%H.%M.%S}_${task_name}
-  sweep:
-    dir: info/outputs/DP/${now:%Y.%m.%d}/${now:%H.%M.%S}_${task_name}
-    subdir: ${hydra.job.num}
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/config/task/default_task.yaml b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/config/task/default_task.yaml
deleted file mode 100644
index 094cd81f7..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/config/task/default_task.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-name: task_config
-
-image_shape: &image_shape [3, 256, 256]
-shape_meta: &shape_meta
-  # acceptable types: rgb, low_dim
-  obs:
-    head_cam:
-      shape: *image_shape
-      type: rgb
-    agent_pos:
-      shape: [9]
-      type: low_dim
-  action:
-    shape: [9]
-
-env_runner:
-  _target_: diffusion_policy.env_runner.pusht_image_runner.PushTImageRunner
-  n_train: 6
-  n_train_vis: 2
-  train_start_seed: 0
-  n_test: 50
-  n_test_vis: 4
-  legacy_test: True
-  test_start_seed: 100000
-  max_steps: 300
-  n_obs_steps: ${n_obs_steps}
-  n_action_steps: ${n_action_steps}
-  fps: 10
-  past_action: ${past_action_visible}
-  n_envs: null
-
-dataset:
-  _target_: diffusion_policy.dataset.robot_image_dataset.RobotImageDataset
-  zarr_path: diffusion_policy/data/useless.zarr
-  horizon: ${horizon}
-  pad_before: ${eval:'${n_obs_steps}-1'}
-  pad_after: ${eval:'${n_action_steps}-1'}
-  seed: 42
-  val_ratio: 0.02
-  batch_size: 32
-  max_train_episodes: null
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/dataset/robot_image_dataset.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/dataset/robot_image_dataset.py
deleted file mode 100644
index d0de35ba2..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/dataset/robot_image_dataset.py
+++ /dev/null
@@ -1,177 +0,0 @@
-import copy
-from typing import Dict
-
-import numba
-import numpy as np
-import torch
-from diffusion_policy.common.normalize_util import get_image_range_normalizer
-from diffusion_policy.common.pytorch_util import dict_apply
-from diffusion_policy.common.replay_buffer import ReplayBuffer
-from diffusion_policy.common.sampler import (
-    SequenceSampler,
-    downsample_mask,
-    get_val_mask,
-)
-from diffusion_policy.dataset.base_dataset import BaseImageDataset
-from diffusion_policy.model.common.normalizer import LinearNormalizer
-from termcolor import cprint
-
-
-class RobotImageDataset(BaseImageDataset):
-    def __init__(
-        self,
-        zarr_path,
-        horizon=1,
-        pad_before=0,
-        pad_after=0,
-        seed=42,
-        val_ratio=0.0,
-        batch_size=64,
-        max_train_episodes=None,
-    ):
-
-        super().__init__()
-        # cprint(zarr_path, "red")
-        # cprint(batch_size, "red")
-        self.replay_buffer = ReplayBuffer.copy_from_path(
-            zarr_path,
-            # keys=['head_camera', 'front_camera', 'left_camera', 'right_camera', 'state', 'action'],
-            keys=["head_camera", "state", "action"],
-        )
-
-        val_mask = get_val_mask(n_episodes=self.replay_buffer.n_episodes, val_ratio=val_ratio, seed=seed)
-        train_mask = ~val_mask
-        train_mask = downsample_mask(mask=train_mask, max_n=max_train_episodes, seed=seed)
-
-        self.sampler = SequenceSampler(
-            replay_buffer=self.replay_buffer,
-            sequence_length=horizon,
-            pad_before=pad_before,
-            pad_after=pad_after,
-            episode_mask=train_mask,
-        )
-        self.train_mask = train_mask
-        self.horizon = horizon
-        self.pad_before = pad_before
-        self.pad_after = pad_after
-
-        self.batch_size = batch_size
-        sequence_length = self.sampler.sequence_length
-        self.buffers = {
-            k: np.zeros((batch_size, sequence_length, *v.shape[1:]), dtype=v.dtype)
-            for k, v in self.sampler.replay_buffer.items()
-        }
-        self.buffers_torch = {k: torch.from_numpy(v) for k, v in self.buffers.items()}
-        for v in self.buffers_torch.values():
-            v.pin_memory()
-
-    def get_validation_dataset(self):
-        val_set = copy.copy(self)
-        val_set.sampler = SequenceSampler(
-            replay_buffer=self.replay_buffer,
-            sequence_length=self.horizon,
-            pad_before=self.pad_before,
-            pad_after=self.pad_after,
-            episode_mask=~self.train_mask,
-        )
-        val_set.train_mask = ~self.train_mask
-        return val_set
-
-    def get_normalizer(self, mode="limits", **kwargs):
-        data = {
-            "action": self.replay_buffer["action"],
-            "agent_pos": self.replay_buffer["state"],
-        }
-        normalizer = LinearNormalizer()
-        normalizer.fit(data=data, last_n_dims=1, mode=mode, **kwargs)
-        normalizer["head_cam"] = get_image_range_normalizer()
-        normalizer["front_cam"] = get_image_range_normalizer()
-        normalizer["left_cam"] = get_image_range_normalizer()
-        normalizer["right_cam"] = get_image_range_normalizer()
-        return normalizer
-
-    def __len__(self) -> int:
-        return len(self.sampler)
-
-    def _sample_to_data(self, sample):
-        agent_pos = sample["state"].astype(np.float32)  # (agent_posx2, block_posex3)
-        head_cam = np.moveaxis(sample["head_camera"], -1, 1) / 255.0
-
-        data = {
-            "obs": {
-                "head_cam": head_cam,  # T, 3, H, W
-                "agent_pos": agent_pos,  # T, D
-            },
-            "action": sample["action"].astype(np.float32),  # T, D
-        }
-        return data
-
-    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
-        if isinstance(idx, slice):
-            raise NotImplementedError  # Specialized
-        elif isinstance(idx, int):
-            sample = self.sampler.sample_sequence(idx)
-            sample = dict_apply(sample, torch.from_numpy)
-            return sample
-        elif isinstance(idx, np.ndarray):
-            # print(idx, len(idx))
-            # print(self.batch_size)
-            assert len(idx) == self.batch_size
-            for k, v in self.sampler.replay_buffer.items():
-                batch_sample_sequence(
-                    self.buffers[k],
-                    v,
-                    self.sampler.indices,
-                    idx,
-                    self.sampler.sequence_length,
-                )
-            return self.buffers_torch
-        else:
-            raise ValueError(idx)
-
-    def postprocess(self, samples, device):
-        agent_pos = samples["state"].to(device, non_blocking=True)
-        head_cam = samples["head_camera"].to(device, non_blocking=True) / 255.0
-        action = samples["action"].to(device, non_blocking=True)
-        return {
-            "obs": {
-                "head_cam": head_cam,  # B, T, 3, H, W
-                "agent_pos": agent_pos,  # B, T, D
-            },
-            "action": action,  # B, T, D
-        }
-
-
-def _batch_sample_sequence(
-    data: np.ndarray,
-    input_arr: np.ndarray,
-    indices: np.ndarray,
-    idx: np.ndarray,
-    sequence_length: int,
-):
-    for i in numba.prange(len(idx)):
-        buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx = indices[idx[i]]
-        data[i, sample_start_idx:sample_end_idx] = input_arr[buffer_start_idx:buffer_end_idx]
-        if sample_start_idx > 0:
-            data[i, :sample_start_idx] = data[i, sample_start_idx]
-        if sample_end_idx < sequence_length:
-            data[i, sample_end_idx:] = data[i, sample_end_idx - 1]
-
-
-_batch_sample_sequence_sequential = numba.jit(_batch_sample_sequence, nopython=True, parallel=False)
-_batch_sample_sequence_parallel = numba.jit(_batch_sample_sequence, nopython=True, parallel=True)
-
-
-def batch_sample_sequence(
-    data: np.ndarray,
-    input_arr: np.ndarray,
-    indices: np.ndarray,
-    idx: np.ndarray,
-    sequence_length: int,
-):
-    batch_size = len(idx)
-    assert data.shape == (batch_size, sequence_length, *input_arr.shape[1:])
-    if batch_size >= 16 and data.nbytes // batch_size >= 2**16:
-        _batch_sample_sequence_parallel(data, input_arr, indices, idx, sequence_length)
-    else:
-        _batch_sample_sequence_sequential(data, input_arr, indices, idx, sequence_length)
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/dict_of_tensor_mixin.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/dict_of_tensor_mixin.py
deleted file mode 100644
index 2ba358d2b..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/dict_of_tensor_mixin.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import torch
-import torch.nn as nn
-
-
-class DictOfTensorMixin(nn.Module):
-    def __init__(self, params_dict=None):
-        super().__init__()
-        if params_dict is None:
-            params_dict = nn.ParameterDict()
-        self.params_dict = params_dict
-
-    @property
-    def device(self):
-        return next(iter(self.parameters())).device
-
-    def _load_from_state_dict(
-        self,
-        state_dict,
-        prefix,
-        local_metadata,
-        strict,
-        missing_keys,
-        unexpected_keys,
-        error_msgs,
-    ):
-        def dfs_add(dest, keys, value: torch.Tensor):
-            if len(keys) == 1:
-                dest[keys[0]] = value
-                return
-
-            if keys[0] not in dest:
-                dest[keys[0]] = nn.ParameterDict()
-            dfs_add(dest[keys[0]], keys[1:], value)
-
-        def load_dict(state_dict, prefix):
-            out_dict = nn.ParameterDict()
-            for key, value in state_dict.items():
-                value: torch.Tensor
-                if key.startswith(prefix):
-                    param_keys = key[len(prefix) :].split(".")[1:]
-                    dfs_add(out_dict, param_keys, value.clone())
-            return out_dict
-
-        self.params_dict = load_dict(state_dict, prefix + "params_dict")
-        self.params_dict.requires_grad_(False)
-        return
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/lr_scheduler.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/lr_scheduler.py
deleted file mode 100644
index 1c653b3ba..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/lr_scheduler.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from diffusers.optimization import (
-    TYPE_TO_SCHEDULER_FUNCTION,
-    Optimizer,
-    Optional,
-    SchedulerType,
-    Union,
-)
-
-
-def get_scheduler(
-    name: Union[str, SchedulerType],
-    optimizer: Optimizer,
-    num_warmup_steps: Optional[int] = None,
-    num_training_steps: Optional[int] = None,
-    **kwargs,
-):
-    """
-    Added kwargs vs diffuser's original implementation
-
-    Unified API to get any scheduler from its name.
-
-    Args:
-        name (`str` or `SchedulerType`):
-            The name of the scheduler to use.
-        optimizer (`torch.optim.Optimizer`):
-            The optimizer that will be used during training.
-        num_warmup_steps (`int`, *optional*):
-            The number of warmup steps to do. This is not required by all schedulers (hence the argument being
-            optional), the function will raise an error if it's unset and the scheduler type requires it.
-        num_training_steps (`int``, *optional*):
-            The number of training steps to do. This is not required by all schedulers (hence the argument being
-            optional), the function will raise an error if it's unset and the scheduler type requires it.
-    """
-    name = SchedulerType(name)
-    schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
-    if name == SchedulerType.CONSTANT:
-        return schedule_func(optimizer, **kwargs)
-
-    # All other schedulers require `num_warmup_steps`
-    if num_warmup_steps is None:
-        raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
-
-    if name == SchedulerType.CONSTANT_WITH_WARMUP:
-        return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **kwargs)
-
-    # All other schedulers require `num_training_steps`
-    if num_training_steps is None:
-        raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
-
-    return schedule_func(
-        optimizer,
-        num_warmup_steps=num_warmup_steps,
-        num_training_steps=num_training_steps,
-        **kwargs,
-    )
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/module_attr_mixin.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/module_attr_mixin.py
deleted file mode 100644
index 5d2cf4ea9..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/module_attr_mixin.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import torch.nn as nn
-
-
-class ModuleAttrMixin(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self._dummy_variable = nn.Parameter()
-
-    @property
-    def device(self):
-        return next(iter(self.parameters())).device
-
-    @property
-    def dtype(self):
-        return next(iter(self.parameters())).dtype
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/rotation_transformer.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/rotation_transformer.py
deleted file mode 100644
index 24ed1015c..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/rotation_transformer.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import functools
-from typing import Union
-
-import numpy as np
-import pytorch3d.transforms as pt
-import torch
-
-
-class RotationTransformer:
-    valid_reps = ["axis_angle", "euler_angles", "quaternion", "rotation_6d", "matrix"]
-
-    def __init__(
-        self,
-        from_rep="axis_angle",
-        to_rep="rotation_6d",
-        from_convention=None,
-        to_convention=None,
-    ):
-        """
-        Valid representations
-
-        Always use matrix as intermediate representation.
-        """
-        assert from_rep != to_rep
-        assert from_rep in self.valid_reps
-        assert to_rep in self.valid_reps
-        if from_rep == "euler_angles":
-            assert from_convention is not None
-        if to_rep == "euler_angles":
-            assert to_convention is not None
-
-        forward_funcs = list()
-        inverse_funcs = list()
-
-        if from_rep != "matrix":
-            funcs = [
-                getattr(pt, f"{from_rep}_to_matrix"),
-                getattr(pt, f"matrix_to_{from_rep}"),
-            ]
-            if from_convention is not None:
-                funcs = [functools.partial(func, convention=from_convention) for func in funcs]
-            forward_funcs.append(funcs[0])
-            inverse_funcs.append(funcs[1])
-
-        if to_rep != "matrix":
-            funcs = [
-                getattr(pt, f"matrix_to_{to_rep}"),
-                getattr(pt, f"{to_rep}_to_matrix"),
-            ]
-            if to_convention is not None:
-                funcs = [functools.partial(func, convention=to_convention) for func in funcs]
-            forward_funcs.append(funcs[0])
-            inverse_funcs.append(funcs[1])
-
-        inverse_funcs = inverse_funcs[::-1]
-
-        self.forward_funcs = forward_funcs
-        self.inverse_funcs = inverse_funcs
-
-    @staticmethod
-    def _apply_funcs(x: Union[np.ndarray, torch.Tensor], funcs: list) -> Union[np.ndarray, torch.Tensor]:
-        x_ = x
-        if isinstance(x, np.ndarray):
-            x_ = torch.from_numpy(x)
-        x_: torch.Tensor
-        for func in funcs:
-            x_ = func(x_)
-        y = x_
-        if isinstance(x, np.ndarray):
-            y = x_.numpy()
-        return y
-
-    def forward(self, x: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
-        return self._apply_funcs(x, self.forward_funcs)
-
-    def inverse(self, x: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
-        return self._apply_funcs(x, self.inverse_funcs)
-
-
-def test():
-    tf = RotationTransformer()
-
-    rotvec = np.random.uniform(-2 * np.pi, 2 * np.pi, size=(1000, 3))
-    rot6d = tf.forward(rotvec)
-    new_rotvec = tf.inverse(rot6d)
-
-    from scipy.spatial.transform import Rotation
-
-    diff = Rotation.from_rotvec(rotvec) * Rotation.from_rotvec(new_rotvec).inv()
-    dist = diff.magnitude()
-    assert dist.max() < 1e-7
-
-    tf = RotationTransformer("rotation_6d", "matrix")
-    rot6d_wrong = rot6d + np.random.normal(scale=0.1, size=rot6d.shape)
-    mat = tf.forward(rot6d_wrong)
-    mat_det = np.linalg.det(mat)
-    assert np.allclose(mat_det, 1)
-    # rotaiton_6d will be normalized to rotation matrix
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/shape_util.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/shape_util.py
deleted file mode 100644
index eec4b66e0..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/shape_util.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from typing import Callable, Dict, List, Tuple
-
-import torch
-import torch.nn as nn
-
-
-def get_module_device(m: nn.Module):
-    device = torch.device("cpu")
-    try:
-        param = next(iter(m.parameters()))
-        device = param.device
-    except StopIteration:
-        pass
-    return device
-
-
-@torch.no_grad()
-def get_output_shape(input_shape: Tuple[int], net: Callable[[torch.Tensor], torch.Tensor]):
-    device = get_module_device(net)
-    test_input = torch.zeros((1,) + tuple(input_shape), device=device)
-    test_output = net(test_input)
-    output_shape = tuple(test_output.shape[1:])
-    return output_shape
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/tensor_util.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/tensor_util.py
deleted file mode 100644
index 4cb7b9ac8..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/tensor_util.py
+++ /dev/null
@@ -1,973 +0,0 @@
-"""
-A collection of utilities for working with nested tensor structures consisting
-of numpy arrays and torch tensors.
-"""
-
-import collections
-
-import numpy as np
-import torch
-
-
-def recursive_dict_list_tuple_apply(x, type_func_dict):
-    """
-    Recursively apply functions to a nested dictionary or list or tuple, given a dictionary of
-    {data_type: function_to_apply}.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        type_func_dict (dict): a mapping from data types to the functions to be
-            applied for each data type.
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    assert list not in type_func_dict
-    assert tuple not in type_func_dict
-    assert dict not in type_func_dict
-
-    if isinstance(x, (dict, collections.OrderedDict)):
-        new_x = collections.OrderedDict() if isinstance(x, collections.OrderedDict) else dict()
-        for k, v in x.items():
-            new_x[k] = recursive_dict_list_tuple_apply(v, type_func_dict)
-        return new_x
-    elif isinstance(x, (list, tuple)):
-        ret = [recursive_dict_list_tuple_apply(v, type_func_dict) for v in x]
-        if isinstance(x, tuple):
-            ret = tuple(ret)
-        return ret
-    else:
-        for t, f in type_func_dict.items():
-            if isinstance(x, t):
-                return f(x)
-        else:
-            raise NotImplementedError("Cannot handle data type %s" % str(type(x)))
-
-
-def map_tensor(x, func):
-    """
-    Apply function @func to torch.Tensor objects in a nested dictionary or
-    list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        func (function): function to apply to each tensor
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: func,
-            type(None): lambda x: x,
-        },
-    )
-
-
-def map_ndarray(x, func):
-    """
-    Apply function @func to np.ndarray objects in a nested dictionary or
-    list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        func (function): function to apply to each array
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            np.ndarray: func,
-            type(None): lambda x: x,
-        },
-    )
-
-
-def map_tensor_ndarray(x, tensor_func, ndarray_func):
-    """
-    Apply function @tensor_func to torch.Tensor objects and @ndarray_func to
-    np.ndarray objects in a nested dictionary or list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        tensor_func (function): function to apply to each tensor
-        ndarray_Func (function): function to apply to each array
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: tensor_func,
-            np.ndarray: ndarray_func,
-            type(None): lambda x: x,
-        },
-    )
-
-
-def clone(x):
-    """
-    Clones all torch tensors and numpy arrays in nested dictionary or list
-    or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.clone(),
-            np.ndarray: lambda x: x.copy(),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def detach(x):
-    """
-    Detaches all torch tensors in nested dictionary or list
-    or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.detach(),
-        },
-    )
-
-
-def to_batch(x):
-    """
-    Introduces a leading batch dimension of 1 for all torch tensors and numpy
-    arrays in nested dictionary or list or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x[None, ...],
-            np.ndarray: lambda x: x[None, ...],
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_sequence(x):
-    """
-    Introduces a time dimension of 1 at dimension 1 for all torch tensors and numpy
-    arrays in nested dictionary or list or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x[:, None, ...],
-            np.ndarray: lambda x: x[:, None, ...],
-            type(None): lambda x: x,
-        },
-    )
-
-
-def index_at_time(x, ind):
-    """
-    Indexes all torch tensors and numpy arrays in dimension 1 with index @ind in
-    nested dictionary or list or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        ind (int): index
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x[:, ind, ...],
-            np.ndarray: lambda x: x[:, ind, ...],
-            type(None): lambda x: x,
-        },
-    )
-
-
-def unsqueeze(x, dim):
-    """
-    Adds dimension of size 1 at dimension @dim in all torch tensors and numpy arrays
-    in nested dictionary or list or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        dim (int): dimension
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.unsqueeze(dim=dim),
-            np.ndarray: lambda x: np.expand_dims(x, axis=dim),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def contiguous(x):
-    """
-    Makes all torch tensors and numpy arrays contiguous in nested dictionary or
-    list or tuple and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.contiguous(),
-            np.ndarray: lambda x: np.ascontiguousarray(x),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_device(x, device):
-    """
-    Sends all torch tensors in nested dictionary or list or tuple to device
-    @device, and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        device (torch.Device): device to send tensors to
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x, d=device: x.to(d),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_tensor(x):
-    """
-    Converts all numpy arrays in nested dictionary or list or tuple to
-    torch tensors (and leaves existing torch Tensors as-is), and returns
-    a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x,
-            np.ndarray: lambda x: torch.from_numpy(x),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_numpy(x):
-    """
-    Converts all torch tensors in nested dictionary or list or tuple to
-    numpy (and leaves existing numpy arrays as-is), and returns
-    a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-
-    def f(tensor):
-        if tensor.is_cuda:
-            return tensor.detach().cpu().numpy()
-        else:
-            return tensor.detach().numpy()
-
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: f,
-            np.ndarray: lambda x: x,
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_list(x):
-    """
-    Converts all torch tensors and numpy arrays in nested dictionary or list
-    or tuple to a list, and returns a new nested structure. Useful for
-    json encoding.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-
-    def f(tensor):
-        if tensor.is_cuda:
-            return tensor.detach().cpu().numpy().tolist()
-        else:
-            return tensor.detach().numpy().tolist()
-
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: f,
-            np.ndarray: lambda x: x.tolist(),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_float(x):
-    """
-    Converts all torch tensors and numpy arrays in nested dictionary or list
-    or tuple to float type entries, and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.float(),
-            np.ndarray: lambda x: x.astype(np.float32),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_uint8(x):
-    """
-    Converts all torch tensors and numpy arrays in nested dictionary or list
-    or tuple to uint8 type entries, and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.byte(),
-            np.ndarray: lambda x: x.astype(np.uint8),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def to_torch(x, device):
-    """
-    Converts all numpy arrays and torch tensors in nested dictionary or list or tuple to
-    torch tensors on device @device and returns a new nested structure.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        device (torch.Device): device to send tensors to
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return to_device(to_float(to_tensor(x)), device)
-
-
-def to_one_hot_single(tensor, num_class):
-    """
-    Convert tensor to one-hot representation, assuming a certain number of total class labels.
-
-    Args:
-        tensor (torch.Tensor): tensor containing integer labels
-        num_class (int): number of classes
-
-    Returns:
-        x (torch.Tensor): tensor containing one-hot representation of labels
-    """
-    x = torch.zeros(tensor.size() + (num_class,)).to(tensor.device)
-    x.scatter_(-1, tensor.unsqueeze(-1), 1)
-    return x
-
-
-def to_one_hot(tensor, num_class):
-    """
-    Convert all tensors in nested dictionary or list or tuple to one-hot representation,
-    assuming a certain number of total class labels.
-
-    Args:
-        tensor (dict or list or tuple): a possibly nested dictionary or list or tuple
-        num_class (int): number of classes
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return map_tensor(tensor, func=lambda x, nc=num_class: to_one_hot_single(x, nc))
-
-
-def flatten_single(x, begin_axis=1):
-    """
-    Flatten a tensor in all dimensions from @begin_axis onwards.
-
-    Args:
-        x (torch.Tensor): tensor to flatten
-        begin_axis (int): which axis to flatten from
-
-    Returns:
-        y (torch.Tensor): flattened tensor
-    """
-    fixed_size = x.size()[:begin_axis]
-    _s = list(fixed_size) + [-1]
-    return x.reshape(*_s)
-
-
-def flatten(x, begin_axis=1):
-    """
-    Flatten all tensors in nested dictionary or list or tuple, from @begin_axis onwards.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        begin_axis (int): which axis to flatten from
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x, b=begin_axis: flatten_single(x, begin_axis=b),
-        },
-    )
-
-
-def reshape_dimensions_single(x, begin_axis, end_axis, target_dims):
-    """
-    Reshape selected dimensions in a tensor to a target dimension.
-
-    Args:
-        x (torch.Tensor): tensor to reshape
-        begin_axis (int): begin dimension
-        end_axis (int): end dimension
-        target_dims (tuple or list): target shape for the range of dimensions
-            (@begin_axis, @end_axis)
-
-    Returns:
-        y (torch.Tensor): reshaped tensor
-    """
-    assert begin_axis <= end_axis
-    assert begin_axis >= 0
-    assert end_axis < len(x.shape)
-    assert isinstance(target_dims, (tuple, list))
-    s = x.shape
-    final_s = []
-    for i in range(len(s)):
-        if i == begin_axis:
-            final_s.extend(target_dims)
-        elif i < begin_axis or i > end_axis:
-            final_s.append(s[i])
-    return x.reshape(*final_s)
-
-
-def reshape_dimensions(x, begin_axis, end_axis, target_dims):
-    """
-    Reshape selected dimensions for all tensors in nested dictionary or list or tuple
-    to a target dimension.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        begin_axis (int): begin dimension
-        end_axis (int): end dimension
-        target_dims (tuple or list): target shape for the range of dimensions
-            (@begin_axis, @end_axis)
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x, b=begin_axis, e=end_axis, t=target_dims: reshape_dimensions_single(
-                x, begin_axis=b, end_axis=e, target_dims=t
-            ),
-            np.ndarray: lambda x, b=begin_axis, e=end_axis, t=target_dims: reshape_dimensions_single(
-                x, begin_axis=b, end_axis=e, target_dims=t
-            ),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def join_dimensions(x, begin_axis, end_axis):
-    """
-    Joins all dimensions between dimensions (@begin_axis, @end_axis) into a flat dimension, for
-    all tensors in nested dictionary or list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        begin_axis (int): begin dimension
-        end_axis (int): end dimension
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x, b=begin_axis, e=end_axis: reshape_dimensions_single(
-                x, begin_axis=b, end_axis=e, target_dims=[-1]
-            ),
-            np.ndarray: lambda x, b=begin_axis, e=end_axis: reshape_dimensions_single(
-                x, begin_axis=b, end_axis=e, target_dims=[-1]
-            ),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def expand_at_single(x, size, dim):
-    """
-    Expand a tensor at a single dimension @dim by @size
-
-    Args:
-        x (torch.Tensor): input tensor
-        size (int): size to expand
-        dim (int): dimension to expand
-
-    Returns:
-        y (torch.Tensor): expanded tensor
-    """
-    assert dim < x.ndimension()
-    assert x.shape[dim] == 1
-    expand_dims = [-1] * x.ndimension()
-    expand_dims[dim] = size
-    return x.expand(*expand_dims)
-
-
-def expand_at(x, size, dim):
-    """
-    Expand all tensors in nested dictionary or list or tuple at a single
-    dimension @dim by @size.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        size (int): size to expand
-        dim (int): dimension to expand
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return map_tensor(x, lambda t, s=size, d=dim: expand_at_single(t, s, d))
-
-
-def unsqueeze_expand_at(x, size, dim):
-    """
-    Unsqueeze and expand a tensor at a dimension @dim by @size.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        size (int): size to expand
-        dim (int): dimension to unsqueeze and expand
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    x = unsqueeze(x, dim)
-    return expand_at(x, size, dim)
-
-
-def repeat_by_expand_at(x, repeats, dim):
-    """
-    Repeat a dimension by combining expand and reshape operations.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        repeats (int): number of times to repeat the target dimension
-        dim (int): dimension to repeat on
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    x = unsqueeze_expand_at(x, repeats, dim + 1)
-    return join_dimensions(x, dim, dim + 1)
-
-
-def named_reduce_single(x, reduction, dim):
-    """
-    Reduce tensor at a dimension by named reduction functions.
-
-    Args:
-        x (torch.Tensor): tensor to be reduced
-        reduction (str): one of ["sum", "max", "mean", "flatten"]
-        dim (int): dimension to be reduced (or begin axis for flatten)
-
-    Returns:
-        y (torch.Tensor): reduced tensor
-    """
-    assert x.ndimension() > dim
-    assert reduction in ["sum", "max", "mean", "flatten"]
-    if reduction == "flatten":
-        x = flatten(x, begin_axis=dim)
-    elif reduction == "max":
-        x = torch.max(x, dim=dim)[0]  # [B, D]
-    elif reduction == "sum":
-        x = torch.sum(x, dim=dim)
-    else:
-        x = torch.mean(x, dim=dim)
-    return x
-
-
-def named_reduce(x, reduction, dim):
-    """
-    Reduces all tensors in nested dictionary or list or tuple at a dimension
-    using a named reduction function.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        reduction (str): one of ["sum", "max", "mean", "flatten"]
-        dim (int): dimension to be reduced (or begin axis for flatten)
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return map_tensor(x, func=lambda t, r=reduction, d=dim: named_reduce_single(t, r, d))
-
-
-def gather_along_dim_with_dim_single(x, target_dim, source_dim, indices):
-    """
-    This function indexes out a target dimension of a tensor in a structured way,
-    by allowing a different value to be selected for each member of a flat index
-    tensor (@indices) corresponding to a source dimension. This can be interpreted
-    as moving along the source dimension, using the corresponding index value
-    in @indices to select values for all other dimensions outside of the
-    source and target dimensions. A common use case is to gather values
-    in target dimension 1 for each batch member (target dimension 0).
-
-    Args:
-        x (torch.Tensor): tensor to gather values for
-        target_dim (int): dimension to gather values along
-        source_dim (int): dimension to hold constant and use for gathering values
-            from the other dimensions
-        indices (torch.Tensor): flat index tensor with same shape as tensor @x along
-            @source_dim
-
-    Returns:
-        y (torch.Tensor): gathered tensor, with dimension @target_dim indexed out
-    """
-    assert len(indices.shape) == 1
-    assert x.shape[source_dim] == indices.shape[0]
-
-    # unsqueeze in all dimensions except the source dimension
-    new_shape = [1] * x.ndimension()
-    new_shape[source_dim] = -1
-    indices = indices.reshape(*new_shape)
-
-    # repeat in all dimensions - but preserve shape of source dimension,
-    # and make sure target_dimension has singleton dimension
-    expand_shape = list(x.shape)
-    expand_shape[source_dim] = -1
-    expand_shape[target_dim] = 1
-    indices = indices.expand(*expand_shape)
-
-    out = x.gather(dim=target_dim, index=indices)
-    return out.squeeze(target_dim)
-
-
-def gather_along_dim_with_dim(x, target_dim, source_dim, indices):
-    """
-    Apply @gather_along_dim_with_dim_single to all tensors in a nested
-    dictionary or list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        target_dim (int): dimension to gather values along
-        source_dim (int): dimension to hold constant and use for gathering values
-            from the other dimensions
-        indices (torch.Tensor): flat index tensor with same shape as tensor @x along
-            @source_dim
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple
-    """
-    return map_tensor(
-        x,
-        lambda y, t=target_dim, s=source_dim, i=indices: gather_along_dim_with_dim_single(y, t, s, i),
-    )
-
-
-def gather_sequence_single(seq, indices):
-    """
-    Given a tensor with leading dimensions [B, T, ...], gather an element from each sequence in
-    the batch given an index for each sequence.
-
-    Args:
-        seq (torch.Tensor): tensor with leading dimensions [B, T, ...]
-        indices (torch.Tensor): tensor indices of shape [B]
-
-    Return:
-        y (torch.Tensor): indexed tensor of shape [B, ....]
-    """
-    return gather_along_dim_with_dim_single(seq, target_dim=1, source_dim=0, indices=indices)
-
-
-def gather_sequence(seq, indices):
-    """
-    Given a nested dictionary or list or tuple, gathers an element from each sequence of the batch
-    for tensors with leading dimensions [B, T, ...].
-
-    Args:
-        seq (dict or list or tuple): a possibly nested dictionary or list or tuple with tensors
-            of leading dimensions [B, T, ...]
-        indices (torch.Tensor): tensor indices of shape [B]
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple with tensors of shape [B, ...]
-    """
-    return gather_along_dim_with_dim(seq, target_dim=1, source_dim=0, indices=indices)
-
-
-def pad_sequence_single(seq, padding, batched=False, pad_same=True, pad_values=None):
-    """
-    Pad input tensor or array @seq in the time dimension (dimension 1).
-
-    Args:
-        seq (np.ndarray or torch.Tensor): sequence to be padded
-        padding (tuple): begin and end padding, e.g. [1, 1] pads both begin and end of the sequence by 1
-        batched (bool): if sequence has the batch dimension
-        pad_same (bool): if pad by duplicating
-        pad_values (scalar or (ndarray, Tensor)): values to be padded if not pad_same
-
-    Returns:
-        padded sequence (np.ndarray or torch.Tensor)
-    """
-    assert isinstance(seq, (np.ndarray, torch.Tensor))
-    assert pad_same or pad_values is not None
-    if pad_values is not None:
-        assert isinstance(pad_values, float)
-    repeat_func = np.repeat if isinstance(seq, np.ndarray) else torch.repeat_interleave
-    concat_func = np.concatenate if isinstance(seq, np.ndarray) else torch.cat
-    ones_like_func = np.ones_like if isinstance(seq, np.ndarray) else torch.ones_like
-    seq_dim = 1 if batched else 0
-
-    begin_pad = []
-    end_pad = []
-
-    if padding[0] > 0:
-        pad = seq[[0]] if pad_same else ones_like_func(seq[[0]]) * pad_values
-        begin_pad.append(repeat_func(pad, padding[0], seq_dim))
-    if padding[1] > 0:
-        pad = seq[[-1]] if pad_same else ones_like_func(seq[[-1]]) * pad_values
-        end_pad.append(repeat_func(pad, padding[1], seq_dim))
-
-    return concat_func(begin_pad + [seq] + end_pad, seq_dim)
-
-
-def pad_sequence(seq, padding, batched=False, pad_same=True, pad_values=None):
-    """
-    Pad a nested dictionary or list or tuple of sequence tensors in the time dimension (dimension 1).
-
-    Args:
-        seq (dict or list or tuple): a possibly nested dictionary or list or tuple with tensors
-            of leading dimensions [B, T, ...]
-        padding (tuple): begin and end padding, e.g. [1, 1] pads both begin and end of the sequence by 1
-        batched (bool): if sequence has the batch dimension
-        pad_same (bool): if pad by duplicating
-        pad_values (scalar or (ndarray, Tensor)): values to be padded if not pad_same
-
-    Returns:
-        padded sequence (dict or list or tuple)
-    """
-    return recursive_dict_list_tuple_apply(
-        seq,
-        {
-            torch.Tensor: lambda x, p=padding, b=batched, ps=pad_same, pv=pad_values: pad_sequence_single(
-                x, p, b, ps, pv
-            ),
-            np.ndarray: lambda x, p=padding, b=batched, ps=pad_same, pv=pad_values: pad_sequence_single(
-                x, p, b, ps, pv
-            ),
-            type(None): lambda x: x,
-        },
-    )
-
-
-def assert_size_at_dim_single(x, size, dim, msg):
-    """
-    Ensure that array or tensor @x has size @size in dim @dim.
-
-    Args:
-        x (np.ndarray or torch.Tensor): input array or tensor
-        size (int): size that tensors should have at @dim
-        dim (int): dimension to check
-        msg (str): text to display if assertion fails
-    """
-    assert x.shape[dim] == size, msg
-
-
-def assert_size_at_dim(x, size, dim, msg):
-    """
-    Ensure that arrays and tensors in nested dictionary or list or tuple have
-    size @size in dim @dim.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-        size (int): size that tensors should have at @dim
-        dim (int): dimension to check
-    """
-    map_tensor(x, lambda t, s=size, d=dim, m=msg: assert_size_at_dim_single(t, s, d, m))
-
-
-def get_shape(x):
-    """
-    Get all shapes of arrays and tensors in nested dictionary or list or tuple.
-
-    Args:
-        x (dict or list or tuple): a possibly nested dictionary or list or tuple
-
-    Returns:
-        y (dict or list or tuple): new nested dict-list-tuple that contains each array or
-            tensor's shape
-    """
-    return recursive_dict_list_tuple_apply(
-        x,
-        {
-            torch.Tensor: lambda x: x.shape,
-            np.ndarray: lambda x: x.shape,
-            type(None): lambda x: x,
-        },
-    )
-
-
-def list_of_flat_dict_to_dict_of_list(list_of_dict):
-    """
-    Helper function to go from a list of flat dictionaries to a dictionary of lists.
-    By "flat" we mean that none of the values are dictionaries, but are numpy arrays,
-    floats, etc.
-
-    Args:
-        list_of_dict (list): list of flat dictionaries
-
-    Returns:
-        dict_of_list (dict): dictionary of lists
-    """
-    assert isinstance(list_of_dict, list)
-    dic = collections.OrderedDict()
-    for i in range(len(list_of_dict)):
-        for k in list_of_dict[i]:
-            if k not in dic:
-                dic[k] = []
-            dic[k].append(list_of_dict[i][k])
-    return dic
-
-
-def flatten_nested_dict_list(d, parent_key="", sep="_", item_key=""):
-    """
-    Flatten a nested dict or list to a list.
-
-    For example, given a dict
-    {
-        a: 1
-        b: {
-            c: 2
-        }
-        c: 3
-    }
-
-    the function would return [(a, 1), (b_c, 2), (c, 3)]
-
-    Args:
-        d (dict, list): a nested dict or list to be flattened
-        parent_key (str): recursion helper
-        sep (str): separator for nesting keys
-        item_key (str): recursion helper
-    Returns:
-        list: a list of (key, value) tuples
-    """
-    items = []
-    if isinstance(d, (tuple, list)):
-        new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
-        for i, v in enumerate(d):
-            items.extend(flatten_nested_dict_list(v, new_key, sep=sep, item_key=str(i)))
-        return items
-    elif isinstance(d, dict):
-        new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
-        for k, v in d.items():
-            assert isinstance(k, str)
-            items.extend(flatten_nested_dict_list(v, new_key, sep=sep, item_key=k))
-        return items
-    else:
-        new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
-        return [(new_key, d)]
-
-
-def time_distributed(inputs, op, activation=None, inputs_as_kwargs=False, inputs_as_args=False, **kwargs):
-    """
-    Apply function @op to all tensors in nested dictionary or list or tuple @inputs in both the
-    batch (B) and time (T) dimension, where the tensors are expected to have shape [B, T, ...].
-    Will do this by reshaping tensors to [B * T, ...], passing through the op, and then reshaping
-    outputs to [B, T, ...].
-
-    Args:
-        inputs (list or tuple or dict): a possibly nested dictionary or list or tuple with tensors
-            of leading dimensions [B, T, ...]
-        op: a layer op that accepts inputs
-        activation: activation to apply at the output
-        inputs_as_kwargs (bool): whether to feed input as a kwargs dict to the op
-        inputs_as_args (bool) whether to feed input as a args list to the op
-        kwargs (dict): other kwargs to supply to the op
-
-    Returns:
-        outputs (dict or list or tuple): new nested dict-list-tuple with tensors of leading dimension [B, T].
-    """
-    batch_size, seq_len = flatten_nested_dict_list(inputs)[0][1].shape[:2]
-    inputs = join_dimensions(inputs, 0, 1)
-    if inputs_as_kwargs:
-        outputs = op(**inputs, **kwargs)
-    elif inputs_as_args:
-        outputs = op(*inputs, **kwargs)
-    else:
-        outputs = op(inputs, **kwargs)
-
-    if activation is not None:
-        outputs = map_tensor(outputs, activation)
-    outputs = reshape_dimensions(outputs, begin_axis=0, end_axis=0, target_dims=(batch_size, seq_len))
-    return outputs
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/policy/diffusion_unet_image_policy.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/policy/diffusion_unet_image_policy.py
deleted file mode 100644
index d763e45b7..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/policy/diffusion_unet_image_policy.py
+++ /dev/null
@@ -1,262 +0,0 @@
-from typing import Dict
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
-from diffusion_policy.common.pytorch_util import dict_apply
-from diffusion_policy.model.common.normalizer import LinearNormalizer
-from diffusion_policy.model.diffusion.conditional_unet1d import ConditionalUnet1D
-from diffusion_policy.model.diffusion.mask_generator import LowdimMaskGenerator
-from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
-from diffusion_policy.policy.base_image_policy import BaseImagePolicy
-from einops import rearrange, reduce
-
-
-class DiffusionUnetImagePolicy(BaseImagePolicy):
-    def __init__(
-        self,
-        shape_meta: dict,
-        noise_scheduler: DDPMScheduler,
-        obs_encoder: MultiImageObsEncoder,
-        horizon,
-        n_action_steps,
-        n_obs_steps,
-        num_inference_steps=None,
-        obs_as_global_cond=True,
-        diffusion_step_embed_dim=256,
-        down_dims=(256, 512, 1024),
-        kernel_size=5,
-        n_groups=8,
-        cond_predict_scale=True,
-        # parameters passed to step
-        **kwargs,
-    ):
-        super().__init__()
-
-        # parse shapes
-        action_shape = shape_meta["action"]["shape"]
-        assert len(action_shape) == 1
-        action_dim = action_shape[0]
-        # get feature dim
-        obs_feature_dim = obs_encoder.output_shape()[0]
-
-        # create diffusion model
-        input_dim = action_dim + obs_feature_dim
-        global_cond_dim = None
-        if obs_as_global_cond:
-            input_dim = action_dim
-            global_cond_dim = obs_feature_dim * n_obs_steps
-
-        model = ConditionalUnet1D(
-            input_dim=input_dim,
-            local_cond_dim=None,
-            global_cond_dim=global_cond_dim,
-            diffusion_step_embed_dim=diffusion_step_embed_dim,
-            down_dims=down_dims,
-            kernel_size=kernel_size,
-            n_groups=n_groups,
-            cond_predict_scale=cond_predict_scale,
-        )
-
-        self.obs_encoder = obs_encoder
-        self.model = model
-        self.noise_scheduler = noise_scheduler
-        self.mask_generator = LowdimMaskGenerator(
-            action_dim=action_dim,
-            obs_dim=0 if obs_as_global_cond else obs_feature_dim,
-            max_n_obs_steps=n_obs_steps,
-            fix_obs_steps=True,
-            action_visible=False,
-        )
-        self.normalizer = LinearNormalizer()
-        self.horizon = horizon
-        self.obs_feature_dim = obs_feature_dim
-        self.action_dim = action_dim
-        self.n_action_steps = n_action_steps
-        self.n_obs_steps = n_obs_steps
-        self.obs_as_global_cond = obs_as_global_cond
-        self.kwargs = kwargs
-
-        if num_inference_steps is None:
-            num_inference_steps = noise_scheduler.config.num_train_timesteps
-        self.num_inference_steps = num_inference_steps
-
-    # ========= inference  ============
-    def conditional_sample(
-        self,
-        condition_data,
-        condition_mask,
-        local_cond=None,
-        global_cond=None,
-        generator=None,
-        # keyword arguments to scheduler.step
-        **kwargs,
-    ):
-        model = self.model
-        scheduler = self.noise_scheduler
-
-        trajectory = torch.randn(
-            size=condition_data.shape,
-            dtype=condition_data.dtype,
-            device=condition_data.device,
-            generator=generator,
-        )
-
-        # set step values
-        scheduler.set_timesteps(self.num_inference_steps)
-
-        for t in scheduler.timesteps:
-            # 1. apply conditioning
-            trajectory[condition_mask] = condition_data[condition_mask]
-
-            # 2. predict model output
-            model_output = model(trajectory, t, local_cond=local_cond, global_cond=global_cond)
-
-            # 3. compute previous image: x_t -> x_t-1
-            trajectory = scheduler.step(model_output, t, trajectory, generator=generator, **kwargs).prev_sample
-
-        # finally make sure conditioning is enforced
-        trajectory[condition_mask] = condition_data[condition_mask]
-
-        return trajectory
-
-    def predict_action(self, obs_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
-        """
-        obs_dict: must include "obs" key
-        result: must include "action" key
-        """
-        assert "past_action" not in obs_dict  # not implemented yet
-        # print("!!obs_dict", obs_dict["head_cam"].shape)
-        # normalize input
-        nobs = self.normalizer.normalize(obs_dict)
-        # print("!!nobs", nobs["head_cam"].shape)
-        value = next(iter(nobs.values()))
-        B, To = value.shape[:2]
-        T = self.horizon
-        Da = self.action_dim
-        Do = self.obs_feature_dim
-        To = self.n_obs_steps
-
-        # build input
-        device = self.device
-        dtype = self.dtype
-
-        # handle different ways of passing observation
-        local_cond = None
-        global_cond = None
-        if self.obs_as_global_cond:
-            # condition through global feature
-            this_nobs = dict_apply(nobs, lambda x: x[:, :To, ...].reshape(-1, *x.shape[2:]))
-            # print("!!To", To)
-            # print(this_nobs["head_cam"].shape, this_nobs["agent_pos"].shape)
-            nobs_features = self.obs_encoder(this_nobs)
-            # reshape back to B, Do
-            # print("!!if", nobs_features.shape)
-            global_cond = nobs_features.reshape(B, -1)
-            # empty data for action
-            cond_data = torch.zeros(size=(B, T, Da), device=device, dtype=dtype)
-            cond_mask = torch.zeros_like(cond_data, dtype=torch.bool)
-        else:
-            # condition through impainting
-            this_nobs = dict_apply(nobs, lambda x: x[:, :To, ...].reshape(-1, *x.shape[2:]))
-            nobs_features = self.obs_encoder(this_nobs)
-            # reshape back to B, T, Do
-            nobs_features = nobs_features.reshape(B, To, -1)
-            cond_data = torch.zeros(size=(B, T, Da + Do), device=device, dtype=dtype)
-            cond_mask = torch.zeros_like(cond_data, dtype=torch.bool)
-            cond_data[:, :To, Da:] = nobs_features
-            cond_mask[:, :To, Da:] = True
-
-        # run sampling
-        nsample = self.conditional_sample(
-            cond_data,
-            cond_mask,
-            local_cond=local_cond,
-            global_cond=global_cond,
-            **self.kwargs,
-        )
-
-        # unnormalize prediction
-        naction_pred = nsample[..., :Da]
-        action_pred = self.normalizer["action"].unnormalize(naction_pred)
-
-        # get action
-        start = To - 1
-        end = start + self.n_action_steps
-        action = action_pred[:, start:end]
-
-        result = {"action": action, "action_pred": action_pred}
-        return result
-
-    # ========= training  ============
-    def set_normalizer(self, normalizer: LinearNormalizer):
-        self.normalizer.load_state_dict(normalizer.state_dict())
-
-    def compute_loss(self, batch):
-        # normalize input
-        assert "valid_mask" not in batch
-        nobs = self.normalizer.normalize(batch["obs"])
-        nactions = self.normalizer["action"].normalize(batch["action"])
-        batch_size = nactions.shape[0]
-        horizon = nactions.shape[1]
-
-        # handle different ways of passing observation
-        local_cond = None
-        global_cond = None
-        trajectory = nactions
-        cond_data = trajectory
-        if self.obs_as_global_cond:
-            # reshape B, T, ... to B*T
-            this_nobs = dict_apply(nobs, lambda x: x[:, : self.n_obs_steps, ...].reshape(-1, *x.shape[2:]))
-            nobs_features = self.obs_encoder(this_nobs)
-            # reshape back to B, Do
-            global_cond = nobs_features.reshape(batch_size, -1)
-        else:
-            # reshape B, T, ... to B*T
-            this_nobs = dict_apply(nobs, lambda x: x.reshape(-1, *x.shape[2:]))
-            nobs_features = self.obs_encoder(this_nobs)
-            # reshape back to B, T, Do
-            nobs_features = nobs_features.reshape(batch_size, horizon, -1)
-            cond_data = torch.cat([nactions, nobs_features], dim=-1)
-            trajectory = cond_data.detach()
-
-        # generate impainting mask
-        condition_mask = self.mask_generator(trajectory.shape)
-
-        # Sample noise that we'll add to the images
-        noise = torch.randn(trajectory.shape, device=trajectory.device)
-        bsz = trajectory.shape[0]
-        # Sample a random timestep for each image
-        timesteps = torch.randint(
-            0,
-            self.noise_scheduler.config.num_train_timesteps,
-            (bsz,),
-            device=trajectory.device,
-        ).long()
-        # Add noise to the clean images according to the noise magnitude at each timestep
-        # (this is the forward diffusion process)
-        noisy_trajectory = self.noise_scheduler.add_noise(trajectory, noise, timesteps)
-
-        # compute loss mask
-        loss_mask = ~condition_mask
-
-        # apply conditioning
-        noisy_trajectory[condition_mask] = cond_data[condition_mask]
-
-        # Predict the noise residual
-        pred = self.model(noisy_trajectory, timesteps, local_cond=local_cond, global_cond=global_cond)
-
-        pred_type = self.noise_scheduler.config.prediction_type
-        if pred_type == "epsilon":
-            target = noise
-        elif pred_type == "sample":
-            target = trajectory
-        else:
-            raise ValueError(f"Unsupported prediction type {pred_type}")
-
-        loss = F.mse_loss(pred, target, reduction="none")
-        loss = loss * loss_mask.type(loss.dtype)
-        loss = reduce(loss, "b ... -> b (...)", "mean")
-        loss = loss.mean()
-        return loss
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/workspace/base_workspace.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/workspace/base_workspace.py
deleted file mode 100644
index abd1b40d7..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/workspace/base_workspace.py
+++ /dev/null
@@ -1,140 +0,0 @@
-import copy
-import os
-import pathlib
-import threading
-from typing import Optional
-
-import dill
-import hydra
-import torch
-from hydra.core.hydra_config import HydraConfig
-from omegaconf import OmegaConf
-
-
-class BaseWorkspace:
-    include_keys = tuple()
-    exclude_keys = tuple()
-
-    def __init__(self, cfg: OmegaConf, output_dir: Optional[str] = None):
-        self.cfg = cfg
-        self._output_dir = output_dir
-        self._saving_thread = None
-
-    @property
-    def output_dir(self):
-        output_dir = self._output_dir
-        if output_dir is None:
-            output_dir = HydraConfig.get().runtime.output_dir
-        return output_dir
-
-    def run(self):
-        """
-        Create any resource shouldn't be serialized as local variables
-        """
-        pass
-
-    def save_checkpoint(
-        self,
-        path=None,
-        tag="latest",
-        exclude_keys=None,
-        include_keys=None,
-        use_thread=True,
-    ):
-        if path is None:
-            path = pathlib.Path(self.output_dir).joinpath("checkpoints", f"{tag}.ckpt")
-        else:
-            path = pathlib.Path(path)
-        if exclude_keys is None:
-            exclude_keys = tuple(self.exclude_keys)
-        if include_keys is None:
-            include_keys = tuple(self.include_keys) + ("_output_dir",)
-
-        path.parent.mkdir(parents=True, exist_ok=True)
-        payload = {"cfg": self.cfg, "state_dicts": dict(), "pickles": dict()}
-
-        for key, value in self.__dict__.items():
-            if hasattr(value, "state_dict") and hasattr(value, "load_state_dict"):
-                # modules, optimizers and samplers etc
-                if key not in exclude_keys:
-                    if use_thread:
-                        payload["state_dicts"][key] = _copy_to_cpu(value.state_dict())
-                    else:
-                        payload["state_dicts"][key] = value.state_dict()
-            elif key in include_keys:
-                payload["pickles"][key] = dill.dumps(value)
-        if use_thread:
-            self._saving_thread = threading.Thread(
-                target=lambda: torch.save(payload, path.open("wb"), pickle_module=dill)
-            )
-            self._saving_thread.start()
-        else:
-            torch.save(payload, path.open("wb"), pickle_module=dill)
-        return str(path.absolute())
-
-    def get_checkpoint_path(self, tag="latest"):
-        return pathlib.Path(self.output_dir).joinpath("checkpoints", f"{tag}.ckpt")
-
-    def load_payload(self, payload, exclude_keys=None, include_keys=None, **kwargs):
-        if exclude_keys is None:
-            exclude_keys = tuple()
-        if include_keys is None:
-            include_keys = payload["pickles"].keys()
-
-        for key, value in payload["state_dicts"].items():
-            if key not in exclude_keys:
-                self.__dict__[key].load_state_dict(value, **kwargs)
-        for key in include_keys:
-            if key in payload["pickles"]:
-                self.__dict__[key] = dill.loads(payload["pickles"][key])
-
-    def load_checkpoint(self, path=None, tag="latest", exclude_keys=None, include_keys=None, **kwargs):
-        if path is None:
-            path = self.get_checkpoint_path(tag=tag)
-        else:
-            path = pathlib.Path(path)
-        payload = torch.load(path.open("rb"), pickle_module=dill, **kwargs)
-        self.load_payload(payload, exclude_keys=exclude_keys, include_keys=include_keys)
-        return payload
-
-    @classmethod
-    def create_from_checkpoint(cls, path, exclude_keys=None, include_keys=None, **kwargs):
-        payload = torch.load(open(path, "rb"), pickle_module=dill)
-        instance = cls(payload["cfg"])
-        instance.load_payload(
-            payload=payload,
-            exclude_keys=exclude_keys,
-            include_keys=include_keys,
-            **kwargs,
-        )
-        return instance
-
-    def save_snapshot(self, tag="latest"):
-        """
-        Quick loading and saving for reserach, saves full state of the workspace.
-
-        However, loading a snapshot assumes the code stays exactly the same.
-        Use save_checkpoint for long-term storage.
-        """
-        path = pathlib.Path(self.output_dir).joinpath("snapshots", f"{tag}.pkl")
-        path.parent.mkdir(parents=False, exist_ok=True)
-        torch.save(self, path.open("wb"), pickle_module=dill)
-        return str(path.absolute())
-
-    @classmethod
-    def create_from_snapshot(cls, path):
-        return torch.load(open(path, "rb"), pickle_module=dill)
-
-
-def _copy_to_cpu(x):
-    if isinstance(x, torch.Tensor):
-        return x.detach().to("cpu")
-    elif isinstance(x, dict):
-        result = dict()
-        for k, v in x.items():
-            result[k] = _copy_to_cpu(v)
-        return result
-    elif isinstance(x, list):
-        return [_copy_to_cpu(k) for k in x]
-    else:
-        return copy.deepcopy(x)
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/workspace/robotworkspace.py b/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/workspace/robotworkspace.py
deleted file mode 100644
index 0871c51e2..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/workspace/robotworkspace.py
+++ /dev/null
@@ -1,358 +0,0 @@
-import copy
-import os
-import pathlib
-import random
-
-import hydra
-import numpy as np
-import torch
-import tqdm
-import wandb
-from diffusion_policy.common.checkpoint_util import TopKCheckpointManager
-from diffusion_policy.common.json_logger import JsonLogger
-from diffusion_policy.common.pytorch_util import dict_apply, optimizer_to
-from diffusion_policy.dataset.base_dataset import BaseImageDataset
-from diffusion_policy.model.common.lr_scheduler import get_scheduler
-from diffusion_policy.model.diffusion.ema_model import EMAModel
-from diffusion_policy.policy.diffusion_unet_image_policy import DiffusionUnetImagePolicy
-from diffusion_policy.workspace.base_workspace import BaseWorkspace
-from omegaconf import OmegaConf
-from torch.utils.data import DataLoader
-
-OmegaConf.register_new_resolver("eval", eval, replace=True)
-
-
-class RobotWorkspace(BaseWorkspace):
-    include_keys = ["global_step", "epoch"]
-
-    def __init__(self, cfg: OmegaConf, output_dir=None):
-        super().__init__(cfg, output_dir=output_dir)
-
-        # set seed
-        seed = cfg.training.seed
-        torch.manual_seed(seed)
-        np.random.seed(seed)
-        random.seed(seed)
-
-        # configure model
-        self.model: DiffusionUnetImagePolicy = hydra.utils.instantiate(cfg.policy)
-
-        self.ema_model: DiffusionUnetImagePolicy = None
-        if cfg.training.use_ema:
-            self.ema_model = copy.deepcopy(self.model)
-
-        # configure training state
-        self.optimizer = hydra.utils.instantiate(cfg.optimizer, params=self.model.parameters())
-
-        # configure training state
-        self.global_step = 0
-        self.epoch = 0
-
-    def run(self):
-        cfg = copy.deepcopy(self.cfg)
-
-        # resume training
-        if cfg.training.resume:
-            lastest_ckpt_path = self.get_checkpoint_path()
-            if lastest_ckpt_path.is_file():
-                print(f"Resuming from checkpoint {lastest_ckpt_path}")
-                self.load_checkpoint(path=lastest_ckpt_path)
-
-        # configure dataset
-        dataset: BaseImageDataset
-        dataset = hydra.utils.instantiate(cfg.task.dataset)
-        assert isinstance(dataset, BaseImageDataset)
-        train_dataloader = create_dataloader(dataset, **cfg.dataloader)
-        normalizer = dataset.get_normalizer()
-
-        # configure validation dataset
-        val_dataset = dataset.get_validation_dataset()
-        val_dataloader = create_dataloader(val_dataset, **cfg.val_dataloader)
-
-        self.model.set_normalizer(normalizer)
-        if cfg.training.use_ema:
-            self.ema_model.set_normalizer(normalizer)
-
-        # configure lr scheduler
-        lr_scheduler = get_scheduler(
-            cfg.training.lr_scheduler,
-            optimizer=self.optimizer,
-            num_warmup_steps=cfg.training.lr_warmup_steps,
-            num_training_steps=(len(train_dataloader) * cfg.training.num_epochs)
-            // cfg.training.gradient_accumulate_every,
-            # pytorch assumes stepping LRScheduler every epoch
-            # however huggingface diffusers steps it every batch
-            last_epoch=self.global_step - 1,
-        )
-
-        # configure ema
-        ema: EMAModel = None
-        if cfg.training.use_ema:
-            ema = hydra.utils.instantiate(cfg.ema, model=self.ema_model)
-
-        # configure env
-        # env_runner: BaseImageRunner
-        # env_runner = hydra.utils.instantiate(
-        #     cfg.task.env_runner,
-        #     output_dir=self.output_dir)
-        # assert isinstance(env_runner, BaseImageRunner)
-        env_runner = None
-        wandb_run = None
-
-        # configure logging
-        if cfg.logging.mode == "online":
-            wandb_run = wandb.init(
-                dir=str(self.output_dir),
-                config=OmegaConf.to_container(cfg, resolve=True),
-                **cfg.logging,
-            )
-            wandb.config.update({
-                "output_dir": self.output_dir,
-            })
-
-        # configure checkpoint
-        topk_manager = TopKCheckpointManager(
-            save_dir=os.path.join(self.output_dir, "checkpoints"), **cfg.checkpoint.topk
-        )
-
-        # device transfer
-        device = torch.device(cfg.training.device)
-        self.model.to(device)
-        if self.ema_model is not None:
-            self.ema_model.to(device)
-        optimizer_to(self.optimizer, device)
-
-        # save batch for sampling
-        train_sampling_batch = None
-
-        if cfg.training.debug:
-            cfg.training.num_epochs = 2
-            cfg.training.max_train_steps = 3
-            cfg.training.max_val_steps = 3
-            cfg.training.rollout_every = 1
-            cfg.training.checkpoint_every = 1
-            cfg.training.val_every = 1
-            cfg.training.sample_every = 1
-
-        # training loop
-        log_path = os.path.join(self.output_dir, "logs.json.txt")
-        with JsonLogger(log_path) as json_logger:
-            for local_epoch_idx in range(cfg.training.num_epochs):
-                step_log = dict()
-                # ========= train for this epoch ==========
-                if cfg.training.freeze_encoder:
-                    self.model.obs_encoder.eval()
-                    self.model.obs_encoder.requires_grad_(False)
-
-                train_losses = list()
-                with tqdm.tqdm(
-                    train_dataloader,
-                    desc=f"Training epoch {self.epoch}",
-                    leave=False,
-                    mininterval=cfg.training.tqdm_interval_sec,
-                ) as tepoch:
-                    for batch_idx, batch in enumerate(tepoch):
-                        batch = dataset.postprocess(batch, device)
-                        if train_sampling_batch is None:
-                            train_sampling_batch = batch
-                        # print("obs_dict:", batch)
-                        # print("dict_keys:", batch.keys())
-                        # print("dict_items:", batch.items())
-                        # print()
-                        # from pprint import pprint
-
-                        # pprint(batch)
-                        # compute loss
-                        raw_loss = self.model.compute_loss(batch)
-                        loss = raw_loss / cfg.training.gradient_accumulate_every
-                        loss.backward()
-
-                        # step optimizer
-                        if self.global_step % cfg.training.gradient_accumulate_every == 0:
-                            self.optimizer.step()
-                            self.optimizer.zero_grad()
-                            lr_scheduler.step()
-
-                        # update ema
-                        if cfg.training.use_ema:
-                            ema.step(self.model)
-
-                        # logging
-                        raw_loss_cpu = raw_loss.item()
-                        tepoch.set_postfix(loss=raw_loss_cpu, refresh=False)
-                        train_losses.append(raw_loss_cpu)
-                        step_log = {
-                            "train_loss": raw_loss_cpu,
-                            "global_step": self.global_step,
-                            "epoch": self.epoch,
-                            "lr": lr_scheduler.get_last_lr()[0],
-                        }
-
-                        is_last_batch = batch_idx == (len(train_dataloader) - 1)
-                        if not is_last_batch:
-                            # log of last step is combined with validation and rollout
-                            if wandb_run is not None:
-                                wandb_run.log(step_log, step=self.global_step)
-                            json_logger.log(step_log)
-                            self.global_step += 1
-
-                        if (cfg.training.max_train_steps is not None) and batch_idx >= (
-                            cfg.training.max_train_steps - 1
-                        ):
-                            break
-
-                # at the end of each epoch
-                # replace train_loss with epoch average
-                train_loss = np.mean(train_losses)
-                step_log["train_loss"] = train_loss
-
-                # ========= eval for this epoch ==========
-                policy = self.model
-                if cfg.training.use_ema:
-                    policy = self.ema_model
-                policy.eval()
-
-                # run rollout
-                # if (self.epoch % cfg.training.rollout_every) == 0:
-                #     runner_log = env_runner.run(policy)
-                #     # log all
-                #     step_log.update(runner_log)
-
-                # run validation
-                if (self.epoch % cfg.training.val_every) == 0:
-                    with torch.no_grad():
-                        val_losses = list()
-                        with tqdm.tqdm(
-                            val_dataloader,
-                            desc=f"Validation epoch {self.epoch}",
-                            leave=False,
-                            mininterval=cfg.training.tqdm_interval_sec,
-                        ) as tepoch:
-                            for batch_idx, batch in enumerate(tepoch):
-                                batch = dataset.postprocess(batch, device)
-                                loss = self.model.compute_loss(batch)
-                                val_losses.append(loss)
-                                if (cfg.training.max_val_steps is not None) and batch_idx >= (
-                                    cfg.training.max_val_steps - 1
-                                ):
-                                    break
-                        if len(val_losses) > 0:
-                            val_loss = torch.mean(torch.tensor(val_losses)).item()
-                            # log epoch average validation loss
-                            step_log["val_loss"] = val_loss
-
-                # run diffusion sampling on a training batch
-                if (self.epoch % cfg.training.sample_every) == 0:
-                    with torch.no_grad():
-                        # sample trajectory from training set, and evaluate difference
-                        batch = train_sampling_batch
-                        obs_dict = batch["obs"]
-                        # print("obs_dict:", obs_dict)
-                        # print("dict_keys:", obs_dict.keys())
-                        # print("dict_items:", obs_dict.items())
-                        # print()
-                        # from pprint import pprint
-                        # pprint(obs_dict)
-                        gt_action = batch["action"]
-
-                        result = policy.predict_action(obs_dict)
-                        pred_action = result["action_pred"]
-                        mse = torch.nn.functional.mse_loss(pred_action, gt_action)
-                        step_log["train_action_mse_error"] = mse.item()
-                        del batch
-                        del obs_dict
-                        del gt_action
-                        del result
-                        del pred_action
-                        del mse
-
-                # checkpoint
-                if ((self.epoch + 1) % cfg.training.checkpoint_every) == 0:
-                    # checkpointing
-                    save_name = pathlib.Path(self.cfg.task.dataset.zarr_path).stem
-                    self.save_checkpoint(cfg.checkpoint.save_root_dir + f"/checkpoints/{self.epoch + 1}.ckpt")  # TODO
-
-                # ========= eval end for this epoch ==========
-                policy.train()
-
-                # end of epoch
-                # log of last step is combined with validation and rollout
-                json_logger.log(step_log)
-                if wandb_run is not None:
-                    wandb_run.log(step_log, step=self.global_step)
-                self.global_step += 1
-                self.epoch += 1
-
-
-class BatchSampler:
-    def __init__(
-        self,
-        data_size: int,
-        batch_size: int,
-        shuffle: bool = False,
-        seed: int = 0,
-        drop_last: bool = True,
-    ):
-        assert drop_last
-        self.data_size = data_size
-        self.batch_size = batch_size
-        self.num_batch = data_size // batch_size
-        self.discard = data_size - batch_size * self.num_batch
-        self.shuffle = shuffle
-        self.rng = np.random.default_rng(seed) if shuffle else None
-
-    def __iter__(self):
-        if self.shuffle:
-            perm = self.rng.permutation(self.data_size)
-        else:
-            perm = np.arange(self.data_size)
-        if self.discard > 0:
-            perm = perm[: -self.discard]
-        perm = perm.reshape(self.num_batch, self.batch_size)
-        for i in range(self.num_batch):
-            yield perm[i]
-
-    def __len__(self):
-        return self.num_batch
-
-
-def create_dataloader(
-    dataset,
-    *,
-    batch_size: int,
-    shuffle: bool,
-    num_workers: int,
-    pin_memory: bool,
-    persistent_workers: bool,
-    seed: int = 0,
-):
-    # print("create_dataloader_batch_size", batch_size)
-    batch_sampler = BatchSampler(len(dataset), batch_size, shuffle=shuffle, seed=seed, drop_last=True)
-
-    def collate(x):
-        assert len(x) == 1
-        return x[0]
-
-    dataloader = DataLoader(
-        dataset,
-        collate_fn=collate,
-        sampler=batch_sampler,
-        num_workers=num_workers,
-        pin_memory=False,
-        persistent_workers=persistent_workers,
-    )
-    return dataloader
-
-
-@hydra.main(
-    version_base=None,
-    config_path=str(pathlib.Path(__file__).parent.parent.joinpath("config")),
-    config_name=pathlib.Path(__file__).stem,
-)
-def main(cfg):
-    workspace = RobotWorkspace(cfg)
-    workspace.run()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/roboverse_learn/il/utils/diffusion_policy/pyproject.toml b/roboverse_learn/il/utils/diffusion_policy/pyproject.toml
deleted file mode 100644
index c951d2d06..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/pyproject.toml
+++ /dev/null
@@ -1,27 +0,0 @@
-[build-system]
-requires = ["flit_core >=3.7,<4"]
-build-backend = "flit_core.buildapi"
-
-[project]
-name = "diffusion_policy"
-version = "0.1.0"
-description = "Diffusion policy for RoboVerse"
-requires-python = ">=3.8"
-dependencies = [
-    "zarr==2.12.0",
-    "ipdb",
-    "gpustat",
-    "omegaconf",
-    "hydra-core==1.2.0",
-    "dill==0.3.5.1",
-    "einops==0.4.1",
-    "diffusers",
-    "numba",
-    "moviepy",
-    "imageio",
-    "av",
-    "matplotlib",
-    "termcolor",
-    "huggingface_hub",
-    "pillow",
-]
diff --git a/roboverse_learn/il/utils/diffusion_policy/scripts/prune_and_rename.py b/roboverse_learn/il/utils/diffusion_policy/scripts/prune_and_rename.py
deleted file mode 100644
index 8cf5e12de..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/scripts/prune_and_rename.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import os
-import sys
-import shutil
-
-def main():
-    if len(sys.argv) != 2:
-        print(f"Usage: {sys.argv[0]} <root_directory>")
-        sys.exit(1)
-
-    root_dir = sys.argv[1]
-    if not os.path.isdir(root_dir):
-        print(f"Error: '{root_dir}' is not a directory.")
-        sys.exit(1)
-
-    # List subdirectories matching 'demo_XXXX'
-    subdirs = [d for d in os.listdir(root_dir)
-               if os.path.isdir(os.path.join(root_dir, d)) and d.startswith('demo_')]
-    # Sort by numeric suffix
-    subdirs.sort(key=lambda x: int(x.split('_')[1]))
-
-    valid_dirs = []
-    # Identify and remove empty ones
-    for d in subdirs:
-        path = os.path.join(root_dir, d)
-        metadata_path = os.path.join(path, 'metadata.json')
-        if not os.path.isfile(metadata_path):
-            print(f"Removing empty folder: {d}")
-            shutil.rmtree(path)
-        else:
-            valid_dirs.append(d)
-
-    # Renumber remaining directories
-    for new_idx, old_name in enumerate(valid_dirs):
-        new_name = f"demo_{new_idx:04d}"
-        if old_name != new_name:
-            old_path = os.path.join(root_dir, old_name)
-            new_path = os.path.join(root_dir, new_name)
-            print(f"Renaming {old_name} -> {new_name}")
-            os.rename(old_path, new_path)
-
-if __name__ == '__main__':
-    main()
diff --git a/roboverse_learn/il/utils/diffusion_policy/train.py b/roboverse_learn/il/utils/diffusion_policy/train.py
deleted file mode 100644
index 0f36df59d..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/train.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import sys
-
-# use line-buffering for both stdout and stderr
-sys.stdout = open(sys.stdout.fileno(), mode="w", buffering=1)
-sys.stderr = open(sys.stderr.fileno(), mode="w", buffering=1)
-
-import pathlib
-
-import hydra
-from diffusion_policy.workspace.base_workspace import BaseWorkspace
-from omegaconf import OmegaConf
-
-import rootutils
-rootutils.setup_root(__file__, pythonpath=True)
-
-# allows arbitrary python code execution in configs using the ${eval:''} resolver
-OmegaConf.register_new_resolver("eval", eval, replace=True)
-
-
-abs_config_path = str(pathlib.Path(__file__).resolve().parent.joinpath("diffusion_policy", "config").absolute())
-
-@hydra.main(
-    version_base=None,
-    config_path=abs_config_path,
-)
-def main(cfg: OmegaConf):
-
-    OmegaConf.resolve(cfg)
-
-    cls = hydra.utils.get_class(cfg._target_)
-
-    workspace: BaseWorkspace = cls(cfg)
-    workspace.run()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/roboverse_learn/il/utils/diffusion_policy/train_dp.sh b/roboverse_learn/il/utils/diffusion_policy/train_dp.sh
deleted file mode 100644
index c0b143423..000000000
--- a/roboverse_learn/il/utils/diffusion_policy/train_dp.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-# Examples:
-# bash roboverse_learn/algorithms/diffusion_policy/train_dp.sh roboverse_demo/demo_isaaclab/CloseBox-Level0/robot-franka CloseBoxFrankaL0 100 0 200 joint_pos joint_pos
-
-# 'metadata_dir' means path to metadata directory. e.g. roboverse_demo/demo_isaaclab/CloseBox-Level0/robot-franka
-# 'task_name' gives a name to the policy, which can include the task robot and level ie CloseBoxFrankaL0
-# 'expert_data_num' means number of training data. e.g.100
-# 'gpu_id' means single gpu id, e.g.0
-
-metadata_dir=${1}
-task_name=${2}
-expert_data_num=${3}
-gpu_id=${4}
-num_epochs=${5}
-obs_space=${6} # joint_pos or ee
-act_space=${7} # joint_pos or ee
-delta_ee=${8:-0} # 0 or 1 (only matters if act_space is ee, 0 means absolute 1 means delta control )
-
-config_name=robot_dp
-horizon=8
-n_obs_steps=3
-n_action_steps=4
-seed=42
-
-# adding the obs and action space as additional info
-extra="obs:${obs_space}_act:${act_space}"
-if [ "${delta_ee}" = 1 ]; then
-  extra="${extra}_delta"
-fi
-
-python roboverse_learn/algorithms/data2zarr_dp.py \
---task_name ${task_name}_${extra} \
---expert_data_num ${expert_data_num} \
---metadata_dir ${metadata_dir} \
---action_space ${act_space} \
---observation_space ${obs_space} \
---delta_ee ${delta_ee}
-
-echo -e "\033[33mgpu id (to use): ${gpu_id}\033[0m"
-export HYDRA_FULL_ERROR=1
-export CUDA_VISIBLE_DEVICES=${gpu_id}
-python roboverse_learn/algorithms/diffusion_policy/train.py --config-name=${config_name}.yaml \
-task.name=${task_name}_${extra} \
-task.dataset.zarr_path="data_policy/${task_name}_${extra}_${expert_data_num}.zarr" \
-training.seed=${seed} \
-horizon=${horizon} \
-n_obs_steps=${n_obs_steps} \
-n_action_steps=${n_action_steps} \
-training.num_epochs=${num_epochs} \
-policy_runner.obs.obs_type=${obs_space} \
-policy_runner.action.action_type=${act_space} \
-policy_runner.action.delta=${delta_ee} \
-training.device="cuda:${gpu_id}"
diff --git a/roboverse_learn/il/utils/common/env_util.py b/roboverse_learn/il/utils/env_util.py
similarity index 100%
rename from roboverse_learn/il/utils/common/env_util.py
rename to roboverse_learn/il/utils/env_util.py
diff --git a/roboverse_learn/il/utils/common/eval_args.py b/roboverse_learn/il/utils/eval_args.py
similarity index 100%
rename from roboverse_learn/il/utils/common/eval_args.py
rename to roboverse_learn/il/utils/eval_args.py
diff --git a/roboverse_learn/il/utils/common/eval_runner_getter.py b/roboverse_learn/il/utils/eval_runner_getter.py
similarity index 79%
rename from roboverse_learn/il/utils/common/eval_runner_getter.py
rename to roboverse_learn/il/utils/eval_runner_getter.py
index 1d6ddb015..17ffacd8a 100644
--- a/roboverse_learn/il/utils/common/eval_runner_getter.py
+++ b/roboverse_learn/il/utils/eval_runner_getter.py
@@ -1,6 +1,6 @@
 def get_runner(algo):
     if algo == "diffusion_policy":
-        from roboverse_learn.il.dp.eval_runner.dp_eval_runner import DPEvalRunner
+        from roboverse_learn.il.eval_runner.dp_eval_runner import DPEvalRunner
 
         return DPEvalRunner
     else:
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/flow_matchers.py b/roboverse_learn/il/utils/flow_matchers.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/flow_matchers.py
rename to roboverse_learn/il/utils/flow_matchers.py
diff --git a/roboverse_learn/il/utils/common/json_logger.py b/roboverse_learn/il/utils/json_logger.py
similarity index 100%
rename from roboverse_learn/il/utils/common/json_logger.py
rename to roboverse_learn/il/utils/json_logger.py
diff --git a/roboverse_learn/il/utils/common/lr_scheduler.py b/roboverse_learn/il/utils/lr_scheduler.py
similarity index 100%
rename from roboverse_learn/il/utils/common/lr_scheduler.py
rename to roboverse_learn/il/utils/lr_scheduler.py
diff --git a/roboverse_learn/il/utils/common/module_attr_mixin.py b/roboverse_learn/il/utils/module_attr_mixin.py
similarity index 100%
rename from roboverse_learn/il/utils/common/module_attr_mixin.py
rename to roboverse_learn/il/utils/module_attr_mixin.py
diff --git a/roboverse_learn/il/utils/common/nested_dict_util.py b/roboverse_learn/il/utils/nested_dict_util.py
similarity index 100%
rename from roboverse_learn/il/utils/common/nested_dict_util.py
rename to roboverse_learn/il/utils/nested_dict_util.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/normalize_util.py b/roboverse_learn/il/utils/normalize_util.py
similarity index 98%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/normalize_util.py
rename to roboverse_learn/il/utils/normalize_util.py
index 2d7fe825e..8221cb346 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/normalize_util.py
+++ b/roboverse_learn/il/utils/normalize_util.py
@@ -1,10 +1,10 @@
 import numpy as np
-from diffusion_policy.common.pytorch_util import (
+from roboverse_learn.il.utils.pytorch_util import (
     dict_apply,
     dict_apply_reduce,
     dict_apply_split,
 )
-from diffusion_policy.model.common.normalizer import SingleFieldLinearNormalizer
+from roboverse_learn.il.utils.normalizer import SingleFieldLinearNormalizer
 
 
 def get_range_normalizer_from_stat(stat, output_max=1, output_min=-1, range_eps=1e-7):
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/normalizer.py b/roboverse_learn/il/utils/normalizer.py
similarity index 98%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/normalizer.py
rename to roboverse_learn/il/utils/normalizer.py
index 499a5e100..0fa75e2a5 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/model/common/normalizer.py
+++ b/roboverse_learn/il/utils/normalizer.py
@@ -5,8 +5,8 @@
 import torch
 import torch.nn as nn
 import zarr
-from diffusion_policy.common.pytorch_util import dict_apply
-from diffusion_policy.model.common.dict_of_tensor_mixin import DictOfTensorMixin
+from roboverse_learn.il.utils.pytorch_util import dict_apply
+from roboverse_learn.il.utils.dict_of_tensor_mixin import DictOfTensorMixin
 
 
 class LinearNormalizer(DictOfTensorMixin):
diff --git a/roboverse_learn/il/utils/common/pose_trajectory_interpolator.py b/roboverse_learn/il/utils/pose_trajectory_interpolator.py
similarity index 100%
rename from roboverse_learn/il/utils/common/pose_trajectory_interpolator.py
rename to roboverse_learn/il/utils/pose_trajectory_interpolator.py
diff --git a/roboverse_learn/il/utils/common/precise_sleep.py b/roboverse_learn/il/utils/precise_sleep.py
similarity index 100%
rename from roboverse_learn/il/utils/common/precise_sleep.py
rename to roboverse_learn/il/utils/precise_sleep.py
diff --git a/roboverse_learn/il/utils/common/pymunk_override.py b/roboverse_learn/il/utils/pymunk_override.py
similarity index 100%
rename from roboverse_learn/il/utils/common/pymunk_override.py
rename to roboverse_learn/il/utils/pymunk_override.py
diff --git a/roboverse_learn/il/utils/common/pymunk_util.py b/roboverse_learn/il/utils/pymunk_util.py
similarity index 100%
rename from roboverse_learn/il/utils/common/pymunk_util.py
rename to roboverse_learn/il/utils/pymunk_util.py
diff --git a/roboverse_learn/il/utils/common/pytorch_util.py b/roboverse_learn/il/utils/pytorch_util.py
similarity index 100%
rename from roboverse_learn/il/utils/common/pytorch_util.py
rename to roboverse_learn/il/utils/pytorch_util.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/replay_buffer.py b/roboverse_learn/il/utils/replay_buffer.py
similarity index 100%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/replay_buffer.py
rename to roboverse_learn/il/utils/replay_buffer.py
diff --git a/roboverse_learn/il/utils/common/robomimic_config_util.py b/roboverse_learn/il/utils/robomimic_config_util.py
similarity index 100%
rename from roboverse_learn/il/utils/common/robomimic_config_util.py
rename to roboverse_learn/il/utils/robomimic_config_util.py
diff --git a/roboverse_learn/il/utils/common/robomimic_util.py b/roboverse_learn/il/utils/robomimic_util.py
similarity index 100%
rename from roboverse_learn/il/utils/common/robomimic_util.py
rename to roboverse_learn/il/utils/robomimic_util.py
diff --git a/roboverse_learn/il/utils/common/rotation_transformer.py b/roboverse_learn/il/utils/rotation_transformer.py
similarity index 100%
rename from roboverse_learn/il/utils/common/rotation_transformer.py
rename to roboverse_learn/il/utils/rotation_transformer.py
diff --git a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/sampler.py b/roboverse_learn/il/utils/sampler.py
similarity index 98%
rename from roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/sampler.py
rename to roboverse_learn/il/utils/sampler.py
index 2525de5e7..74663c9e9 100644
--- a/roboverse_learn/il/utils/diffusion_policy/diffusion_policy/common/sampler.py
+++ b/roboverse_learn/il/utils/sampler.py
@@ -2,7 +2,7 @@
 
 import numba
 import numpy as np
-from diffusion_policy.common.replay_buffer import ReplayBuffer
+from roboverse_learn.il.utils.replay_buffer import ReplayBuffer
 
 
 @numba.jit(nopython=True)
diff --git a/roboverse_learn/il/utils/common/shape_util.py b/roboverse_learn/il/utils/shape_util.py
similarity index 100%
rename from roboverse_learn/il/utils/common/shape_util.py
rename to roboverse_learn/il/utils/shape_util.py
diff --git a/roboverse_learn/il/utils/common/tensor_util.py b/roboverse_learn/il/utils/tensor_util.py
similarity index 100%
rename from roboverse_learn/il/utils/common/tensor_util.py
rename to roboverse_learn/il/utils/tensor_util.py
diff --git a/roboverse_learn/il/utils/common/timestamp_accumulator.py b/roboverse_learn/il/utils/timestamp_accumulator.py
similarity index 100%
rename from roboverse_learn/il/utils/common/timestamp_accumulator.py
rename to roboverse_learn/il/utils/timestamp_accumulator.py
diff --git a/roboverse_learn/il/vita/README.md b/roboverse_learn/il/vita/README.md
new file mode 100644
index 000000000..ebdefd8fa
--- /dev/null
+++ b/roboverse_learn/il/vita/README.md
@@ -0,0 +1,34 @@
+# VITA Policy (IL)
+
+VITA is a vision-to-action Flow Matching policy built on the shared IL runners under `il/dp/`.
+
+## Install
+
+```bash
+cd roboverse_learn/il/dp
+pip install -r requirements.txt
+```
+
+Training logs to Weights & Biases (`wandb`): create an account and obtain an API key before you start a run.
+
+## Collect and process data
+
+```bash
+./roboverse_learn/il/collect_demo.sh
+```
+
+## Train and eval
+
+Use the shared driver and select the VITA model:
+
+```bash
+export algo_model="vita_model"
+./roboverse_learn/il/dp/dp_run.sh
+```
+
+Inside `dp_run.sh` you can toggle `train_enable` / `eval_enable`, set task names, seeds, GPU id, and checkpoint paths for evaluation.
+
+## References
+
+- Dechen Gao et al., "VITA: Vision-to-Action Flow Matching Policy." (2025).
+- Yaron Lipman et al., "Flow Matching for Generative Modeling." (2023).
diff --git a/roboverse_learn/il/dp/models/vita_policy.py b/roboverse_learn/il/vita/policies/vita_policy.py
similarity index 93%
rename from roboverse_learn/il/dp/models/vita_policy.py
rename to roboverse_learn/il/vita/policies/vita_policy.py
index 367ffe80d..b47e3f225 100644
--- a/roboverse_learn/il/dp/models/vita_policy.py
+++ b/roboverse_learn/il/vita/policies/vita_policy.py
@@ -3,16 +3,15 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from einops import reduce
 
-from roboverse_learn.il.utils.common.normalizer import LinearNormalizer
-from roboverse_learn.il.utils.common.pytorch_util import dict_apply
-from diffusion_policy.policy.base_image_policy import BaseImagePolicy
+from roboverse_learn.il.utils.normalizer import LinearNormalizer
+from roboverse_learn.il.utils.pytorch_util import dict_apply
+from roboverse_learn.il.base.base_image_policy import BaseImagePolicy
 
-from diffusion_policy.model.diffusion.flow_net import SimpleFlowNet
-from diffusion_policy.model.diffusion.action_ae import CNNActionEncoder, SimpleActionDecoder
-from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder
-from diffusion_policy.common.flow_matchers import TorchFlowMatcher
+from roboverse_learn.il.dp.models.diffusion.flow_net import SimpleFlowNet
+from roboverse_learn.il.dp.models.diffusion.action_ae import CNNActionEncoder, SimpleActionDecoder
+from roboverse_learn.il.dp.models.vision.multi_image_obs_encoder import MultiImageObsEncoder
+from roboverse_learn.il.utils.flow_matchers import TorchFlowMatcher
 
 
 class VITAImagePolicy(BaseImagePolicy):
diff --git a/roboverse_learn/il/vita/requirements.txt b/roboverse_learn/il/vita/requirements.txt
new file mode 100644
index 000000000..fb7e62adf
--- /dev/null
+++ b/roboverse_learn/il/vita/requirements.txt
@@ -0,0 +1,19 @@
+zarr==2.12.0
+ipdb
+gpustat
+omegaconf
+hydra-core==1.2.0
+dill==0.3.5.1
+einops==0.4.1
+diffusers
+numba
+moviepy
+imageio
+av
+matplotlib
+termcolor
+huggingface_hub
+pillow
+pandas
+wandb
+torchcfm
diff --git a/roboverse_learn/vla/OpenVLA/vla_eval.py b/roboverse_learn/vla/OpenVLA/vla_eval.py
index acae067ee..b400b84c0 100644
--- a/roboverse_learn/vla/OpenVLA/vla_eval.py
+++ b/roboverse_learn/vla/OpenVLA/vla_eval.py
@@ -24,7 +24,7 @@
 from metasim.utils.demo_util import get_traj
 from metasim.utils.setup_util import get_robot
 from metasim.randomization import DomainRandomizationManager, DRConfig
-from roboverse_learn.il.dp.runner.base_policy import BasePolicyCfg, ActionCfg, ObsCfg, EndEffectorCfg
+from roboverse_learn.il.runner.base_policy import BasePolicyCfg, ActionCfg, ObsCfg, EndEffectorCfg
 
 
 @configclass
diff --git a/roboverse_learn/vla/SmolVLA/smolvla_eval.py b/roboverse_learn/vla/SmolVLA/smolvla_eval.py
index a2f4dd4df..97cd78209 100755
--- a/roboverse_learn/vla/SmolVLA/smolvla_eval.py
+++ b/roboverse_learn/vla/SmolVLA/smolvla_eval.py
@@ -31,7 +31,7 @@
 from metasim.utils.demo_util import get_traj
 from metasim.utils.setup_util import get_robot
 from metasim.randomization import DomainRandomizationManager, DRConfig
-from roboverse_learn.il.dp.runner.base_policy import BasePolicyCfg, ActionCfg, ObsCfg, EndEffectorCfg
+from roboverse_learn.il.runner.base_policy import BasePolicyCfg, ActionCfg, ObsCfg, EndEffectorCfg
 
 
 @configclass
diff --git a/roboverse_learn/vla/pi0/pi_eval.py b/roboverse_learn/vla/pi0/pi_eval.py
index 4437c211f..9d8687c23 100644
--- a/roboverse_learn/vla/pi0/pi_eval.py
+++ b/roboverse_learn/vla/pi0/pi_eval.py
@@ -26,7 +26,7 @@
 
 from openpi_client import image_tools, websocket_client_policy
 
-from roboverse_learn.il.dp.runner.base_policy import ActionCfg, BasePolicyCfg, ObsCfg
+from roboverse_learn.il.runner.base_policy import ActionCfg, BasePolicyCfg, ObsCfg
 
 
 @configclass