From 80cde171dd15ced0a84b5ee8f175a67498448b4d Mon Sep 17 00:00:00 2001
From: Eetu
Date: Wed, 3 May 2023 15:51:42 +0300
Subject: [PATCH] Add option for local obs for MAPPO with CTDE

---
 .../cfg/task/MobileFrankaMARL.yaml           |   2 +
 .../cfg/task/MobileFrankaMARL_cv.yaml        | 133 ++++++++++++++++++
 .../cfg/train/MobileFrankaMARL_cvPPO.yaml    |  88 ++++++++++++
 omniisaacgymenvs/tasks/mobile_franka_marl.py |  62 ++++++--
 omniisaacgymenvs/utils/task_util.py          |   3 +-
 5 files changed, 275 insertions(+), 13 deletions(-)
 create mode 100644 omniisaacgymenvs/cfg/task/MobileFrankaMARL_cv.yaml
 create mode 100644 omniisaacgymenvs/cfg/train/MobileFrankaMARL_cvPPO.yaml

diff --git a/omniisaacgymenvs/cfg/task/MobileFrankaMARL.yaml b/omniisaacgymenvs/cfg/task/MobileFrankaMARL.yaml
index 7d5c0927..6d0ff0cd 100644
--- a/omniisaacgymenvs/cfg/task/MobileFrankaMARL.yaml
+++ b/omniisaacgymenvs/cfg/task/MobileFrankaMARL.yaml
@@ -31,6 +31,8 @@ env:
   actionPenaltyScale: 0.01
   fingerCloseRewardScale: 10.0
 
+  useLocalObs: False
+
 sim:
   dt: 0.0083 # 1/120 s
   use_gpu_pipeline: ${eq:${...pipeline},"gpu"}
diff --git a/omniisaacgymenvs/cfg/task/MobileFrankaMARL_cv.yaml b/omniisaacgymenvs/cfg/task/MobileFrankaMARL_cv.yaml
new file mode 100644
index 00000000..62756436
--- /dev/null
+++ b/omniisaacgymenvs/cfg/task/MobileFrankaMARL_cv.yaml
@@ -0,0 +1,133 @@
+# used to create the object
+name: MobileFrankaMARL
+
+physics_engine: ${..physics_engine}
+
+# if given, will override the device setting in gym.
+env:
+  numEnvs: ${resolve_default:512,${...num_envs}}
+  envSpacing: 3.0
+  episodeLength: 500 # may need tuning
+  enableDebugVis: False
+
+  clipObservations: 7.0
+  clipActions: 1.0
+
+  controlFrequencyInv: 2 # 2 -> 60 Hz control at dt = 1/120 s; may need tuning
+
+  startPositionNoise: 0.0
+  startRotationNoise: 0.0
+
+  numProps: 4
+  aggregateMode: 3
+
+  actionScale: 7.5
+  dofVelocityScale: 0.1
+  distRewardScale: 2.0
+  rotRewardScale: 0.5
+  aroundHandleRewardScale: 10.0
+  openRewardScale: 7.5
+  fingerDistRewardScale: 100.0
+  actionPenaltyScale: 0.01
+  fingerCloseRewardScale: 10.0
+
+  useLocalObs: True
+
+sim:
+  dt: 0.0083 # 1/120 s
+  use_gpu_pipeline: ${eq:${...pipeline},"gpu"}
+  gravity: [0.0, 0.0, -9.81]
+  add_ground_plane: True
+  use_flatcache: True
+  enable_scene_query_support: False
+  disable_contact_processing: False
+
+  # set to True if you use camera sensors in the environment
+  enable_cameras: False
+
+  default_physics_material:
+    static_friction: 1.0
+    dynamic_friction: 1.0
+    restitution: 0.0
+
+  physx:
+    worker_thread_count: ${....num_threads}
+    solver_type: 0 # use PGS, otherwise target velocities are not stable (0: pgs, 1: tgs; default: ${....solver_type})
+    use_gpu: ${eq:${....sim_device},"gpu"} # set to False to run on CPU
+    solver_position_iteration_count: 12
+    solver_velocity_iteration_count: 6
+    contact_offset: 0.005
+    rest_offset: 0.0
+    bounce_threshold_velocity: 0.2
+    friction_offset_threshold: 0.04
+    friction_correlation_distance: 0.025
+    enable_sleeping: True
+    enable_stabilization: True
+    max_depenetration_velocity: 1000.0
+
+    # GPU buffers
+    gpu_max_rigid_contact_count: 524288
+    gpu_max_rigid_patch_count: 33554432
+    gpu_found_lost_pairs_capacity: 524288
+    gpu_found_lost_aggregate_pairs_capacity: 262144
+    gpu_total_aggregate_pairs_capacity: 1048576
+    gpu_max_soft_body_contacts: 1048576
+    gpu_max_particle_contacts: 1048576
+    gpu_heap_capacity: 33554432
+    gpu_temp_buffer_capacity: 16777216
+    gpu_max_num_partitions: 8
+
+  mobile_franka:
+    # -1 to use default values
+    override_usd_defaults: False
+    fixed_base: False
+    enable_self_collisions: True
+    enable_gyroscopic_forces: True
+    # also in stage params
+    # per-actor
+    solver_position_iteration_count: 12
+    solver_velocity_iteration_count: 1
+    sleep_threshold: 0.005
+    stabilization_threshold: 0.001
+    # per-body
+    density: -1
+    max_depenetration_velocity: 1000.0
+    # per-shape
+    contact_offset: 0.005
+    rest_offset: 0.0
+  cabinet:
+    # -1 to use default values
+    override_usd_defaults: False
+    fixed_base: False
+    enable_self_collisions: False
+    enable_gyroscopic_forces: True
+    # also in stage params
+    # per-actor
+    solver_position_iteration_count: 12
+    solver_velocity_iteration_count: 1
+    sleep_threshold: 0.0
+    stabilization_threshold: 0.001
+    # per-body
+    density: -1
+    max_depenetration_velocity: 1000.0
+    # per-shape
+    contact_offset: 0.005
+    rest_offset: 0.0
+  prop:
+    # -1 to use default values
+    override_usd_defaults: False
+    fixed_base: False
+    enable_self_collisions: False
+    enable_gyroscopic_forces: True
+    # also in stage params
+    # per-actor
+    solver_position_iteration_count: 12
+    solver_velocity_iteration_count: 1
+    sleep_threshold: 0.005
+    stabilization_threshold: 0.001
+    # per-body
+    density: 100
+    max_depenetration_velocity: 1000.0
+    # per-shape
+    contact_offset: 0.005
+    rest_offset: 0.0
diff --git a/omniisaacgymenvs/cfg/train/MobileFrankaMARL_cvPPO.yaml b/omniisaacgymenvs/cfg/train/MobileFrankaMARL_cvPPO.yaml
new file mode 100644
index 00000000..25664d4c
--- /dev/null
+++ b/omniisaacgymenvs/cfg/train/MobileFrankaMARL_cvPPO.yaml
@@ -0,0 +1,88 @@
+params:
+  seed: ${...seed}
+  algo:
+    name: a2c_continuous
+
+  model:
+    name: continuous_a2c_logstd
+
+  network:
+    name: actor_critic
+    separate: False
+
+    space:
+      continuous:
+        mu_activation: None
+        sigma_activation: None
+        mu_init:
+          name: default
+        sigma_init:
+          name: const_initializer
+          val: 0
+        fixed_sigma: True
+    mlp:
+      units: [512, 256, 128] #[256, 128, 64]
+      activation: elu
+      d2rl: False
+
+      initializer:
+        name: default
+      regularizer:
+        name: None
+
+  load_checkpoint: ${if:${...checkpoint},True,False} # flag which sets whether to load the checkpoint
+  load_path: ${...checkpoint} # path to the checkpoint to load
+
+  config:
+    name: ${resolve_default:MobileFrankaMARL,${....experiment}}
+    full_experiment_name: ${.name}
+    env_name: rlgpu
+    device: ${....rl_device}
+    device_name: ${....rl_device}
+    ppo: True
+    mixed_precision: False
+    normalize_input: True
+    normalize_value: True
+    num_actors: ${....task.env.numEnvs}
+    reward_shaper:
+      scale_value: 0.01
+    normalize_advantage: True
+    gamma: 0.95
+    tau: 0.95
+    learning_rate: 3e-4
+    lr_schedule: adaptive
+    kl_threshold: 0.008
+    score_to_win: 100000000
+    max_epochs: ${resolve_default:1500,${....max_iterations}}
+    save_best_after: 200
+    save_frequency: 100
+    print_stats: True
+    grad_norm: 1.0
+    entropy_coef: 0.0
+    truncate_grads: True
+    e_clip: 0.2
+    horizon_length: 16
+    minibatch_size: 4096 #128 #1024
+    mini_epochs: 8
+    critic_coef: 4
+    clip_value: True
+    seq_len: 4
+    bounds_loss_coef: 0.0001
+
+    central_value_config:
+      minibatch_size: 2048
+      mini_epochs: 4
+      learning_rate: 3e-4
+      clip_value: False
+      normalize_input: True
+      network:
+        name: actor_critic
+        central_value: True
+        mlp:
+          units: [512, 256, 128]
+          activation: elu
+          initializer:
+            name: default
+            scale: 2
+          regularizer:
+            name: None
diff --git a/omniisaacgymenvs/tasks/mobile_franka_marl.py b/omniisaacgymenvs/tasks/mobile_franka_marl.py
index c0e2d1d5..5dead676 100644
--- a/omniisaacgymenvs/tasks/mobile_franka_marl.py
+++ b/omniisaacgymenvs/tasks/mobile_franka_marl.py
@@ -65,6 +65,8 @@ def __init__(
         self.action_penalty_scale = self._task_cfg["env"]["actionPenaltyScale"]
         self.finger_close_reward_scale = self._task_cfg["env"]["fingerCloseRewardScale"]
 
+        self.use_local_obs = self._task_cfg["env"]["useLocalObs"]
+
         self.distX_offset = 0.04
         #self.dt = 1/60.
 
         # these values depend on the task and how we interface with the real robot
@@ -75,6 +77,10 @@ def __init__(
         self._num_actions = 9
         self._num_agents = 2
 
+        if self.use_local_obs:
+            self._num_observations = 26
+            self._num_states = 27 + 3 # the old 27-dim obs plus base_vel_xy and base_angvel_z
+
         self.initial_target_pos = np.array([2.0, 0.0, 0.5])
 
         # set the ranges for the target randomization
@@ -243,24 +249,56 @@ def get_observations(self) -> dict:
 
         self.to_target = self.target_positions - self.franka_lfinger_pos
 
-        obs = torch.hstack((
-            base_pos_xy,
-            base_yaw,
-            arm_dof_pos_scaled,
-            #base_vel_xy,
-            #base_angvel_z,
-            franka_dof_vel[:, 3:] * self.dof_vel_scale,
-            self.franka_lfinger_pos,
-            self.target_positions
-        )).to(dtype=torch.float32)
+        if self.use_local_obs:
+            # pad base_obs with zeros so it matches the width of arm_obs
+            base_obs = torch.hstack((
+                base_pos_xy,
+                base_yaw,
+                self.franka_lfinger_pos,
+                self.target_positions,
+                torch.zeros((self.num_envs, 15), device=self._device)
+            )).to(dtype=torch.float32)
+
+            arm_obs = torch.hstack((
+                arm_dof_pos_scaled,
+                franka_dof_vel[:, 3:] * self.dof_vel_scale,
+                self.franka_lfinger_pos,
+                self.target_positions
+            )).to(dtype=torch.float32)
+
+            self.states_buf = torch.hstack((
+                base_pos_xy,
+                base_yaw,
+                base_vel_xy,
+                base_angvel_z,
+                arm_dof_pos_scaled,
+                franka_dof_vel[:, 3:] * self.dof_vel_scale,
+                self.franka_lfinger_pos,
+                self.target_positions
+            )).to(dtype=torch.float32)
+
+        else:
+            obs = torch.hstack((
+                base_pos_xy,
+                base_yaw,
+                arm_dof_pos_scaled,
+                #base_vel_xy,
+                #base_angvel_z,
+                franka_dof_vel[:, 3:] * self.dof_vel_scale,
+                self.franka_lfinger_pos,
+                self.target_positions
+            )).to(dtype=torch.float32)
+
+            base_obs = obs
+            arm_obs = obs
 
         #print("obs", obs)
         #input()
 
         base_id = torch.tensor([1.0, 0.0], device=self._device)
         arm_id = torch.tensor([0.0, 1.0], device=self._device)
 
-        base_obs = torch.hstack((obs, base_id.repeat(self.num_envs, 1)))
-        arm_obs = torch.hstack((obs, arm_id.repeat(self.num_envs, 1)))
+        base_obs = torch.hstack((base_obs, base_id.repeat(self.num_envs, 1)))
+        arm_obs = torch.hstack((arm_obs, arm_id.repeat(self.num_envs, 1)))
 
         self.obs_buf = torch.vstack((base_obs, arm_obs))
diff --git a/omniisaacgymenvs/utils/task_util.py b/omniisaacgymenvs/utils/task_util.py
index 78598c77..5f9f1234 100644
--- a/omniisaacgymenvs/utils/task_util.py
+++ b/omniisaacgymenvs/utils/task_util.py
@@ -65,7 +65,8 @@ def initialize_task(config, env, init_sim=True):
         "Jetbot_CNN": JetbotTask,
         "FrankaExample": FrankaExampleTask,
         "MobileFranka": MobileFrankaTask,
-        "MobileFrankaMARL": MobileFrankaMARLTask
+        "MobileFrankaMARL": MobileFrankaMARLTask,
+        "MobileFrankaMARL_cv": MobileFrankaMARLTask
     }
 
     from .config_utils.sim_config import SimConfig
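
Reviewer notes (not part of the patch):

With useLocalObs enabled, each agent's policy sees only a 26-dim local observation (its own features, zero-padded to a shared 24-dim width, plus a 2-dim one-hot agent id), while the critic configured under central_value_config trains on the full 30-dim state in states_buf. Below is a minimal, shape-level sketch of that layout; num_envs and the torch.randn placeholders are made up for illustration, and only the sizes mirror get_observations() above.

    import torch

    num_envs = 4                  # placeholder; the configs use 512 envs
    obs_dim, state_dim = 26, 30   # per-agent obs and global state sizes from this patch

    # base agent: base_pos_xy (2) + base_yaw (1) + lfinger_pos (3) + target_pos (3) = 9,
    # zero-padded by 15 so both agents share one 24-dim feature width
    base_local = torch.hstack((torch.randn(num_envs, 9), torch.zeros(num_envs, 15)))
    # arm agent: 9 scaled joint positions + 9 scaled joint velocities + lfinger_pos + target_pos = 24
    arm_local = torch.randn(num_envs, 24)

    # one-hot ids let the two agents share a single policy network
    base_id = torch.tensor([1.0, 0.0]).repeat(num_envs, 1)
    arm_id = torch.tensor([0.0, 1.0]).repeat(num_envs, 1)

    # actors: agents stacked along the batch dimension, local views only
    obs_buf = torch.vstack((torch.hstack((base_local, base_id)),
                            torch.hstack((arm_local, arm_id))))
    # centralized critic: one unpadded global state per env (the 27 old obs dims + 3 base velocity terms)
    states_buf = torch.randn(num_envs, state_dim)

    assert obs_buf.shape == (2 * num_envs, obs_dim)
    assert states_buf.shape == (num_envs, state_dim)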
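
On the training side, rl_games treats central_value_config as a separate value network trained on the states the env reports (asymmetric actor-critic), so combined with the shared-parameter PPO actor this gives MAPPO-style centralized training with decentralized execution. Assuming this fork keeps the stock OmniIsaacGymEnvs convention of resolving the train config as ${task}PPO, running with task=MobileFrankaMARL_cv should pick up MobileFrankaMARL_cvPPO automatically; otherwise pass train=MobileFrankaMARL_cvPPO explicitly. Since task_util.py maps MobileFrankaMARL_cv to the same MobileFrankaMARLTask class, useLocalObs is the only behavioral switch between the two tasks.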