From ddefa43e0ea9ba04d16c28b58607b276af9b4085 Mon Sep 17 00:00:00 2001 From: stretch-se3-3055 Date: Tue, 20 Aug 2024 15:15:30 -0700 Subject: [PATCH 001/410] update --- src/config/default_planner.yaml | 16 +- src/config/dynav_config.yaml | 76 ++++ src/stretch/agent/zmq_client.py | 126 ++++++ src/stretch/dynav/__init__.py | 1 + src/stretch/dynav/ok_robot_hw/README.md | 38 ++ src/stretch/dynav/ok_robot_hw/camera.py | 61 +++ .../dynav/ok_robot_hw/global_parameters.py | 14 + .../dynav/ok_robot_hw/image_publisher.py | 59 +++ src/stretch/dynav/ok_robot_hw/robot.py | 326 ++++++++++++++ .../dynav/ok_robot_hw/utils/__init__.py | 4 + .../ok_robot_hw/utils/communication_utils.py | 47 ++ .../dynav/ok_robot_hw/utils/grasper_utils.py | 234 ++++++++++ .../dynav/ok_robot_hw/utils/urdf_utils.py | 81 ++++ src/stretch/dynav/ok_robot_hw/utils/utils.py | 21 + src/stretch/dynav/robot_agent_manip_mdp.py | 403 ++++++++++++++++++ src/stretch/dynav/run_manip.py | 96 +++++ src/stretch/dynav/run_ok_nav.py | 138 ++++++ src/stretch/motion/constants.py | 3 +- src/stretch/motion/kinematics.py | 7 +- src/stretch/motion/pinocchio_ik_solver.py | 17 + .../stretch_ros2_bridge/remote/server.py | 4 + 21 files changed, 1763 insertions(+), 9 deletions(-) create mode 100644 src/config/dynav_config.yaml create mode 100644 src/stretch/dynav/__init__.py create mode 100644 src/stretch/dynav/ok_robot_hw/README.md create mode 100644 src/stretch/dynav/ok_robot_hw/camera.py create mode 100644 src/stretch/dynav/ok_robot_hw/global_parameters.py create mode 100644 src/stretch/dynav/ok_robot_hw/image_publisher.py create mode 100644 src/stretch/dynav/ok_robot_hw/robot.py create mode 100644 src/stretch/dynav/ok_robot_hw/utils/__init__.py create mode 100644 src/stretch/dynav/ok_robot_hw/utils/communication_utils.py create mode 100644 src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py create mode 100644 src/stretch/dynav/ok_robot_hw/utils/urdf_utils.py create mode 100644 src/stretch/dynav/ok_robot_hw/utils/utils.py create mode 100644 src/stretch/dynav/robot_agent_manip_mdp.py create mode 100644 src/stretch/dynav/run_manip.py create mode 100644 src/stretch/dynav/run_ok_nav.py diff --git a/src/config/default_planner.yaml b/src/config/default_planner.yaml index 167f8977a..d59703b5a 100644 --- a/src/config/default_planner.yaml +++ b/src/config/default_planner.yaml @@ -49,11 +49,17 @@ motion: angle_threshold: 0.01 # How much the robot has to rotate to be considered "rotating" min_steps_not_moving: 2 # How many steps the robot has to not move before we consider it "stopped" joint_tolerance: - arm: 0.05 - base_x: 0.05 - lift: 0.05 - wrist_roll: 0.25 - wrist_pitch: 0.25 + # arm: 0.005 + # base_x: 0.005 + # lift: 0.005 + # wrist_roll: 0.01 + # wrist_pitch: 0.01 + # wrist_yaw: 0.01 + arm: 0.01 + base_x: 0.01 + lift: 0.01 + wrist_roll: 0.05 + wrist_pitch: 0.05 wrist_yaw: 0.05 head_pan: 0.01 head_tilt: 0.01 diff --git a/src/config/dynav_config.yaml b/src/config/dynav_config.yaml new file mode 100644 index 000000000..1d567b5fe --- /dev/null +++ b/src/config/dynav_config.yaml @@ -0,0 +1,76 @@ +# Encoder setup +# Encoder is used to compute per-object embeddings. 
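+# "clip" with encoder_args "ViT-B/32" selects the OpenAI CLIP ViT-B/32 model; its
+# embeddings are matched against open-vocabulary text queries (see open_vocab_cat_map_file below).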
+encoder: "clip" +encoder_args: "ViT-B/32" +open_vocab_cat_map_file: projects/real_world_ovmm/configs/example_cat_map.json + +# Sparse Voxel Map parameters +voxel_size: 0.1 +# 0.05 seems too low +obs_min_height: 0.1 # Ignore things less than this high when planning motions +obs_max_height: 1.5 # Ignore things over this height (eg ceilings) +obs_min_density: 10 # This many points makes it an obstacle +exp_min_density: 1 + +# Padding +pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles +min_pad_obstacles: 1 # Do not pad LESS than this amount, for safety. + +local_radius: 0.5 # Area around the robot to mark as explored (kind of a hack) +remove_visited_from_obstacles: False +min_depth: 0.25 +max_depth: 2.5 + +# Point cloud cleanup +filters: + smooth_kernel_size: 2 + use_median_filter: True + median_filter_size: 2 + median_filter_max_error: 0.01 + use_derivative_filter: False + derivative_filter_threshold: 0.1 + # use_voxel_filter: True + +# Exploration +# in_place_rotation_steps: 8 +in_place_rotation_steps: 0 + +# TAMP parameters +guarantee_instance_is_reachable: True +plan_with_reachable_instances: True +plan_with_scene_graph: True +max_near_distance: 0.3 + +# Navigation space - used for motion planning and computing goals. +step_size: 0.1 +rotation_step_size: 0.2 +dilate_frontier_size: 2 # Used to shrink the frontier back from the edges of the world +dilate_obstacle_size: 0 # Used when selecting navigation goals +default_expand_frontier_size: 5 # margin along the frontier where final robot position can be +motion_planner: + simplify_plans: False + shortcut_plans: True + shortcut_iter: 100 + +# Trajectory following - how closely we follow intermediate waypoints +# These should be less strict than whatever parameters the low-level controller is using; this will +# make sure that the motions end up looking smooth. +trajectory_pos_err_threshold: 0.15 +trajectory_rot_err_threshold: 0.3 +trajectory_per_step_timeout: 3.0 + +# User interface +# Choose one of: (object_to_find, location_to_place), command, or chat +# Don't use all of them! 
+name: "stretch_demo" # for logging - currently not used +chat: False +start_ui_server: False +vlm_context_length: 20 # How long messages sent to the vlm server can be if we are using it +limited_obs_publish_sleep: 0.5 + +# High level stuff: commands to execute +command: "pick up a bottle and put it on the chair" +name: "stretch" +exploration_steps: 10 +object_to_find: "bottle" +location_to_place: "chair" diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index f89c389f7..2b400cced 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -35,6 +35,9 @@ from stretch.utils.point_cloud import show_point_cloud from stretch.visualization.rerun import RerunVsualizer +from scipy.spatial.transform import Rotation +from stretch.utils.geometry import posquat2sophus, sophus2posquat + # TODO: debug code - remove later if necessary # import faulthandler # faulthandler.enable() @@ -219,6 +222,11 @@ def get_joint_positions(self, timeout: float = 5.0) -> np.ndarray: joint_positions = self._state["joint_positions"] return joint_positions + def get_six_joints(self, timeout: float = 5.0) -> np.ndarray: + """Get the six major joint positions""" + joint_positions = self.get_joint_positions(timeout = timeout) + return np.array(self._extract_joint_pos(joint_positions)) + def get_joint_velocities(self, timeout: float = 5.0) -> np.ndarray: """Get the current joint velocities""" t0 = timeit.default_timer() @@ -266,6 +274,94 @@ def get_base_pose(self, timeout: float = 5.0) -> np.ndarray: xyt = self._state["base_pose"] return xyt + def get_pan_tilt(self): + joint_positions, _, _ = self.get_joint_state() + return joint_positions[HelloStretchIdx.HEAD_PAN], joint_positions[HelloStretchIdx.HEAD_TILT] + + def get_gripper_position(self): + joint_state = self.get_joint_positions() + return joint_state[HelloStretchIdx.GRIPPER] + + def get_ee_pose(self, matrix=False): + q = self.get_joint_positions() + pos, quat = self._robot_model.manip_fk(q) + pos[0] += q[HelloStretchIdx.BASE_X] + + if matrix: + pose = posquat2sophus(pos, quat) + return pose.matrix() + else: + return pos, quat + + def get_frame_pose(self, q: np.ndarray, node_a: str, node_b: str): + return self._robot_model.manip_ik_solver.get_frame_pose(q, node_a, node_b) + + def solve_ik( + self, + pos: List[float], + quat: Optional[List[float]] = None, + relative: bool = False, + initial_cfg: np.ndarray = None, + debug: bool = False, + ) -> Optional[np.ndarray]: + """Solve inverse kinematics appropriately (or at least try to) and get the joint position + that we will be moving to. 
+ + Note: When relative==True, the delta orientation is still defined in the world frame + + Returns None if no solution is found, else returns an executable solution + """ + + pos_ee_curr, quat_ee_curr = self.get_ee_pose() + if quat is None: + quat = [0, 0, 0, 1] if relative else quat_ee_curr + + # Compute IK goal: pose relative to base + pose_desired = posquat2sophus(np.array(pos), np.array(quat)) + + if relative: + pose_base2ee_curr = posquat2sophus(pos_ee_curr, quat_ee_curr) + + pos_desired = pos_ee_curr + pose_input.translation() + so3_desired = pose_input.so3() * pose_base2ee_curr.so3() + quat_desired = R.from_matrix(so3_desired.matrix()).as_quat() + + pose_base2ee_desired = posquat2sophus(pos_desired, quat_desired) + + else: + pose_base2ee_desired = pose_desired + + pos_ik_goal, quat_ik_goal = sophus2posquat(pose_base2ee_desired) + + # Execute joint command + if debug: + print("=== EE goto command ===") + print(f"Initial EE pose: pos={pos_ee_curr}; quat={quat_ee_curr}") + print(f"Input EE pose: pos={np.array(pos)}; quat={np.array(quat)}") + print(f"Desired EE pose: pos={pos_ik_goal}; quat={quat_ik_goal}") + + # Perform IK + full_body_cfg, ik_success, ik_debug_info = self._robot_model.manip_ik( + (pos_ik_goal, quat_ik_goal), q0=initial_cfg + ) + + # Expected to return None if we did not get a solution + if not ik_success or full_body_cfg is None: + return None + # Return a valid solution to the IK problem here + return full_body_cfg + + def _extract_joint_pos(self, q): + """Helper to convert from the general-purpose config including full robot state, into the command space used in just the manip controller. Extracts just lift/arm/wrist information.""" + return [ + q[HelloStretchIdx.BASE_X], + q[HelloStretchIdx.LIFT], + q[HelloStretchIdx.ARM], + q[HelloStretchIdx.WRIST_YAW], + q[HelloStretchIdx.WRIST_PITCH], + q[HelloStretchIdx.WRIST_ROLL], + ] + def robot_to(self, joint_angles: np.ndarray, blocking: bool = False, timeout: float = 10.0): """Move the robot to a particular joint configuration.""" next_action = {"joint": joint_angles, "manip_blocking": blocking} @@ -294,6 +390,14 @@ def head_to( whole_body_q[HelloStretchIdx.HEAD_TILT] = float(head_tilt) self._wait_for_head(whole_body_q) + def look_front(self, blocking: bool = True, timeout: float = 10.0): + """Let robot look to its front.""" + self.head_to(constants.look_front[0], constants.look_front[1], blocking = blocking, timeout = timeout) + + def look_at_ee(self, blocking: bool = True, timeout: float = 10.0): + """Let robot look to its arm.""" + self.head_to(constants.look_at_ee[0], constants.look_at_ee[1], blocking = blocking, timeout = timeout) + def arm_to( self, joint_angles: Optional[np.ndarray] = None, @@ -436,6 +540,13 @@ def navigate_to( next_action = {"xyt": xyt, "nav_relative": relative, "nav_blocking": blocking} self.send_action(next_action, timeout=timeout) + def set_velocity( + self, v: float, w: float + ): + """Move to xyt in global coordinates or relative coordinates.""" + next_action = {"v": v, "w": w} + self.send_action(next_action) + def reset(self): """Reset everything in the robot's internal state""" self._control_mode = None @@ -749,6 +860,21 @@ def get_observation(self): observation.camera_pose = self._obs.get("camera_pose", None) observation.seq_id = self._seq_id return observation + + def get_images(self, compute_xyz = False): + obs = self.get_observation() + if compute_xyz: + return obs.rgb, obs.depth, obs.xyz + else: + return obs.rgb, obs.depth + + def get_camera_K(self): + obs = self.get_observation() + 
        return obs.camera_K

+    def get_head_pose(self):
+        obs = self.get_observation()
+        return obs.camera_pose
 
     def execute_trajectory(
         self,
diff --git a/src/stretch/dynav/__init__.py b/src/stretch/dynav/__init__.py
new file mode 100644
index 000000000..3448d7994
--- /dev/null
+++ b/src/stretch/dynav/__init__.py
@@ -0,0 +1 @@
+from .robot_agent_manip_mdp import RobotAgentMDP
\ No newline at end of file
diff --git a/src/stretch/dynav/ok_robot_hw/README.md b/src/stretch/dynav/ok_robot_hw/README.md
new file mode 100644
index 000000000..3c773b67f
--- /dev/null
+++ b/src/stretch/dynav/ok_robot_hw/README.md
@@ -0,0 +1,38 @@
+# Robot Side Code
+Most of the heavy computation runs on the workstation and communicates with the robot through sockets.
+
+## Preparation to run robot side code
+Our hardware code relies heavily on the robot controller provided by [home-robot](https://github.com/facebookresearch/home-robot).
+
+You need to install home-robot on the Stretch robot by following the instructions provided by [home-robot-hw](https://github.com/facebookresearch/home-robot/blob/main/docs/install_robot.md) before running any robot-side code.
+
+To check whether home-robot is installed properly and to get familiar with running home-robot based code, we recommend trying [these test scripts](https://github.com/facebookresearch/home-robot/blob/main/tests/hw_manual_test.py).
+
+Once you have finished installing home-robot, follow [these steps](../docs/robot-installation.md) to enable OK-Robot to use the home-robot controllers.
+
+The success of the OK-Robot system also depends on robot calibration and an accurate URDF file, so make sure you follow [these calibration steps](../docs/robot-calibration.md) to obtain an accurate URDF for your robot.
+
+## Start home-robot
+```
+stretch_robot_home.py
+roslaunch home_robot_hw startup_stretch_hector_slam.launch
+```
+
+## Start Robot Control
+```
+python run.py -x1 [x1] -y1 [y1] -x2 [x2] -y2 [y2] -ip [your workstation ip]
+```
+
+* **[x1, y1]** - Coordinates of the tape on which the base of the robot sits
+* **[x2, y2]** - Coordinates of the secondary tape
+* **ip** - Your workstation IP; the robot will communicate with this address
+* **np** - Navigation port; the robot listens on this port for planned navigation paths (default: 5555)
+* **mp** - Manipulation port; the robot listens on this port for estimated manipulation poses (default: 5556)
+
+After running run.py, the robot goes through four states in each cycle:
+* Picking Navigation
+* Manipulation
+* Placing Navigation
+* Placing
+
+For each navigation stage, it asks for A [Object Name] and B [Nearby Object Name].
\ No newline at end of file
diff --git a/src/stretch/dynav/ok_robot_hw/camera.py b/src/stretch/dynav/ok_robot_hw/camera.py
new file mode 100644
index 000000000..09def2415
--- /dev/null
+++ b/src/stretch/dynav/ok_robot_hw/camera.py
@@ -0,0 +1,61 @@
+## License: Apache 2.0. See LICENSE file in root directory.
+## Copyright(c) 2017 Intel Corporation. All Rights Reserved.
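+##
+## RealSenseCamera (below) wraps the robot's head camera: capture_image() returns the
+## RGB image, depth image, and per-pixel XYZ points (each rotated 90 degrees), and
+## pixel2d_to_point3d() looks up the 3D point for a selected pixel.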
+ +##################################################### +## Align Depth to Color ## +##################################################### + +import numpy as np +import cv2 +import open3d as o3d +import matplotlib.pyplot as plt + +class RealSenseCamera: + def __init__(self, robot): + self.robot = robot + self.depth_scale = 0.001 + + # Camera intrinsics + intrinsics = self.robot.get_camera_K() + self.fy = intrinsics[0, 0] + self.fx = intrinsics[1, 1] + self.cy = intrinsics[0, 2] + self.cx = intrinsics[1, 2] + print(self.fx, self.fy, self.cx, self.cy) + + # selected ix and iy co-ordinates + self.ix, self.iy = None, None + + def capture_image(self, visualize=False): + self.rgb_image, self.depth_image, self.points = self.robot.get_images(compute_xyz=True) + self.rgb_image = np.rot90(self.rgb_image, k = 1)[:, :, [2, 1, 0]] + self.depth_image = np.rot90(self.depth_image, k = 1) + self.points = np.rot90(self.points, k = 1) + + cv2.imwrite("./images/input.jpg", np.rot90(self.rgb_image, k=-1)) + # cv2.imwrite("depth.jpg", np.rot90(self.depth_image, k=-1)/np.max(self.depth_image)) + self.rgb_image = cv2.cvtColor(self.rgb_image, cv2.COLOR_BGR2RGB) + + if visualize: + fig, ax = plt.subplots(1, 2, figsize=(10,5)) + timer = fig.canvas.new_timer(interval = 5000) #creating a timer object and setting an interval of 3000 milliseconds + timer.add_callback(lambda : plt.close()) + + ax[0].imshow(np.rot90(self.rgb_image, k=-1)) + ax[0].set_title("Color Image") + + ax[1].imshow(np.rot90(self.depth_image, k=-1)) + ax[1].set_title("Depth Image") + + plt.savefig("./images/rgb_dpt.png") + plt.pause(3) + plt.close() + + return self.rgb_image, self.depth_image, self.points + + def pixel2d_to_point3d(self, ix, iy): + return self.points[iy, ix][[1, 0, 2]] + +if __name__ == "__main__": + camera = RealSenseCamera() + camera.capture_image() diff --git a/src/stretch/dynav/ok_robot_hw/global_parameters.py b/src/stretch/dynav/ok_robot_hw/global_parameters.py new file mode 100644 index 000000000..98faca81e --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/global_parameters.py @@ -0,0 +1,14 @@ +POS_TOL = 0.1 +YAW_TOL = 0.2 + +TOP_CAMERA_NODE = "camera_depth_optical_frame" +GRIPPER_FINGERTIP_LEFT_NODE = "link_gripper_fingertip_left" +GRIPPER_FINGERTIP_RIGHT_NODE = "link_gripper_fingertip_right" + +INIT_LIFT_POS = 0.45 +INIT_WRIST_PITCH = -1.57 +INIT_ARM_POS = 0 +INIT_WRIST_ROLL = 0 +INIT_WRIST_YAW = 0 +INIT_HEAD_PAN = -1.57 +INIT_HEAD_TILT = -0.65 diff --git a/src/stretch/dynav/ok_robot_hw/image_publisher.py b/src/stretch/dynav/ok_robot_hw/image_publisher.py new file mode 100644 index 000000000..ab9064a30 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/image_publisher.py @@ -0,0 +1,59 @@ +import zmq +import numpy as np +from PIL import Image as PILImage + +from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array, send_depth_img, recv_depth_img, send_rgb_img, recv_rgb_img + +class ImagePublisher(): + + def __init__(self, camera, socket): + self.camera = camera + self.socket = socket + + def publish_image(self, text, mode, head_tilt=-1): + image, depth, points = self.camera.capture_image() + # camera_pose = self.camera.robot.head.get_pose_in_base_coords() + + rotated_image = np.rot90(image, k=-1) + rotated_depth = np.rot90(depth, k=-1) + rotated_point = np.rot90(points, k=-1) + PILImage.fromarray(rotated_image).save("./test_rgb.png") + np.save("./test_depth", rotated_depth) + + ## Send RGB, depth and camera intrinsics data + send_rgb_img(self.socket, rotated_image) + 
print(self.socket.recv_string()) + send_depth_img(self.socket, rotated_depth) + print(self.socket.recv_string()) + # send_array(self.socket, rotated_image) + # print(self.socket.recv_string()) + # send_array(self.socket, rotated_depth) + # print(self.socket.recv_string()) + send_array(self.socket, np.array([self.camera.fy, self.camera.fx, self.camera.cy, self.camera.cx, int(head_tilt*100)])) + print(self.socket.recv_string()) + + ## Sending Object text and Manipulation mode + self.socket.send_string(text) + print(self.socket.recv_string()) + self.socket.send_string(mode) + print(self.socket.recv_string()) + + ## Waiting for the base and camera transforms to center the object vertically and horizontally + self.socket.send_string("Waiting for gripper pose/ base and head trans from workstation") + translation = recv_array(self.socket) + self.socket.send_string("translation received by robot") + rotation = recv_array(self.socket) + self.socket.send_string("rotation received by robot") + add_data = recv_array(self.socket) + self.socket.send_string(f"Additional data received robot") + + depth = add_data[0] + width = add_data[1] + retry = add_data[2] + print(f"Additional data received - {add_data}") + print("translation: ") + print(translation) + print("rotation: ") + print(rotation) + print(self.socket.recv_string()) + return translation, rotation, depth, width, retry diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py new file mode 100644 index 000000000..ffe3f64c6 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -0,0 +1,326 @@ +import numpy as np +# import PyKDL +import sys +import os + +# from urdf_parser_py.urdf import URDF +from scipy.spatial.transform import Rotation as R +import math +import time +import random +import os + +from stretch.dynav.ok_robot_hw.utils import kdl_tree_from_urdf_model +from stretch.dynav.ok_robot_hw.global_parameters import * + + +OVERRIDE_STATES = {} + +class HelloRobot: + def __init__( + self, + robot, + stretch_client_urdf_file = 'src/config/urdf', + gripper_threshold = 7.0, + stretch_gripper_max = 0.64, + stretch_gripper_min = 0, + end_link = 'link_straight_gripper' + ): + self.STRETCH_GRIPPER_MAX = stretch_gripper_max + self.STRETCH_GRIPPER_MIN = stretch_gripper_min + self.urdf_path = os.path.join(stretch_client_urdf_file, 'stretch.urdf') + + self.GRIPPER_THRESHOLD = gripper_threshold + + print("hello robot starting") + self.head_joint_list = ["joint_fake", "joint_head_pan", "joint_head_tilt"] + self.init_joint_list = ["joint_fake","joint_lift","3","2","1" ,"0","joint_wrist_yaw","joint_wrist_pitch","joint_wrist_roll", "joint_gripper_finger_left"] + + # end_link is the frame of reference node + self.end_link = end_link + self.set_end_link(end_link) + + # Initialize StretchClient controller (from home_robot/src/home_robot_hw/home_robot_hw/remote/api.py) + # self.robot = StretchClient(urdf_path = stretch_client_urdf_file) + self.robot = robot + self.robot.switch_to_manipulation_mode() + # time.sleep(2) + + # Constraining the robots movement + self.clamp = lambda n, minn, maxn: max(min(maxn, n), minn) + + # Joint dictionary for Kinematics + self.setup_kdl() + + + def setup_kdl(self): + """ + Kdl Setup for forward and Inverse Kinematics + """ + import PyKDL + from urdf_parser_py.urdf import URDF + self.joints = {'joint_fake':0} + self.head_joints = {'joint_fake':0} + + # Loading URDF and listing the internediate joints from base to gripper + robot_model = URDF.from_xml_file(self.urdf_path) + self.kdl_tree = 
kdl_tree_from_urdf_model(robot_model) + self.arm_chain = self.kdl_tree.getChain('base_link', self.end_link) + self.joint_array = PyKDL.JntArray(self.arm_chain.getNrOfJoints()) + + # Forward kinematics + self.fk_p_kdl = PyKDL.ChainFkSolverPos_recursive(self.arm_chain) + # Inverse Kinematics + self.ik_v_kdl = PyKDL.ChainIkSolverVel_pinv(self.arm_chain) + self.ik_p_kdl = PyKDL.ChainIkSolverPos_NR(self.arm_chain, self.fk_p_kdl, self.ik_v_kdl) + + + def set_end_link(self, link): + if link == GRIPPER_FINGERTIP_LEFT_NODE or GRIPPER_FINGERTIP_RIGHT_NODE: + self.joint_list = self.init_joint_list + else: + self.joint_list = self.init_joint_list[:-1] + + + def get_joints(self): + """ + Returns all the joint names and values involved in forward kinematics of head and gripper + """ + ## Names of all 13 joints + joint_names = self.init_joint_list + ["joint_gripper_finger_right"] + self.head_joint_list[1:] + self.updateJoints() + joint_values = list(self.joints.values()) + [0] + list(self.head_joints.values())[1:] + + return joint_names, joint_values + + def move_to_position( + self, + lift_pos = None, + arm_pos = None, + base_trans = 0.0, + wrist_yaw = None, + wrist_pitch = None, + wrist_roll = None, + gripper_pos = None, + base_theta = None, + head_tilt = None, + head_pan = None + ): + """ + Moves the robots, base, arm, lift, wrist and head to a desired position. + """ + if base_theta is not None: + self.robot.navigate_to([0, 0, base_theta], relative = True, blocking = True) + return + + # Base, arm and lift state update + target_state = self.robot.get_six_joints() + if not gripper_pos is None: + self.CURRENT_STATE = gripper_pos*(self.STRETCH_GRIPPER_MAX-self.STRETCH_GRIPPER_MIN)+self.STRETCH_GRIPPER_MIN + self.robot.gripper_to(self.CURRENT_STATE, blocking = True) + if not arm_pos is None: + target_state[2] = arm_pos + if not lift_pos is None: + target_state[1] = lift_pos + if base_trans is None: + base_trans = 0 + target_state[0] = base_trans + target_state[0] + + # Wrist state update + if not wrist_yaw is None: + target_state[3] = wrist_yaw + if not wrist_pitch is None: + target_state[4] = min(wrist_pitch, 0.1) + if not wrist_roll is None: + target_state[5] = wrist_roll + + # Actual Movement + print('Target Position', target_state) + self.robot.arm_to(target_state, blocking = True) + print('Actual location', self.robot.get_six_joints()) + + # Head state update and Movement + target_head_pan, target_head_tilt = self.robot.get_pan_tilt() + if not head_tilt is None: + target_head_tilt = head_tilt + if not head_pan is None: + target_head_pan = head_pan + self.robot.head_to(head_tilt = target_head_tilt, head_pan = target_head_pan, blocking = True) + #time.sleep(0.7) + + def pickup(self, width): + """ + Code for grasping the object + Gripper closes gradually until it encounters resistence + """ + next_gripper_pos = width + while True: + self.robot.gripper_to(max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.25), blocking = True) + curr_gripper_pose = self.robot.get_gripper_position() + # print('Robot means to move gripper to', next_gripper_pos * self.STRETCH_GRIPPER_MAX) + # print('Robot actually moves gripper to', curr_gripper_pose) + if next_gripper_pos == -1 or (curr_gripper_pose > max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.25) + 0.015): + break + + if next_gripper_pos > 0: + next_gripper_pos -= 0.25 + else: + next_gripper_pos = -1 + + def updateJoints(self): + """ + update all the current poisitions of joints + """ + state = self.robot.get_six_joints() + origin_dist = state[0] + + # Base to 
gripper joints + self.joints['joint_fake'] = origin_dist + self.joints['joint_lift'] = state[1] + armPos = state[2] + self.joints['3'] = armPos / 4.0 + self.joints['2'] = armPos / 4.0 + self.joints['1'] = armPos / 4.0 + self.joints['0'] = armPos / 4.0 + self.joints['joint_wrist_yaw'] = state[3] + self.joints['joint_wrist_roll'] = state[5] + self.joints['joint_wrist_pitch'] = OVERRIDE_STATES.get('wrist_pitch', state[4]) + + self.joints['joint_gripper_finger_left'] = 0 + + # Head Joints + pan, tilt = self.robot.get_pan_tilt() + self.head_joints['joint_fake'] = origin_dist + self.head_joints['joint_head_pan'] = pan + self.head_joints['joint_head_tilt'] = tilt + + # following function is used to move the robot to a desired joint configuration + def move_to_joints(self, joints, gripper, mode=0, velocities = None): + """ + Given the desrired joints movement this fucntion will the joints accordingly + """ + state = self.robot.get_six_joints() + + # clamp rotational joints between -1.57 to 1.57 + joints['joint_wrist_pitch'] = (joints['joint_wrist_pitch'] + 1.57) % 3.14 - 1.57 + joints['joint_wrist_yaw'] = (joints['joint_wrist_yaw'] + 1.57) % 3.14 - 1.57 + joints['joint_wrist_roll'] = (joints['joint_wrist_roll'] + 1.57) % 3.14 - 1.57 + joints['joint_wrist_pitch'] = self.clamp(joints['joint_wrist_pitch'], -1.57, 0.1) + target_state = [ + joints['joint_fake'], + joints['joint_lift'], + joints['3'] + + joints['2'] + + joints['1'] + + joints['0'], + joints['joint_wrist_yaw'], + joints['joint_wrist_pitch'], + joints['joint_wrist_roll']] + + # Moving only the lift first + if mode == 1: + target1 = state + target1[1] = target_state[1] + self.robot.arm_to(target1, blocking = True) + # time.sleep(1) + + # self.robot.arm_to(target_state, velocities = velocities, blocking = True) + self.robot.arm_to(target_state, blocking = True) + print(f"current state {self.robot.get_six_joints()}") + print(f"target state {target_state}") + # time.sleep(1) + + #NOTE: below code is to fix the pitch drift issue in current hello-robot. Remove it if there is no pitch drift issue + OVERRIDE_STATES['wrist_pitch'] = joints['joint_wrist_pitch'] + + def get_joint_transform(self, node1, node2): + ''' + This function takes two nodes from a robot URDF file as input and + outputs the coordinate frame of node2 relative to the coordinate frame of node1. + + Mainly used for transforming co-ordinates from camera frame to gripper frame. 
+ ''' + import PyKDL + + # Intializing chain -> maintains list of nodes from base link to corresponding nodes + chain1 = self.kdl_tree.getChain('base_link', node1) + chain2 = self.kdl_tree.getChain('base_link', node2) + + # Intializing corresponding joint array and forward chain solvers + joint_array1 = PyKDL.JntArray(chain1.getNrOfJoints()) + joint_array2 = PyKDL.JntArray(chain2.getNrOfJoints()) + + fk_p_kdl1 = PyKDL.ChainFkSolverPos_recursive(chain1) + fk_p_kdl2 = PyKDL.ChainFkSolverPos_recursive(chain2) + + self.updateJoints() + + if node1 == TOP_CAMERA_NODE: + ref_joints1 = self.head_joints + ref_joint1_list = self.head_joint_list + else: + ref_joints1 = self.joints + ref_joint1_list = self.joint_list + + # Updating the joint arrays from self.joints + for joint_index in range(joint_array1.rows()): + joint_array1[joint_index] = ref_joints1[ref_joint1_list[joint_index]] + + for joint_index in range(joint_array2.rows()): + joint_array2[joint_index] = self.joints[self.joint_list[joint_index]] + + # Intializing frames corresponding to nodes + frame1 = PyKDL.Frame() + frame2 = PyKDL.Frame() + + # Calculating current frames of nodes + fk_p_kdl1.JntToCart(joint_array1, frame1) + fk_p_kdl2.JntToCart(joint_array2, frame2) + + # This allows to transform a point in frame1 to frame2 + frame_transform = frame2.Inverse() * frame1 + + return frame_transform, frame2, frame1 + + def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode=0, velocities=None): + """ + Function to move the gripper to a desired translation and rotation + """ + import PyKDL + translation = [translation_tensor[0], translation_tensor[1], translation_tensor[2]] + rotation = rotational_tensor + # print('translation and rotation', translation_tensor, rotational_tensor) + + self.updateJoints() + for joint_index in range(self.joint_array.rows()): + self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] + # print('self.joints', self.joints) + # print('self.joint_array', self.joint_array) + + curr_pose = PyKDL.Frame() # Current pose of gripper in base frame + del_pose = PyKDL.Frame() # Relative Movement of gripper + self.fk_p_kdl.JntToCart(self.joint_array, curr_pose) + rot_matrix = R.from_euler('xyz', rotation, degrees=False).as_matrix() + del_rot = PyKDL.Rotation(PyKDL.Vector(rot_matrix[0][0], rot_matrix[1][0], rot_matrix[2][0]), + PyKDL.Vector(rot_matrix[0][1], rot_matrix[1][1], rot_matrix[2][1]), + PyKDL.Vector(rot_matrix[0][2], rot_matrix[1][2], rot_matrix[2][2])) + del_trans = PyKDL.Vector(translation[0], translation[1], translation[2]) + del_pose.M = del_rot + del_pose.p = del_trans + goal_pose_new = curr_pose*del_pose # Final pose of gripper in base frame + + # Ik to calculate the required joint movements to move the gripper to desired pose + seed_array = PyKDL.JntArray(self.arm_chain.getNrOfJoints()) + self.ik_p_kdl.CartToJnt(seed_array, goal_pose_new, self.joint_array) + + ik_joints = {} + for joint_index in range(self.joint_array.rows()): + ik_joints[self.joint_list[joint_index]] = self.joint_array[joint_index] + + # Actual Movement of joints + self.move_to_joints(ik_joints, gripper, move_mode, velocities) + + # Update joint_values + self.updateJoints() + for joint_index in range(self.joint_array.rows()): + self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] \ No newline at end of file diff --git a/src/stretch/dynav/ok_robot_hw/utils/__init__.py b/src/stretch/dynav/ok_robot_hw/utils/__init__.py new file mode 100644 index 000000000..60bbf771c --- /dev/null +++ 
b/src/stretch/dynav/ok_robot_hw/utils/__init__.py @@ -0,0 +1,4 @@ +from .grasper_utils import * +from .urdf_utils import * +from .communication_utils import * +from .utils import * \ No newline at end of file diff --git a/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py b/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py new file mode 100644 index 000000000..ad2396b70 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py @@ -0,0 +1,47 @@ +import numpy as np +import zmq +import cv2 + +# use zmq to send a numpy array +def send_array(socket, A, flags=0, copy=True, track=False): + """send a numpy array with metadata""" + A = np.array(A) + md = dict( + dtype = str(A.dtype), + shape = A.shape, + ) + socket.send_json(md, flags|zmq.SNDMORE) + return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + +# use zmq to receive a numpy array +def recv_array(socket, flags=0, copy=True, track=False): + """recv a numpy array""" + md = socket.recv_json(flags=flags) + msg = socket.recv(flags=flags, copy=copy, track=track) + A = np.frombuffer(msg, dtype=md['dtype']) + return A.reshape(md['shape']) + +def send_rgb_img(socket, img): + img = img.astype(np.uint8) + encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] + _, img_encoded = cv2.imencode('.jpg', img, encode_param) + socket.send(img_encoded.tobytes()) + +def recv_rgb_img(socket): + img = socket.recv() + img = np.frombuffer(img, dtype=np.uint8) + img = cv2.imdecode(img, cv2.IMREAD_COLOR) + return img + +def send_depth_img(socket, depth_img): + depth_img = (depth_img * 1000).astype(np.uint16) + encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 3] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode('.png', depth_img, encode_param) + socket.send(depth_img_encoded.tobytes()) + +def recv_depth_img(socket): + depth_img = socket.recv() + depth_img = np.frombuffer(depth_img, dtype=np.uint8) + depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) + depth_img = (depth_img / 1000.) 
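+    # depth was encoded as uint16 millimeters by send_depth_img; the division above restores float meters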
+ return depth_img \ No newline at end of file diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py new file mode 100644 index 000000000..937023c57 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -0,0 +1,234 @@ +# import PyKDL +import time +import math + +from stretch.dynav.ok_robot_hw.image_publisher import ImagePublisher +from stretch.dynav.ok_robot_hw.global_parameters import * + +def capture_and_process_image(camera, mode, obj, socket, hello_robot): + import PyKDL + + print('Currently in ' + mode + ' mode and the robot is about to manipulate ' + obj + '.') + + image_publisher = ImagePublisher(camera, socket) + + # Centering the object + head_tilt_angles = [0, -0.1, 0.1] + tilt_retries, side_retries = 1, 0 + retry_flag = True + head_tilt = INIT_HEAD_TILT + head_pan = INIT_HEAD_PAN + + while(retry_flag): + translation, rotation, depth, width, retry_flag = image_publisher.publish_image(obj, mode, head_tilt=head_tilt) + + print(f"retry flag : {retry_flag}") + if (retry_flag == 1): + base_trans = translation[0] + head_tilt += (rotation[0]) + + hello_robot.move_to_position(base_trans=base_trans, + head_pan=head_pan, + head_tilt=head_tilt) + # time.sleep(2) + + elif (retry_flag !=0 and side_retries == 3): + print("Tried in all angles but couldn't succed") + # time.sleep(2) + return None, None, None + + elif (side_retries == 2 and tilt_retries == 3): + hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) + side_retries = 3 + + elif retry_flag == 2: + if (tilt_retries == 3): + if (side_retries == 0): + hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) + side_retries = 1 + else: + hello_robot.move_to_position(base_trans=-0.2, head_tilt=head_tilt) + side_retries = 2 + tilt_retries = 1 + else: + print(f"retrying with head tilt : {head_tilt + head_tilt_angles[tilt_retries]}") + hello_robot.move_to_position(head_pan=head_pan, + head_tilt=head_tilt + head_tilt_angles[tilt_retries]) + tilt_retries += 1 + time.sleep(1) + + if mode == "place": + translation = PyKDL.Vector(-translation[1], -translation[0], -translation[2]) + + if mode == "pick": + return rotation, translation, depth, width + else: + return rotation, translation + + +def move_to_point(robot, point, base_node, gripper_node, move_mode=1, pitch_rotation=0): + """ + Function for moving the gripper to a specific point + """ + import PyKDL + rotation = PyKDL.Rotation(1, 0, 0, 0, 1, 0, 0, 0, 1) + + dest_frame = PyKDL.Frame(rotation, point) + transform, _, _ = robot.get_joint_transform(base_node, gripper_node) + + # Rotation from gripper frame frame to gripper frame + transformed_frame = transform * dest_frame + + transformed_frame.p[2] -= 0.2 + + robot.move_to_pose( + [transformed_frame.p[0], transformed_frame.p[1], transformed_frame.p[2]], + [pitch_rotation, 0, 0], + [1], + move_mode=move_mode + ) + +def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height = 0.03, gripper_depth=0.03, gripper_width = 1): + """ + rotation: Relative rotation of gripper pose w.r.t camera + translation: Relative translation of gripper pose w.r.t camera + base_node: Camera Node + + Supports home robot top down grasping as well + + Graping trajectory steps + 1. Rotation of gripper + 2. Lift the gripper + 3. Move the base such gripper in line with the grasp + 4. 
Gradually Move the gripper to the desired position + """ + import PyKDL + # Transforming the final point from Model camera frame to robot camera frame + point = PyKDL.Vector(-translation[1], -translation[0], translation[2]) + + # Rotation from Camera frame to Model frame + rotation1_bottom = PyKDL.Rotation(0.0000000, -1.0000000, 0.0000000, + -1.0000000, 0.0000000, 0.0000000, + 0.0000000, 0.0000000, 1.0000000) + + # Rotation from model frame to pose frame + rotation1 = PyKDL.Rotation(rotation[0][0], rotation[0][1], rotation[0][2], + rotation[1][0], rotation[1][1], rotation[1][2], + rotation[2][0], rotation[2][1], rotation[2][2]) + + # Rotation from camera frame to pose frame + rotation = rotation1_bottom * rotation1 + + # Relative rotation and translation of grasping point relative to camera + dest_frame = PyKDL.Frame(rotation, point) + + # Camera to gripper frame transformation + cam2gripper_transform, _, _ = robot.get_joint_transform(base_node, gripper_node) + transformed_frame = cam2gripper_transform * dest_frame + + # Lifting the arm to high position as part of pregrasping position + robot.move_to_position(lift_pos = 1.05, head_pan = None, head_tilt = None) + # time.sleep(2) + + # Rotation for aligning Robot gripper frame to Model gripper frame + rotation2_top = PyKDL.Rotation(0, 0, 1, 1, 0, 0, 0, -1, 0) + + # final Rotation of gripper to hold the objet + final_rotation = transformed_frame.M * rotation2_top + # print(f"final rotation - {final_rotation.GetRPY()}") + robot.move_to_position(gripper_pos = gripper_width) + robot.move_to_pose( + [0, 0, 0], + [final_rotation.GetRPY()[0], final_rotation.GetRPY()[1], final_rotation.GetRPY()[2]], + [1], + ) + # time.sleep(1) + + # Final grasping point relative to camera + cam2gripper_transform, _, _ = robot.get_joint_transform(base_node, gripper_node) + transformed_point1 = cam2gripper_transform * point + + # Final grasping point relative to base + cam2base_transform, _, _ = robot.get_joint_transform(base_node, 'base_link') + base_point = cam2base_transform * point + + diff_value = (0.228 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip + transformed_point1[2] -= (diff_value) + ref_diff = (diff_value) + + # Moving gripper to a point that is 0.2m away from the pose center in the line of gripper + print("Moving to 0.2m away point with 4s sleep") + robot.move_to_pose( + [transformed_point1.x(), transformed_point1.y(), transformed_point1.z() - 0.2], + [0, 0, 0], + [1], + move_mode = 1 + ) + # time.sleep(4) + + # Z-Axis of link_straight_gripper points in line of gripper + # So, the z co-ordiante of point w.r.t gripper gives the distance of point from gripper + base2gripper_transform, _, _ = robot.get_joint_transform('base_link', gripper_node) + transformed_point2 = base2gripper_transform * base_point + # print(f"transformed point2 : {transformed_point2}") + curr_diff = transformed_point2.z() + + # The distance between gripper and point is covered gradullay to allow for velocity control when it approaches the object + # Lower velocity helps is not topping the light objects + diff = abs(curr_diff - ref_diff) + velocities = [1.0]*8 + velocities[5:] = [0.03, 0.03, 0.03, 0.03] + velocities[0] = 0.03 + if diff > 0.08: + dist = diff - 0.08 + print("Move to intermediate point with sleep 2s") + robot.move_to_pose( + [0, 0, dist], + [0, 0, 0], + [1] + ) + # time.sleep(2) + base2gripper_transform, _, _ = robot.get_joint_transform('base_link', gripper_node) + # print(f"transformed point3 : 
{base2gripper_transform * base_point}") + diff = diff - dist + + while diff > 0.01: + dist = min(0.03, diff) + print("Move to Secondary intermediate point with sleep 2s") + robot.move_to_pose( + [0, 0, dist], + [0, 0, 0], + [1], + velocities=velocities + ) + # time.sleep(2) + base2gripper_transform, _, _ = robot.get_joint_transform('base_link', gripper_node) + # print(f"transformed point3 : {base2gripper_transform * base_point}") + diff = diff - dist + + # Now the gripper reached the grasping point and starts picking procedure + robot.pickup(gripper_width) + + # Lifts the arm + robot.move_to_position(lift_pos = robot.robot.get_six_joints()[1] + 0.2) + # time.sleep(1) + + # Tucks the gripper so that while moving to place it wont collide with any obstacles + robot.move_to_position(arm_pos = 0.01) + # time.sleep(1) + robot.move_to_position(wrist_pitch = 0.0) + # time.sleep(1) + robot.move_to_position(wrist_yaw = 2.0) + # time.sleep(1) + + # rotate the arm wrist onto the base + if abs(robot.robot.get_six_joints()[3] - 2.0) > 0.1: + robot.move_to_position(wrist_yaw = -2.0) + # time.sleep(1) + + # Put down the arm + robot.move_to_position(lift_pos = 0.85) + if abs(robot.robot.get_six_joints()[3] - 2.0) < 0.1: + robot.move_to_position(wrist_yaw = 2.5) + robot.move_to_position(lift_pos = 0.6) + # time.sleep(1) diff --git a/src/stretch/dynav/ok_robot_hw/utils/urdf_utils.py b/src/stretch/dynav/ok_robot_hw/utils/urdf_utils.py new file mode 100644 index 000000000..7cad7f231 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/utils/urdf_utils.py @@ -0,0 +1,81 @@ +import numpy as np +# import PyKDL + +def euler_to_quat(r, p, y): + sr, sp, sy = np.sin(r/2.0), np.sin(p/2.0), np.sin(y/2.0) + cr, cp, cy = np.cos(r/2.0), np.cos(p/2.0), np.cos(y/2.0) + return [sr*cp*cy - cr*sp*sy, + cr*sp*cy + sr*cp*sy, + cr*cp*sy - sr*sp*cy, + cr*cp*cy + sr*sp*sy] + +def urdf_joint_to_kdl_joint(jnt): + import PyKDL as kdl + origin_frame = urdf_pose_to_kdl_frame(jnt.origin) + if jnt.joint_type == 'fixed': + if hasattr(kdl.Joint, 'Fixed'): + return kdl.Joint(jnt.name, kdl.Joint.Fixed) + else: + return kdl.Joint(jnt.name, getattr(kdl.Joint, 'None')) + axis = kdl.Vector(*jnt.axis) + if jnt.joint_type == 'revolute': + return kdl.Joint(jnt.name, origin_frame.p, + origin_frame.M * axis, kdl.Joint.RotAxis) + if jnt.joint_type == 'continuous': + return kdl.Joint(jnt.name, origin_frame.p, + origin_frame.M * axis, kdl.Joint.RotAxis) + if jnt.joint_type == 'prismatic': + return kdl.Joint(jnt.name, origin_frame.p, + origin_frame.M * axis, kdl.Joint.TransAxis) + print("Unknown joint type: %s." % jnt.joint_type) + return kdl.Joint(jnt.name, kdl.Joint.Fixed) + +def urdf_pose_to_kdl_frame(pose): + import PyKDL as kdl + pos = [0., 0., 0.] + rot = [0., 0., 0.] + if pose is not None: + if pose.position is not None: + pos = pose.position + if pose.rotation is not None: + rot = pose.rotation + return kdl.Frame(kdl.Rotation.Quaternion(*euler_to_quat(*rot)), + kdl.Vector(*pos)) + +def urdf_inertial_to_kdl_rbi(i): + import PyKDL as kdl + origin = urdf_pose_to_kdl_frame(i.origin) + rbi = kdl.RigidBodyInertia(i.mass, origin.p, + kdl.RotationalInertia(i.inertia.ixx, + i.inertia.iyy, + i.inertia.izz, + i.inertia.ixy, + i.inertia.ixz, + i.inertia.iyz)) + return origin.M * rbi + +## +# Returns a PyKDL.Tree generated from a urdf_parser_py.urdf.URDF object. 
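+#
+# Example usage (mirrors HelloRobot.setup_kdl in robot.py; the URDF path is illustrative):
+#   from urdf_parser_py.urdf import URDF
+#   robot_model = URDF.from_xml_file("src/config/urdf/stretch.urdf")
+#   tree = kdl_tree_from_urdf_model(robot_model)
+#   chain = tree.getChain("base_link", "link_straight_gripper")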
+def kdl_tree_from_urdf_model(urdf): + import PyKDL as kdl + root = urdf.get_root() + print(f"root -> {root}") + tree = kdl.Tree(root) + def add_children_to_tree(parent): + if parent in urdf.child_map: + for joint, child_name in urdf.child_map[parent]: + child = urdf.link_map[child_name] + if child.inertial is not None: + kdl_inert = urdf_inertial_to_kdl_rbi(child.inertial) + else: + kdl_inert = kdl.RigidBodyInertia() + kdl_jnt = urdf_joint_to_kdl_joint(urdf.joint_map[joint]) + kdl_origin = urdf_pose_to_kdl_frame(urdf.joint_map[joint].origin) + kdl_sgm = kdl.Segment(child_name, kdl_jnt, + kdl_origin, kdl_inert) + tree.addSegment(kdl_sgm, parent) + add_children_to_tree(child_name) + # print(f"{parent} -> {child_name}") + add_children_to_tree(root) + # print("\n\n\n") + return tree diff --git a/src/stretch/dynav/ok_robot_hw/utils/utils.py b/src/stretch/dynav/ok_robot_hw/utils/utils.py new file mode 100644 index 000000000..1c60e52ee --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/utils/utils.py @@ -0,0 +1,21 @@ +import numpy as np + +def apply_se3_transform(se3_obj, point): + homogeneous_point = np.append(point.flatten(), 1) + print(homogeneous_point) + transformed_homogeneous_point = se3_obj.homogeneous.dot(homogeneous_point) + transformed_point = transformed_homogeneous_point[:3] + + return transformed_point + +def transform_joint_array(joint_array): + n = len(joint_array) + new_joint_array = [] + for i in range(n+3): + if i < 2: + new_joint_array.append(joint_array[i]) + elif i < 6: + new_joint_array.append(joint_array[2]/4.0) + else: + new_joint_array.append(joint_array[i-3]) + return np.array(new_joint_array) \ No newline at end of file diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py new file mode 100644 index 000000000..19b9167cf --- /dev/null +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -0,0 +1,403 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import copy +import datetime +import os +import pickle +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import clip +import numpy as np +import torch +from loguru import logger + +from stretch.core.parameters import Parameters, get_parameters +from stretch.agent import RobotClient + +import zmq + +from matplotlib import pyplot as plt + +from stretch.dynav.ok_robot_hw.global_parameters import * +from stretch.dynav.ok_robot_hw.robot import HelloRobot as Manipulation_Wrapper +from stretch.dynav.ok_robot_hw.camera import RealSenseCamera +from stretch.dynav.ok_robot_hw.utils.grasper_utils import pickup, move_to_point, capture_and_process_image +from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array + +import cv2 + +import threading +from multiprocessing import Process + +class RobotAgentMDP: + """Basic demo code. 
Collects everything that we need to make this work.""" + + _retry_on_fail = False + + def __init__( + self, + robot: RobotClient, + parameters: Dict[str, Any], + ip: str, + image_port: int = 5560, + text_port: int = 5561, + manip_port: int = 5557, + re: int = 1, + ): + print('------------------------YOU ARE NOW RUNNING PEIQI CODES V3-----------------') + self.re = re + if isinstance(parameters, Dict): + self.parameters = Parameters(**parameters) + elif isinstance(parameters, Parameters): + self.parameters = parameters + else: + raise RuntimeError(f"parameters of unsupported type: {type(parameters)}") + self.robot = robot + if re == 1: + stretch_gripper_max = 0.3 + end_link = "link_straight_gripper" + else: + stretch_gripper_max = 0.64 + end_link = "link_gripper_s3_body" + self.transform_node = end_link + self.manip_wrapper = Manipulation_Wrapper(self.robot, stretch_gripper_max = stretch_gripper_max, end_link = end_link) + self.robot.move_to_nav_posture() + + self.normalize_embeddings = True + self.pos_err_threshold = 0.35 + self.rot_err_threshold = 0.4 + self.obs_count = 0 + self.obs_history = [] + self.guarantee_instance_is_reachable = ( + parameters.guarantee_instance_is_reachable + ) + + self.image_sender = ImageSender(ip = ip, image_port = image_port, text_port = text_port, manip_port = manip_port) + + self.look_around_times = [] + self.execute_times = [] + + timestamp = f"{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}" + + # self.head_lock = threading.Lock() + + def look_around(self): + logger.info("Look around to check") + for pan in [0.5, -0.5, -1.5]: + for tilt in [-0.55]: + self.robot.head_to(pan, tilt, blocking = True) + time.sleep(0.3) + self.update() + + def update(self): + """Step the data collector. Get a single observation of the world. Remove bad points, such as those from too far or too near the camera. 
Update the 3d world representation.""" + obs = self.robot.get_observation() + # self.image_sender.send_images(obs) + self.obs_history.append(obs) + self.obs_count += 1 + self.image_sender.send_images(obs) + + + def execute_action( + self, + text: str, + ): + start_time = time.time() + + self.robot.look_front() + self.look_around() + self.robot.look_front() + self.robot.switch_to_navigation_mode() + + start = self.robot.get_base_pose() + print(" Start:", start) + res = self.image_sender.query_text(text, start) + + look_around_finish = time.time() + look_around_take = look_around_finish - start_time + print('Looking around takes ', look_around_take, ' seconds.') + self.look_around_times.append(look_around_take) + print(self.look_around_times) + print(sum(self.look_around_times) / len(self.look_around_times)) + + if len(res) > 0: + print("Plan successful!") + if len(res) > 2 and np.isnan(res[-2]).all(): + blocking = text != '' + self.robot.execute_trajectory( + res[:-2], + pos_err_threshold=self.pos_err_threshold, + rot_err_threshold=self.rot_err_threshold, + blocking = blocking + ) + + execution_finish = time.time() + execution_take = execution_finish - look_around_finish + print('Executing action takes ', execution_take, ' seconds.') + self.execute_times.append(execution_take) + print(self.execute_times) + print(sum(self.execute_times) / len(self.execute_times)) + + return True, res[-1] + else: + self.robot.execute_trajectory( + res, + pos_err_threshold=self.pos_err_threshold, + rot_err_threshold=self.rot_err_threshold, + blocking = False + ) + + execution_finish = time.time() + execution_take = execution_finish - look_around_finish + print('Executing action takes ', execution_take, ' seconds.') + self.execute_times.append(execution_take) + print(self.execute_times) + print(sum(self.execute_times) / len(self.execute_times)) + + return False, None + else: + print("Failed. Try again!") + return None, None + + def run_exploration( + self + ): + """Go through exploration. We use the voxel_grid map created by our collector to sample free space, and then use our motion planner (RRT for now) to get there. At the end, we plan back to (0,0,0). + + Args: + visualize(bool): true if we should do intermediate debug visualizations""" + status, _ = self.execute_action("") + if status is None: + print("Exploration failed! Perhaps nowhere to explore!") + return False + return True + + def navigate(self, text, max_step = 15): + finished = False + step = 0 + end_point = None + while not finished and step < max_step: + print('*' * 20, step, '*' * 20) + step += 1 + finished, end_point = self.execute_action(text) + if finished is None: + print("Navigation failed! The path might be blocked!") + return None + return end_point + + def place(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): + ''' + An API for running placing. 
Calling this API asks the robot to place whatever it is holding
+        onto the object specified by the text query.
+        - hello_robot: a wrapper for the home-robot StretchClient controller
+        - socket: used to communicate with the workstation to get the estimated gripper pose
+        - text: query specifying the target object
+        - transform node: node name for the coordinate system of the target gripper pose (usually the coordinate system on the robot gripper)
+        - base node: node name for the coordinate system of the estimated gripper poses given by anygrasp
+        '''
+        self.robot.switch_to_manipulation_mode()
+        self.robot.look_at_ee()
+        self.manip_wrapper.move_to_position(
+            head_pan=INIT_HEAD_PAN,
+            head_tilt=init_tilt)
+        camera = RealSenseCamera(self.robot)
+
+        time.sleep(2)
+        rotation, translation = capture_and_process_image(
+            camera = camera,
+            mode = 'place',
+            obj = text,
+            socket = self.image_sender.manip_socket,
+            hello_robot = self.manip_wrapper)
+
+        if rotation is None:
+            return False
+        print(rotation)
+
+        # Lift the arm to the top before extending it, to prepare the pre-placing gripper pose
+        self.manip_wrapper.move_to_position(lift_pos=1.05)
+        time.sleep(1.5)
+        self.manip_wrapper.move_to_position(wrist_yaw=0,
+                                            wrist_pitch=0)
+        time.sleep(1.5)
+
+        # Placing the object
+        move_to_point(self.manip_wrapper, translation, base_node, self.transform_node, move_mode=0)
+        time.sleep(1.5)
+        self.manip_wrapper.move_to_position(gripper_pos=1)
+
+        # Lift the arm a little and wiggle the wrist roll in case the object is still attached to the gripper
+        self.manip_wrapper.move_to_position(lift_pos = self.manip_wrapper.robot.get_six_joints()[1] + 0.3)
+        self.manip_wrapper.move_to_position(wrist_roll = 3.)
+        time.sleep(0.8)
+        self.manip_wrapper.move_to_position(wrist_roll = -3.)
+        time.sleep(0.8)
+
+        # Wait for some time and retract the arm
+        self.manip_wrapper.move_to_position(gripper_pos=1,
+                                            lift_pos = 1.05,
+                                            arm_pos = 0)
+        time.sleep(2)
+        self.manip_wrapper.move_to_position(wrist_pitch=-1.57)
+
+        # Shift the base back to the original point, which we know is navigable in the navigation obstacle map
+        self.manip_wrapper.move_to_position(base_trans = -self.manip_wrapper.robot.get_six_joints()[0])
+        return True
+
+    def manipulate(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE):
+        '''
+        An API for running manipulation.
By calling this API, human will ask the robot to pick up objects + specified by text queries A + - hello_robot: a wrapper for home-robot StretchClient controller + - socoket: we use this to communicate with workstation to get estimated gripper pose + - text: queries specifying target object + - transform node: node name for coordinate systems of target gripper pose (usually the coordinate system on the robot gripper) + - base node: node name for coordinate systems of estimated gipper poses given by anygrasp + ''' + + self.robot.switch_to_manipulation_mode() + self.robot.look_at_ee() + + gripper_pos = 1 + + self.manip_wrapper.move_to_position(arm_pos=INIT_ARM_POS, + head_pan=INIT_HEAD_PAN, + head_tilt=init_tilt, + gripper_pos = gripper_pos, + lift_pos=INIT_LIFT_POS, + wrist_pitch = INIT_WRIST_PITCH, + wrist_roll = INIT_WRIST_ROLL, + wrist_yaw = INIT_WRIST_YAW) + + camera = RealSenseCamera(self.robot) + + rotation, translation, depth, width = capture_and_process_image( + camera = camera, + mode = 'pick', + obj = text, + socket = self.image_sender.manip_socket, + hello_robot = self.manip_wrapper) + + print('Predicted width:', width) + + if rotation is None: + return False + + if width < 0.045 and self.re == 3: + gripper_width = 0.45 + if width < 0.075 and self.re == 3: + gripper_width = 0.6 + else: + gripper_width = 1 + + if input('Do you want to do this manipulation? Y or N ') != 'N': + pickup(self.manip_wrapper, rotation, translation, base_node, self.transform_node, gripper_depth = depth, gripper_width = gripper_width) + + # Shift the base back to the original point as we are certain that orginal point is navigable in navigation obstacle map + self.manip_wrapper.move_to_position(base_trans = -self.manip_wrapper.robot.get_six_joints()[0]) + + return True + +def send_array(socket, A, flags=0, copy=True, track=False): + """send a numpy array with metadata""" + A = np.array(A) + md = dict( + dtype = str(A.dtype), + shape = A.shape, + ) + socket.send_json(md, flags|zmq.SNDMORE) + return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + +def recv_array(socket, flags=0, copy=True, track=False): + """recv a numpy array""" + md = socket.recv_json(flags=flags) + msg = socket.recv(flags=flags, copy=copy, track=track) + A = np.frombuffer(msg, dtype=md['dtype']) + return A.reshape(md['shape']) + +def send_rgb_img(socket, img): + img = img.astype(np.uint8) + encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] + _, img_encoded = cv2.imencode('.jpg', img, encode_param) + socket.send(img_encoded.tobytes()) + +def recv_rgb_img(socket): + img = socket.recv() + img = np.frombuffer(img, dtype=np.uint8) + img = cv2.imdecode(img, cv2.IMREAD_COLOR) + return img + +def send_depth_img(socket, depth_img): + depth_img = (depth_img * 1000).astype(np.uint16) + encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 3] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode('.png', depth_img, encode_param) + socket.send(depth_img_encoded.tobytes()) + +def recv_depth_img(socket): + depth_img = socket.recv() + depth_img = np.frombuffer(depth_img, dtype=np.uint8) + depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) + depth_img = (depth_img / 1000.) 
+ return depth_img + +def send_everything(socket, rgb, depth, intrinsics, pose): + send_rgb_img(socket, rgb) + socket.recv_string() + send_depth_img(socket, depth) + socket.recv_string() + send_array(socket, intrinsics) + socket.recv_string() + send_array(socket, pose) + socket.recv_string() + +def recv_everything(socket): + rgb = recv_rgb_img(socket) + socket.send_string('') + depth = recv_depth_img(socket) + socket.send_string('') + intrinsics = recv_array(socket) + socket.send_string('') + pose = recv_array(socket) + socket.send_string('') + return rgb, depth, intrinsics, pose + +class ImageSender: + def __init__(self, + stop_and_photo = False, + ip = '100.108.67.79', + image_port = 5560, + text_port = 5561, + manip_port = 5557, + color_name = "/camera/color", + depth_name = "/camera/aligned_depth_to_color", + camera_name = "/camera_pose", + slop_time_seconds = 0.05, + queue_size = 100, + ): + context = zmq.Context() + self.img_socket = context.socket(zmq.REQ) + self.img_socket.connect('tcp://' + str(ip) + ':' + str(image_port)) + self.text_socket = context.socket(zmq.REQ) + self.text_socket.connect('tcp://' + str(ip) + ':' + str(text_port)) + self.manip_socket = context.socket(zmq.REQ) + self.manip_socket.connect('tcp://' + str(ip) + ':' + str(manip_port)) + + def query_text(self, text, start): + self.text_socket.send_string(text) + self.text_socket.recv_string() + send_array(self.text_socket, start) + return recv_array(self.text_socket) + + def send_images(self, obs): + rgb = obs.rgb + depth = obs.depth + camer_K = obs.camera_K + camera_pose = obs.camera_pose + # data = np.concatenate((depth.shape, rgb.flatten(), depth.flatten(), camer_K.flatten(), camera_pose.flatten())) + # send_array(self.img_socket, data) + send_everything(self.img_socket, rgb, depth, camer_K, camera_pose) + # self.img_socket.recv_string() diff --git a/src/stretch/dynav/run_manip.py b/src/stretch/dynav/run_manip.py new file mode 100644 index 000000000..afb353cce --- /dev/null +++ b/src/stretch/dynav/run_manip.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
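+#
+# run_manip.py: stand-alone manipulation loop. It connects to a RobotClient at
+# 127.0.0.1, loads dynav_config.yaml, builds a RobotAgentMDP, and then repeatedly
+# prompts for an object name to pick up and, optionally, a receptacle name to place it on.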
+import datetime +import sys +import time +import timeit +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import click +import matplotlib.pyplot as plt +import numpy as np +import open3d +import torch +from PIL import Image + +# Mapping and perception +from stretch.core.parameters import Parameters, get_parameters +from stretch.dynav import RobotAgentMDP + +# Chat and UI tools +from stretch.utils.point_cloud import numpy_to_pcd, show_point_cloud +from stretch.agent import RobotClient + +import cv2 +import threading + +import os + +def compute_tilt(camera_xyz, target_xyz): + ''' + a util function for computing robot head tilts so the robot can look at the target object after navigation + - camera_xyz: estimated (x, y, z) coordinates of camera + - target_xyz: estimated (x, y, z) coordinates of the target object + ''' + vector = camera_xyz - target_xyz + return -np.arctan2(vector[2], np.linalg.norm(vector[:2])) + +@click.command() +@click.option("--ip", default='100.108.67.79', type=str) +@click.option("--manual-wait", default=False, is_flag=True) +@click.option("--random-goals", default=False, is_flag=True) +@click.option("--explore-iter", default=-1) +@click.option("--re", default=1, type=int) +@click.option( + "--input-path", + type=click.Path(), + default=None, + help="Input path with default value 'output.npy'", +) +def main( + ip, + manual_wait, + navigate_home: bool = False, + explore_iter: int = 5, + re: int = 1, + input_path: str = None, + **kwargs, +): + """ + Including only some selected arguments here. + + Args: + random_goals(bool): randomly sample frontier goals instead of looking for closest + """ + click.echo("Will connect to a Stretch robot and collect a short trajectory.") + robot = RobotClient(robot_ip = "127.0.0.1") + + print("- Load parameters") + parameters = get_parameters("dynav_config.yaml") + # print(parameters) + if explore_iter >= 0: + parameters["exploration_steps"] = explore_iter + object_to_find, location_to_place = None, None + + print("- Start robot agent with data collection") + demo = RobotAgentMDP( + robot, parameters, ip = ip, re = re + ) + + while input('STOP? Y/N') != 'Y': + text = input('Enter object name: ') + theta = -0.6 + demo.manipulate(text, theta) + + if input('You want to run placing: y/n') == 'n': + continue + text = input('Enter receptacle name: ') + theta = -0.6 + demo.place(text, theta) + + +if __name__ == "__main__": + main() diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py new file mode 100644 index 000000000..9642118ef --- /dev/null +++ b/src/stretch/dynav/run_ok_nav.py @@ -0,0 +1,138 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
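+
+# Full OK-Robot style loop: explore to build a map (mode 'E'), then navigate to a
+# queried object, grasp it, and place it on a queried receptacle; 'S' stops the
+# loop. Example invocation (the module path below is an assumption, adjust to
+# your setup):
+#
+#   python -m stretch.dynav.run_ok_nav --ip 100.108.67.79 --explore-iter 3
+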
+import datetime +import sys +import time +import timeit +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import click +import matplotlib.pyplot as plt +import numpy as np +import open3d +import torch +from PIL import Image + +# Mapping and perception +from stretch.core.parameters import Parameters, get_parameters +from stretch.dynav import RobotAgentMDP + +# Chat and UI tools +from stretch.utils.point_cloud import numpy_to_pcd, show_point_cloud +from stretch.agent import RobotClient + +import cv2 +import threading + +import os + +def compute_tilt(camera_xyz, target_xyz): + ''' + a util function for computing robot head tilts so the robot can look at the target object after navigation + - camera_xyz: estimated (x, y, z) coordinates of camera + - target_xyz: estimated (x, y, z) coordinates of the target object + ''' + vector = camera_xyz - target_xyz + return -np.arctan2(vector[2], np.linalg.norm(vector[:2])) + +@click.command() +@click.option("--ip", default='100.108.67.79', type=str) +@click.option("--manual-wait", default=False, is_flag=True) +@click.option("--random-goals", default=False, is_flag=True) +@click.option("--explore-iter", default=-1) +@click.option("--re", default=1, type=int) +@click.option( + "--input-path", + type=click.Path(), + default=None, + help="Input path with default value 'output.npy'", +) +def main( + ip, + manual_wait, + navigate_home: bool = False, + explore_iter: int = 5, + re: int = 1, + input_path: str = None, + **kwargs, +): + """ + Including only some selected arguments here. + + Args: + random_goals(bool): randomly sample frontier goals instead of looking for closest + """ + click.echo("Will connect to a Stretch robot and collect a short trajectory.") + robot = RobotClient(robot_ip = "127.0.0.1") + + print("- Load parameters") + parameters = get_parameters("dynav_config.yaml") + # print(parameters) + if explore_iter >= 0: + parameters["exploration_steps"] = explore_iter + object_to_find, location_to_place = None, None + robot.move_to_nav_posture() + robot.set_velocity(v = 15., w = 8.) + + print("- Start robot agent with data collection") + demo = RobotAgentMDP( + robot, parameters, ip = ip, re = re + ) + + while True: + mode = input('select mode? E/N/S') + if mode == 'S': + break + if mode == 'E': + robot.switch_to_navigation_mode() + for epoch in range(explore_iter): + print('\n', 'Exploration epoch ', epoch, '\n') + if not demo.run_exploration(): + print('Exploration failed! 
Quitting!') + break + else: + robot.move_to_nav_posture() + robot.switch_to_navigation_mode() + text = input('Enter object name: ') + point = demo.navigate(text) + if point is None: + print('Navigation Failure!') + continue + robot.switch_to_navigation_mode() + xyt = robot.get_base_pose() + xyt[2] = xyt[2] + np.pi / 2 + robot.navigate_to(xyt, blocking = True) + cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) + + if input('You want to run manipulation: y/n') == 'n': + continue + camera_xyz = robot.get_head_pose()[:3, 3] + theta = compute_tilt(camera_xyz, point) + demo.manipulate(text, theta) + + robot.switch_to_navigation_mode() + if input('You want to run placing: y/n') == 'n': + continue + text = input('Enter receptacle name: ') + point = demo.navigate(text) + if point is None: + print('Navigation Failure') + continue + robot.switch_to_navigation_mode() + xyt = robot.get_base_pose() + xyt[2] = xyt[2] + np.pi / 2 + robot.navigate_to(xyt, blocking = True) + cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) + + if input('You want to run placing: y/n') == 'n': + continue + camera_xyz = robot.get_head_pose()[:3, 3] + theta = compute_tilt(camera_xyz, point) + demo.place(text, theta) + + +if __name__ == "__main__": + main() diff --git a/src/stretch/motion/constants.py b/src/stretch/motion/constants.py index 4921551df..089066f22 100644 --- a/src/stretch/motion/constants.py +++ b/src/stretch/motion/constants.py @@ -15,7 +15,8 @@ # Stretch stuff PLANNER_STRETCH_URDF = get_full_config_path("urdf/planner_calibrated.urdf") -MANIP_STRETCH_URDF = get_full_config_path("urdf/stretch_manip_mode.urdf") +# MANIP_STRETCH_URDF = get_full_config_path("urdf/stretch_manip_mode.urdf") +MANIP_STRETCH_URDF = get_full_config_path("urdf/stretch.urdf") # This is the gripper, and the distance in the gripper frame to where the fingers will roughly meet STRETCH_GRASP_FRAME = "link_grasp_center" diff --git a/src/stretch/motion/kinematics.py b/src/stretch/motion/kinematics.py index eb10c294c..391c7ebe2 100644 --- a/src/stretch/motion/kinematics.py +++ b/src/stretch/motion/kinematics.py @@ -133,7 +133,8 @@ class HelloStretchKinematics: default_ee_link_name = "link_grasp_center" default_manip_mode_controlled_joints = [ - "base_x_joint", + # "base_x_joint", + "joint_fake", "joint_lift", "joint_arm_l3", "joint_arm_l2", @@ -419,14 +420,14 @@ def interpolate(self, q0, qg, step=None, xy_tol=0.05, theta_tol=0.01): for qi, ai in self.interpolate_arm(qi, qg, step): yield qi, ai - def manip_fk(self, q: np.ndarray = None) -> Tuple[np.ndarray, np.ndarray]: + def manip_fk(self, q: np.ndarray = None, node: str = None) -> Tuple[np.ndarray, np.ndarray]: """manipulator specific forward kinematics; uses separate URDF than the full-body fk() method""" assert q.shape == (self.dof,) if "pinocchio" in self._ik_type: q = self._ros_pose_to_pinocchio(q) - ee_pos, ee_quat = self.manip_ik_solver.compute_fk(q) + ee_pos, ee_quat = self.manip_ik_solver.compute_fk(q, node) return ee_pos.copy(), ee_quat.copy() def update_head(self, qi: np.ndarray, look_at) -> np.ndarray: diff --git a/src/stretch/motion/pinocchio_ik_solver.py b/src/stretch/motion/pinocchio_ik_solver.py index bc6561f74..771c1562c 100644 --- a/src/stretch/motion/pinocchio_ik_solver.py +++ b/src/stretch/motion/pinocchio_ik_solver.py @@ -139,6 +139,23 @@ def _qmap_model2control(self, q_input: np.ndarray) -> np.ndarray: return q_out + def get_frame_pose(self, config: Union[np.ndarray, dict], node_a: str, node_b: str, ignore_missing_joints: bool = 
False): + ''' + Get a transformation matrix transforming from node_a frame to node_b frame + ''' + q_model = self._qmap_control2model(config, ignore_missing_joints=ignore_missing_joints) + print('q_model', q_model) + pinocchio.forwardKinematics(self.model, self.data, q_model) + frame_idx1 = [f.name for f in self.model.frames].index(node_a) + frame_idx2 = [f.name for f in self.model.frames].index(node_b) + pinocchio.updateFramePlacement(self.model, self.data, frame_idx1) + placement_frame1 = self.data.oMf[frame_idx1] + pinocchio.updateFramePlacement(self.model, self.data, frame_idx2) + placement_frame2 = self.data.oMf[frame_idx2] + print('pin 1', placement_frame1) + print('pin 2', placement_frame2) + return placement_frame2.inverse() * placement_frame1 + def compute_fk( self, config: np.ndarray, link_name: str = None, ignore_missing_joints: bool = False ) -> Tuple[np.ndarray, np.ndarray]: diff --git a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py index 8340bde15..aebae197c 100755 --- a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py +++ b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py @@ -241,6 +241,10 @@ def spin_recv(self): action["xyt"], relative=action["nav_relative"], ) + elif "v" in action and "w" in action: + if self.verbose: + print("Setting navigation velocities v and w!") + self.client.nav.set_velocity(v = float(action['v']), w = float(action['w'])) elif "joint" in action: # This allows for executing motor commands on the robot relatively quickly if self.verbose: From 71a5f770e515f68957ec65ca00b49cc9a3042235 Mon Sep 17 00:00:00 2001 From: stretch-se3-3055 Date: Mon, 26 Aug 2024 15:55:36 -0700 Subject: [PATCH 002/410] speed up manipulation --- src/config/default_planner.yaml | 24 +++++------ src/stretch/agent/zmq_client.py | 5 ++- src/stretch/dynav/ok_robot_hw/robot.py | 20 +++++----- .../dynav/ok_robot_hw/utils/grasper_utils.py | 35 ++++++++-------- src/stretch/dynav/robot_agent_manip_mdp.py | 40 ++++++++++--------- src/stretch/dynav/run_ok_nav.py | 10 +++++ 6 files changed, 77 insertions(+), 57 deletions(-) diff --git a/src/config/default_planner.yaml b/src/config/default_planner.yaml index d59703b5a..408baa633 100644 --- a/src/config/default_planner.yaml +++ b/src/config/default_planner.yaml @@ -49,18 +49,18 @@ motion: angle_threshold: 0.01 # How much the robot has to rotate to be considered "rotating" min_steps_not_moving: 2 # How many steps the robot has to not move before we consider it "stopped" joint_tolerance: - # arm: 0.005 - # base_x: 0.005 - # lift: 0.005 - # wrist_roll: 0.01 - # wrist_pitch: 0.01 - # wrist_yaw: 0.01 - arm: 0.01 - base_x: 0.01 - lift: 0.01 - wrist_roll: 0.05 - wrist_pitch: 0.05 - wrist_yaw: 0.05 + arm: 0.005 + base_x: 0.005 + lift: 0.005 + wrist_roll: 0.01 + wrist_pitch: 0.01 + wrist_yaw: 0.01 + # arm: 0.05 + # base_x: 0.05 + # lift: 0.05 + # wrist_roll: 0.25 + # wrist_pitch: 0.25 + # wrist_yaw: 0.05 head_pan: 0.01 head_tilt: 0.01 joint_thresholds: diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 2b400cced..834bd12ae 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -794,7 +794,9 @@ def _wait_for_action( time.sleep(0.01) t1 = timeit.default_timer() if t1 - t0 > timeout: - raise RuntimeError(f"Timeout waiting for block with step id = {block_id}") + print(f"Timeout waiting for block with step id = {block_id}") + break + # raise RuntimeError(f"Timeout waiting for block with step id 
= {block_id}") def in_manipulation_mode(self) -> bool: """is the robot ready to grasp""" @@ -886,6 +888,7 @@ def execute_trajectory( per_waypoint_timeout: float = 10.0, final_timeout: float = 10.0, relative: bool = False, + blocking: bool = False ): """Execute a multi-step trajectory; this is always blocking since it waits to reach each one in turn.""" diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index ffe3f64c6..427cec634 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -13,6 +13,8 @@ from stretch.dynav.ok_robot_hw.utils import kdl_tree_from_urdf_model from stretch.dynav.ok_robot_hw.global_parameters import * +# from stretch.utils.geometry import angle_difference +# import timeit OVERRIDE_STATES = {} @@ -104,7 +106,8 @@ def move_to_position( gripper_pos = None, base_theta = None, head_tilt = None, - head_pan = None + head_pan = None, + blocking = True, ): """ Moves the robots, base, arm, lift, wrist and head to a desired position. @@ -117,7 +120,7 @@ def move_to_position( target_state = self.robot.get_six_joints() if not gripper_pos is None: self.CURRENT_STATE = gripper_pos*(self.STRETCH_GRIPPER_MAX-self.STRETCH_GRIPPER_MIN)+self.STRETCH_GRIPPER_MIN - self.robot.gripper_to(self.CURRENT_STATE, blocking = True) + self.robot.gripper_to(self.CURRENT_STATE, blocking = blocking) if not arm_pos is None: target_state[2] = arm_pos if not lift_pos is None: @@ -136,7 +139,7 @@ def move_to_position( # Actual Movement print('Target Position', target_state) - self.robot.arm_to(target_state, blocking = True) + self.robot.arm_to(target_state, blocking = blocking) print('Actual location', self.robot.get_six_joints()) # Head state update and Movement @@ -145,7 +148,7 @@ def move_to_position( target_head_tilt = head_tilt if not head_pan is None: target_head_pan = head_pan - self.robot.head_to(head_tilt = target_head_tilt, head_pan = target_head_pan, blocking = True) + self.robot.head_to(head_tilt = target_head_tilt, head_pan = target_head_pan, blocking = blocking) #time.sleep(0.7) def pickup(self, width): @@ -155,15 +158,15 @@ def pickup(self, width): """ next_gripper_pos = width while True: - self.robot.gripper_to(max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.25), blocking = True) + self.robot.gripper_to(max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.2), blocking = True) curr_gripper_pose = self.robot.get_gripper_position() # print('Robot means to move gripper to', next_gripper_pos * self.STRETCH_GRIPPER_MAX) # print('Robot actually moves gripper to', curr_gripper_pose) - if next_gripper_pos == -1 or (curr_gripper_pose > max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.25) + 0.015): + if next_gripper_pos == -1 or (curr_gripper_pose > max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.2) + 0.1): break if next_gripper_pos > 0: - next_gripper_pos -= 0.25 + next_gripper_pos -= 0.4 else: next_gripper_pos = -1 @@ -222,7 +225,6 @@ def move_to_joints(self, joints, gripper, mode=0, velocities = None): target1 = state target1[1] = target_state[1] self.robot.arm_to(target1, blocking = True) - # time.sleep(1) # self.robot.arm_to(target_state, velocities = velocities, blocking = True) self.robot.arm_to(target_state, blocking = True) @@ -323,4 +325,4 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode # Update joint_values self.updateJoints() for joint_index in range(self.joint_array.rows()): - self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] \ No 
newline at end of file + self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index 937023c57..fa7961d0b 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -34,8 +34,10 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): elif (retry_flag !=0 and side_retries == 3): print("Tried in all angles but couldn't succed") - # time.sleep(2) - return None, None, None + if mode == 'pick': + return None, None, None, None + else: + return None, None elif (side_retries == 2 and tilt_retries == 3): hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) @@ -55,7 +57,6 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): hello_robot.move_to_position(head_pan=head_pan, head_tilt=head_tilt + head_tilt_angles[tilt_retries]) tilt_retries += 1 - time.sleep(1) if mode == "place": translation = PyKDL.Vector(-translation[1], -translation[0], -translation[2]) @@ -127,7 +128,7 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height transformed_frame = cam2gripper_transform * dest_frame # Lifting the arm to high position as part of pregrasping position - robot.move_to_position(lift_pos = 1.05, head_pan = None, head_tilt = None) + robot.move_to_position(lift_pos = 1.05, gripper_pos = gripper_width, head_pan = None, head_tilt = None) # time.sleep(2) # Rotation for aligning Robot gripper frame to Model gripper frame @@ -136,7 +137,6 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # final Rotation of gripper to hold the objet final_rotation = transformed_frame.M * rotation2_top # print(f"final rotation - {final_rotation.GetRPY()}") - robot.move_to_position(gripper_pos = gripper_width) robot.move_to_pose( [0, 0, 0], [final_rotation.GetRPY()[0], final_rotation.GetRPY()[1], final_rotation.GetRPY()[2]], @@ -188,7 +188,7 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height [1] ) # time.sleep(2) - base2gripper_transform, _, _ = robot.get_joint_transform('base_link', gripper_node) + # base2gripper_transform, _, _ = robot.get_joint_transform('base_link', gripper_node) # print(f"transformed point3 : {base2gripper_transform * base_point}") diff = diff - dist @@ -202,7 +202,7 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height velocities=velocities ) # time.sleep(2) - base2gripper_transform, _, _ = robot.get_joint_transform('base_link', gripper_node) + # base2gripper_transform, _, _ = robot.get_joint_transform('base_link', gripper_node) # print(f"transformed point3 : {base2gripper_transform * base_point}") diff = diff - dist @@ -218,17 +218,18 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # time.sleep(1) robot.move_to_position(wrist_pitch = 0.0) # time.sleep(1) - robot.move_to_position(wrist_yaw = 2.0) + robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.9), wrist_yaw = 2.5) + robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.6)) # time.sleep(1) # rotate the arm wrist onto the base - if abs(robot.robot.get_six_joints()[3] - 2.0) > 0.1: - robot.move_to_position(wrist_yaw = -2.0) - # time.sleep(1) - - # Put down the arm - robot.move_to_position(lift_pos = 0.85) - if abs(robot.robot.get_six_joints()[3] - 2.0) < 0.1: - robot.move_to_position(wrist_yaw = 2.5) - 
robot.move_to_position(lift_pos = 0.6) + # if abs(robot.robot.get_six_joints()[3] - 2.0) > 0.1: + # robot.move_to_position(wrist_yaw = -2.0) + # # time.sleep(1) + + # # Put down the arm + # robot.move_to_position(lift_pos = 0.85) + # if abs(robot.robot.get_six_joints()[3] - 2.0) < 0.1: + # robot.move_to_position(wrist_yaw = 2.5) + # robot.move_to_position(lift_pos = 0.6) # time.sleep(1) diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 19b9167cf..219b4e15c 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -43,8 +43,8 @@ def __init__( robot: RobotClient, parameters: Dict[str, Any], ip: str, - image_port: int = 5560, - text_port: int = 5561, + image_port: int = 5555, + text_port: int = 5556, manip_port: int = 5557, re: int = 1, ): @@ -90,9 +90,16 @@ def look_around(self): for pan in [0.5, -0.5, -1.5]: for tilt in [-0.55]: self.robot.head_to(pan, tilt, blocking = True) - time.sleep(0.3) self.update() + def rotate_in_place(self): + logger.info("Rotate in place") + xyt = self.robot.get_base_pose() + for i in range(8): + xyt[2] += 2 * np.pi / 8 + self.robot.navigate_to(xyt, blocking = True) + self.update() + def update(self): """Step the data collector. Get a single observation of the world. Remove bad points, such as those from too far or too near the camera. Update the 3d world representation.""" obs = self.robot.get_observation() @@ -108,9 +115,9 @@ def execute_action( ): start_time = time.time() - self.robot.look_front() + # self.robot.look_front() self.look_around() - self.robot.look_front() + # self.robot.look_front() self.robot.switch_to_navigation_mode() start = self.robot.get_base_pose() @@ -127,12 +134,12 @@ def execute_action( if len(res) > 0: print("Plan successful!") if len(res) > 2 and np.isnan(res[-2]).all(): - blocking = text != '' + # blocking = text != '' self.robot.execute_trajectory( res[:-2], pos_err_threshold=self.pos_err_threshold, rot_err_threshold=self.rot_err_threshold, - blocking = blocking + blocking = True ) execution_finish = time.time() @@ -148,7 +155,7 @@ def execute_action( res, pos_err_threshold=self.pos_err_threshold, rot_err_threshold=self.rot_err_threshold, - blocking = False + blocking = True ) execution_finish = time.time() @@ -206,7 +213,7 @@ def place(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): head_tilt=init_tilt) camera = RealSenseCamera(self.robot) - time.sleep(2) + # time.sleep(2) rotation, translation = capture_and_process_image( camera = camera, mode = 'place', @@ -220,28 +227,25 @@ def place(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): # lift arm to the top before the robot extends the arm, prepare the pre-placing gripper pose self.manip_wrapper.move_to_position(lift_pos=1.05) - time.sleep(1.5) self.manip_wrapper.move_to_position(wrist_yaw=0, wrist_pitch=0) - time.sleep(1.5) # Placing the object move_to_point(self.manip_wrapper, translation, base_node, self.transform_node, move_mode=0) - time.sleep(1.5) - self.manip_wrapper.move_to_position(gripper_pos=1) + self.manip_wrapper.move_to_position(gripper_pos=1, blocking = False) + time.sleep(0.1) # Lift the arm a little bit, and rotate the wrist roll of the robot in case the object attached on the gripper self.manip_wrapper.move_to_position(lift_pos = self.manip_wrapper.robot.get_six_joints()[1] + 0.3) - self.manip_wrapper.move_to_position(wrist_roll = 3.) - time.sleep(0.8) - self.manip_wrapper.move_to_position(wrist_roll = -3.) 
- time.sleep(0.8) + self.manip_wrapper.move_to_position(wrist_roll = 3., blocking = False) + time.sleep(0.5) + self.manip_wrapper.move_to_position(wrist_roll = -3., blocking = False) + time.sleep(0.5) # Wait for some time and shrink the arm back self.manip_wrapper.move_to_position(gripper_pos=1, lift_pos = 1.05, arm_pos = 0) - time.sleep(2) self.manip_wrapper.move_to_position(wrist_pitch=-1.57) # Shift the base back to the original point as we are certain that orginal point is navigable in navigation obstacle map diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 9642118ef..18d706b68 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -82,6 +82,16 @@ def main( robot, parameters, ip = ip, re = re ) + demo.rotate_in_place() + + # def keep_looking_around(): + # while True: + # demo.look_around() + + # img_thread = threading.Thread(target=keep_looking_around) + # img_thread.daemon = True + # img_thread.start() + while True: mode = input('select mode? E/N/S') if mode == 'S': From 067bf040ae08e265bcf079931828f3caf8d5b8ae Mon Sep 17 00:00:00 2001 From: stretch-se3-3055 Date: Wed, 28 Aug 2024 23:01:08 -0700 Subject: [PATCH 003/410] fix bug --- src/stretch/agent/zmq_client.py | 5 +- src/stretch/dynav/ok_robot_hw/robot.py | 138 +++++++++++++++++---- src/stretch/dynav/robot_agent_manip_mdp.py | 6 +- 3 files changed, 120 insertions(+), 29 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index eb9b308c7..a21f44f98 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -282,8 +282,9 @@ def get_gripper_position(self): joint_state = self.get_joint_positions() return joint_state[HelloStretchIdx.GRIPPER] - def get_ee_pose(self, matrix=False, link_name=None): - q = self.get_joint_positions() + def get_ee_pose(self, matrix=False, link_name=None, q = None): + if q is None: + q = self.get_joint_positions() pos, quat = self._robot_model.manip_fk(q, node = link_name) if matrix: diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index 43b06d2b7..d09f2d488 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -16,6 +16,7 @@ # from stretch.utils.geometry import angle_difference # import timeit import pinocchio as pin +from stretch.motion.kinematics import HelloStretchIdx OVERRIDE_STATES = {} @@ -62,6 +63,8 @@ def setup_kdl(self): """ import PyKDL from urdf_parser_py.urdf import URDF + self.joints = {'joint_fake':0} + self.head_joints = {'joint_fake':0} self.joints_pin = {'joint_fake':0} # Loading URDF and listing the internediate joints from base to gripper @@ -179,6 +182,23 @@ def updateJoints(self): # Head Joints pan, tilt = self.robot.get_pan_tilt() + + self.joints['joint_fake'] = origin_dist + self.joints['joint_lift'] = state[1] + armPos = state[2] + self.joints['3'] = armPos / 4.0 + self.joints['2'] = armPos / 4.0 + self.joints['1'] = armPos / 4.0 + self.joints['0'] = armPos / 4.0 + self.joints['joint_wrist_yaw'] = state[3] + self.joints['joint_wrist_roll'] = state[5] + self.joints['joint_wrist_pitch'] = OVERRIDE_STATES.get('wrist_pitch', state[4]) + + self.joints['joint_gripper_finger_left'] = 0 + + self.head_joints['joint_fake'] = origin_dist + self.head_joints['joint_head_pan'] = pan + self.head_joints['joint_head_tilt'] = tilt self.joints_pin['joint_fake'] = origin_dist self.joints_pin['joint_lift'] = state[1] @@ -242,29 +262,75 @@ def get_joint_transform(self, node1, node2): ''' 
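+        # The relative transform is computed with Pinocchio via get_frame_pose();
+        # the PyKDL chain solvers below are only used to print a cross-check of
+        # that result before the Pinocchio frame is returned.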
# return frame_transform, frame2, frame1 + self.updateJoints() + print('joints pin', self.joints_pin) frame_pin = self.robot.get_frame_pose(self.joints_pin, node1, node2) + + import PyKDL + + # Intializing chain -> maintains list of nodes from base link to corresponding nodes + chain1 = self.kdl_tree.getChain('base_link', node1) + chain2 = self.kdl_tree.getChain('base_link', node2) + + # Intializing corresponding joint array and forward chain solvers + joint_array1 = PyKDL.JntArray(chain1.getNrOfJoints()) + joint_array2 = PyKDL.JntArray(chain2.getNrOfJoints()) + + fk_p_kdl1 = PyKDL.ChainFkSolverPos_recursive(chain1) + fk_p_kdl2 = PyKDL.ChainFkSolverPos_recursive(chain2) + + if node1 == TOP_CAMERA_NODE: + ref_joints1 = self.head_joints + ref_joint1_list = self.head_joint_list + else: + ref_joints1 = self.joints + ref_joint1_list = self.joint_list + + # Updating the joint arrays from self.joints + for joint_index in range(joint_array1.rows()): + joint_array1[joint_index] = ref_joints1[ref_joint1_list[joint_index]] + + for joint_index in range(joint_array2.rows()): + joint_array2[joint_index] = self.joints[self.joint_list[joint_index]] + + # Intializing frames corresponding to nodes + frame1 = PyKDL.Frame() + frame2 = PyKDL.Frame() + + # Calculating current frames of nodes + print('joint_array1', joint_array1) + print('joint_array2', joint_array2) + print('self.joints', self.joints) + print('self.head_joints', self.head_joints) + fk_p_kdl1.JntToCart(joint_array1, frame1) + fk_p_kdl2.JntToCart(joint_array2, frame2) + # This allows to transform a point in frame1 to frame2 + frame_transform = frame2.Inverse() * frame1 + print('pin', frame_pin) + print('pykdl', frame_transform) return frame_pin - # def debug_pose(self): - # import PyKDL - # self.updateJoints() - # print(self.joints) - # for joint_index in range(self.joint_array.rows()): - # self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] - # print(self.joint_array) - # curr_pose = PyKDL.Frame() # Current pose of gripper in base frame - # self.fk_p_kdl.JntToCart(self.joint_array, curr_pose) - # print(f"cur pose {curr_pose}") - # pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_Link) - # pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] - # pin_curr_pose = pin.SE3(pin_rotation, pin_translation) - # print(f"pin curr pose {pin_curr_pose}") + def debug_pose(self): + import PyKDL + self.updateJoints() + print(self.joints) + for joint_index in range(self.joint_array.rows()): + self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] + print(self.joint_array) + curr_pose = PyKDL.Frame() # Current pose of gripper in base frame + self.fk_p_kdl.JntToCart(self.joint_array, curr_pose) + print(f"cur pose {curr_pose}") + pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link) + pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] + pin_curr_pose = pin.SE3(pin_rotation, pin_translation) + print(f"pin curr pose {pin_curr_pose}") def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode=0, velocities=None): """ Function to move the gripper to a desired translation and rotation """ + import PyKDL translation = [translation_tensor[0], translation_tensor[1], translation_tensor[2]] rotation = rotational_tensor # print('translation and rotation', translation_tensor, rotational_tensor) @@ -272,20 +338,37 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode self.updateJoints() for joint_index in 
range(self.joint_array.rows()): self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] - # print('self.joints', self.joints) - # print('self.joint_array', self.joint_array) + print('self.joints', self.joints) + print('self.joint_array', self.joint_array) - pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link) + curr_pose = PyKDL.Frame() # Current pose of gripper in base frame + del_pose = PyKDL.Frame() # Relative Movement of gripper + self.fk_p_kdl.JntToCart(self.joint_array, curr_pose) + + q = self.robot.get_joint_positions() + q[HelloStretchIdx.WRIST_PITCH] = OVERRIDE_STATES.get('wrist_pitch', q[HelloStretchIdx.WRIST_PITCH]) + pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link, q = q) pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] pin_curr_pose = pin.SE3(pin_rotation, pin_translation) - # print(f"pin curr pose {pin_curr_pose}") + print(f"pykdl curr pose {curr_pose}") + print(f"pin curr pose {pin_curr_pose}") rot_matrix = R.from_euler('xyz', rotation, degrees=False).as_matrix() + del_rot = PyKDL.Rotation(PyKDL.Vector(rot_matrix[0][0], rot_matrix[1][0], rot_matrix[2][0]), + PyKDL.Vector(rot_matrix[0][1], rot_matrix[1][1], rot_matrix[2][1]), + PyKDL.Vector(rot_matrix[0][2], rot_matrix[1][2], rot_matrix[2][2])) + del_trans = PyKDL.Vector(translation[0], translation[1], translation[2]) + del_pose.M = del_rot + del_pose.p = del_trans + goal_pose_new = curr_pose*del_pose + pin_del_pose = pin.SE3(np.array(rot_matrix), np.array(translation)) pin_goal_pose_new = pin_curr_pose * pin_del_pose - # print(f"pin del psoe {pin_del_pose}") - # print(f"pin goal pose new {pin_goal_pose_new}") + print(f"pykdl del psoe {del_pose}") + print(f"pykdl goal pose new {goal_pose_new}") + print(f"pin del psoe {pin_del_pose}") + print(f"pin goal pose new {pin_goal_pose_new}") final_pos = pin_goal_pose_new.translation.tolist() final_quat = pin.Quaternion(pin_goal_pose_new.rotation).coeffs().tolist() @@ -301,10 +384,14 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode pin_joint_pos = self.robot._extract_joint_pos(full_body_cfg) transform_joint_pos = transform_joint_array(pin_joint_pos) + # Ik to calculate the required joint movements to move the gripper to desired pose + seed_array = PyKDL.JntArray(self.arm_chain.getNrOfJoints()) + self.ik_p_kdl.CartToJnt(seed_array, goal_pose_new, self.joint_array) + self.joint_array1 = transform_joint_pos - # print(f"joint array {self.joint_array}") - # print(f"pin joint pos {pin_joint_pos}") - # print(f"transformed joint pos {transform_joint_pos}") + print(f"joint array {self.joint_array}") + print(f"pin joint pos {pin_joint_pos}") + print(f"transformed joint pos {transform_joint_pos}") ik_joints = {} # for joint_index in range(self.joint_array.rows()): @@ -313,3 +400,8 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode # Actual Movement of joints self.move_to_joints(ik_joints, gripper, move_mode, velocities) + + self.updateJoints() + # for joint_index in range(self.joint_array.rows()): + for joint_index in range(len(self.joint_array1)): + self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 1810ce254..865b0bfb5 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -236,10 +236,8 @@ def place(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): # 
Lift the arm a little bit, and rotate the wrist roll of the robot in case the object attached on the gripper self.manip_wrapper.move_to_position(lift_pos = self.manip_wrapper.robot.get_six_joints()[1] + 0.3) - self.manip_wrapper.move_to_position(wrist_roll = 3., blocking = True) - time.sleep(0.5) - self.manip_wrapper.move_to_position(wrist_roll = -3., blocking = True) - time.sleep(0.5) + self.manip_wrapper.move_to_position(wrist_roll = 2.5, blocking = True) + self.manip_wrapper.move_to_position(wrist_roll = -2.5, blocking = True) # Wait for some time and shrink the arm back self.manip_wrapper.move_to_position(gripper_pos=1, From fef95c166d19eb26bbda321bc2854c1985293102 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 29 Aug 2024 12:13:27 -0400 Subject: [PATCH 004/410] update robot agent code --- src/stretch/agent/robot_agent.py | 115 ++++++++++++++++++------------- 1 file changed, 66 insertions(+), 49 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index d84fd56c6..938ca033d 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -98,55 +98,7 @@ def __init__( if voxel_map is not None: self.voxel_map = voxel_map else: - self.voxel_map = SparseVoxelMap( - resolution=self._voxel_size, - local_radius=parameters["local_radius"], - obs_min_height=parameters["obs_min_height"], - obs_max_height=parameters["obs_max_height"], - min_depth=parameters["min_depth"], - max_depth=parameters["max_depth"], - pad_obstacles=parameters["pad_obstacles"], - add_local_radius_points=parameters.get("add_local_radius_points", default=True), - remove_visited_from_obstacles=parameters.get( - "remove_visited_from_obstacles", default=False - ), - obs_min_density=parameters["obs_min_density"], - encoder=self.encoder, - smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), - use_median_filter=parameters.get("filters/use_median_filter", False), - median_filter_size=parameters.get("filters/median_filter_size", 5), - median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), - use_derivative_filter=parameters.get("filters/use_derivative_filter", False), - derivative_filter_threshold=parameters.get( - "filters/derivative_filter_threshold", 0.5 - ), - use_instance_memory=(self.semantic_sensor is not None or self._use_instance_memory), - instance_memory_kwargs={ - "min_pixels_for_instance_view": parameters.get( - "min_pixels_for_instance_view", 100 - ), - "min_instance_thickness": parameters.get( - "instance_memory/min_instance_thickness", 0.01 - ), - "min_instance_vol": parameters.get("instance_memory/min_instance_vol", 1e-6), - "max_instance_vol": parameters.get("instance_memory/max_instance_vol", 10.0), - "min_instance_height": parameters.get( - "instance_memory/min_instance_height", 0.1 - ), - "max_instance_height": parameters.get( - "instance_memory/max_instance_height", 1.8 - ), - "min_pixels_for_instance_view": parameters.get( - "instance_memory/min_pixels_for_instance_view", 100 - ), - "min_percent_for_instance_view": parameters.get( - "instance_memory/min_percent_for_instance_view", 0.2 - ), - "open_vocab_cat_map_file": parameters.get("open_vocab_category_map_file", None), - "use_visual_feat": parameters.get("use_visual_feat", False), - }, - prune_detected_objects=parameters.get("prune_detected_objects", False), - ) + self.voxel_map = self._create_voxel_map(parameters) # Create planning space self.space = SparseVoxelMapNavigationSpace( @@ -175,6 +127,57 @@ def __init__( timestamp = 
f"{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}" + def _create_voxel_map(self, parameters: Parameters) -> SparseVoxelMap: + """Create a voxel map from parameters. + + Args: + parameters(Parameters): the parameters for the voxel map + + Returns: + SparseVoxelMap: the voxel map + """ + return SparseVoxelMap( + resolution=self._voxel_size, + local_radius=parameters["local_radius"], + obs_min_height=parameters["obs_min_height"], + obs_max_height=parameters["obs_max_height"], + min_depth=parameters["min_depth"], + max_depth=parameters["max_depth"], + pad_obstacles=parameters["pad_obstacles"], + add_local_radius_points=parameters.get("add_local_radius_points", default=True), + remove_visited_from_obstacles=parameters.get( + "remove_visited_from_obstacles", default=False + ), + obs_min_density=parameters["obs_min_density"], + encoder=self.encoder, + smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), + use_median_filter=parameters.get("filters/use_median_filter", False), + median_filter_size=parameters.get("filters/median_filter_size", 5), + median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), + use_derivative_filter=parameters.get("filters/use_derivative_filter", False), + derivative_filter_threshold=parameters.get("filters/derivative_filter_threshold", 0.5), + use_instance_memory=(self.semantic_sensor is not None or self._use_instance_memory), + instance_memory_kwargs={ + "min_pixels_for_instance_view": parameters.get("min_pixels_for_instance_view", 100), + "min_instance_thickness": parameters.get( + "instance_memory/min_instance_thickness", 0.01 + ), + "min_instance_vol": parameters.get("instance_memory/min_instance_vol", 1e-6), + "max_instance_vol": parameters.get("instance_memory/max_instance_vol", 10.0), + "min_instance_height": parameters.get("instance_memory/min_instance_height", 0.1), + "max_instance_height": parameters.get("instance_memory/max_instance_height", 1.8), + "min_pixels_for_instance_view": parameters.get( + "instance_memory/min_pixels_for_instance_view", 100 + ), + "min_percent_for_instance_view": parameters.get( + "instance_memory/min_percent_for_instance_view", 0.2 + ), + "open_vocab_cat_map_file": parameters.get("open_vocab_category_map_file", None), + "use_visual_feat": parameters.get("use_visual_feat", False), + }, + prune_detected_objects=parameters.get("prune_detected_objects", False), + ) + @property def feature_match_threshold(self) -> float: """Return the feature match threshold""" @@ -1315,6 +1318,20 @@ def wave(self, **kwargs) -> bool: self.robot.switch_to_navigation_mode() + def load_map(self, filename: str) -> None: + """Load a map from a PKL file. Creates a new voxel map and loads the data from the file into this map. Then uses RANSAC to figure out where the current map and the loaded map overlap, computes a transform, and applies this transform to the loaded map to align it with the current map. 
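+
+        Note: the alignment step below is still a placeholder and currently
+        raises NotImplementedError.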
+ + Args: + filename(str): the name of the file to load the map from + """ + + # Load the map from the file + loaded_voxel_map = self._create_voxel_map(self.parameters) + loaded_voxel_map.read_from_pickel(filename, perception=self.semantic_sensor) + + # Align the loaded map with the current map + raise NotImplementedError("This function is not implemented yet.") + def get_detections(self, **kwargs) -> List[Instance]: """Get the current detections.""" instances = self.voxel_map.get_instances() From a27e3c209f6593f1432897d907301c708b272ebb Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 29 Aug 2024 14:16:34 -0400 Subject: [PATCH 005/410] add code to find transform between two point clouds --- src/stretch/agent/robot_agent.py | 12 +++-- src/stretch/utils/point_cloud.py | 82 ++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 938ca033d..7bde5e47d 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -34,6 +34,7 @@ from stretch.perception.encoders import BaseImageTextEncoder, get_encoder from stretch.perception.wrapper import OvmmPerception from stretch.utils.geometry import angle_difference +from stretch.utils.point_cloud import find_se3_transform class RobotAgent: @@ -1318,7 +1319,7 @@ def wave(self, **kwargs) -> bool: self.robot.switch_to_navigation_mode() - def load_map(self, filename: str) -> None: + def load_map(self, filename: str, color_weight: float = 0.5) -> None: """Load a map from a PKL file. Creates a new voxel map and loads the data from the file into this map. Then uses RANSAC to figure out where the current map and the loaded map overlap, computes a transform, and applies this transform to the loaded map to align it with the current map. Args: @@ -1327,10 +1328,13 @@ def load_map(self, filename: str) -> None: # Load the map from the file loaded_voxel_map = self._create_voxel_map(self.parameters) - loaded_voxel_map.read_from_pickel(filename, perception=self.semantic_sensor) + loaded_voxel_map.read_from_pickle(filename, perception=self.semantic_sensor) - # Align the loaded map with the current map - raise NotImplementedError("This function is not implemented yet.") + xyz1, rgb1, _, _ = self.voxel_pcd.get_pointcloud() + xyz2, rgb2, _, _ = loaded_voxel_map.get_pointcloud() + + tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) + breakpoint() def get_detections(self, **kwargs) -> List[Instance]: """Get the current detections.""" diff --git a/src/stretch/utils/point_cloud.py b/src/stretch/utils/point_cloud.py index 822d4dc7d..f77155f3d 100644 --- a/src/stretch/utils/point_cloud.py +++ b/src/stretch/utils/point_cloud.py @@ -418,3 +418,85 @@ def dropout_random_ellipses(depth_img, dropout_mean, gamma_shape=10000, gamma_sc depth_img[mask == 1] = 0 return depth_img + + +import numpy as np +from scipy.spatial import cKDTree + + +def find_se3_transform( + cloud1, cloud2, rgb1, rgb2, color_weight=0.5, max_iterations=50, tolerance=1e-5 +): + """ + Find the SE(3) transformation between two colorized point clouds. 
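+
+    Runs a simple point-to-point ICP: nearest neighbours are matched in a joint
+    space of XYZ coordinates and RGB values scaled by color_weight, while the
+    rigid best-fit at each iteration uses the XYZ coordinates only.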
+ + Parameters: + cloud1, cloud2: Nx3 numpy arrays containing XYZ coordinates + rgb1, rgb2: Nx3 numpy arrays containing RGB values (0-255) + color_weight: Weight given to color difference (0-1) + max_iterations: Maximum number of ICP iterations + tolerance: Convergence tolerance for transformation change + + Returns: + R: 3x3 rotation matrix + t: 3x1 translation vector + """ + + # Example usage: + # cloud1 = np.random.rand(1000, 3) # XYZ coordinates for cloud 1 + # cloud2 = np.random.rand(1000, 3) # XYZ coordinates for cloud 2 + # rgb1 = np.random.randint(0, 256, (1000, 3)) # RGB values for cloud 1 + # rgb2 = np.random.randint(0, 256, (1000, 3)) # RGB values for cloud 2 + # + # R, t = find_se3_transform(cloud1, cloud2, rgb1, rgb2) + + def best_fit_transform(A, B): + """ + Calculates the least-squares best-fit transform between corresponding 3D points A->B + """ + centroid_A = np.mean(A, axis=0) + centroid_B = np.mean(B, axis=0) + AA = A - centroid_A + BB = B - centroid_B + H = np.dot(AA.T, BB) + U, S, Vt = np.linalg.svd(H) + R = np.dot(Vt.T, U.T) + if np.linalg.det(R) < 0: + Vt[2, :] *= -1 + R = np.dot(Vt.T, U.T) + t = centroid_B.T - np.dot(R, centroid_A.T) + return R, t + + # Normalize RGB values + rgb1_norm = rgb1 / 255.0 + rgb2_norm = rgb2 / 255.0 + + # Combine spatial and color information + combined1 = np.hstack((cloud1, rgb1_norm * color_weight)) + combined2 = np.hstack((cloud2, rgb2_norm * color_weight)) + + # Initial transformation + R = np.eye(3) + t = np.zeros(3) + + for iteration in range(max_iterations): + # Find nearest neighbors + tree = cKDTree(combined2) + distances, indices = tree.query(combined1, k=1) + + # Estimate transformation + R_new, t_new = best_fit_transform(cloud1, cloud2[indices]) + + # Update transformation + t = t_new + np.dot(R_new, t) + R = np.dot(R_new, R) + + # Apply transformation + cloud1 = np.dot(cloud1, R_new.T) + t_new + combined1[:, :3] = cloud1 + + # Check for convergence + if np.sum(np.abs(R_new - np.eye(3))) < tolerance and np.sum(np.abs(t_new)) < tolerance: + break + + return R, t From d094953cd8f8f529bbad57cdcfab531a8f1d22bd Mon Sep 17 00:00:00 2001 From: stretch-se3-3055 Date: Thu, 29 Aug 2024 15:36:53 -0700 Subject: [PATCH 006/410] remove unnecessary codes --- src/stretch/dynav/ok_robot_hw/robot.py | 230 +++++++++--------- .../dynav/ok_robot_hw/utils/grasper_utils.py | 101 ++++---- 2 files changed, 165 insertions(+), 166 deletions(-) diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index d09f2d488..6accca627 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -33,6 +33,7 @@ def __init__( self.STRETCH_GRIPPER_MAX = stretch_gripper_max self.STRETCH_GRIPPER_MIN = stretch_gripper_min self.urdf_path = os.path.join(stretch_client_urdf_file, 'stretch.urdf') + self.joints_pin = {'joint_fake':0} self.GRIPPER_THRESHOLD = gripper_threshold @@ -54,30 +55,29 @@ def __init__( self.clamp = lambda n, minn, maxn: max(min(maxn, n), minn) # Joint dictionary for Kinematics - self.setup_kdl() + # self.setup_kdl() - def setup_kdl(self): - """ - Kdl Setup for forward and Inverse Kinematics - """ - import PyKDL - from urdf_parser_py.urdf import URDF - self.joints = {'joint_fake':0} - self.head_joints = {'joint_fake':0} - self.joints_pin = {'joint_fake':0} + # def setup_kdl(self): + # """ + # Kdl Setup for forward and Inverse Kinematics + # """ + # import PyKDL + # from urdf_parser_py.urdf import URDF + # self.joints = {'joint_fake':0} + # self.head_joints = {'joint_fake':0} - 
# Loading URDF and listing the internediate joints from base to gripper - robot_model = URDF.from_xml_file(self.urdf_path) - self.kdl_tree = kdl_tree_from_urdf_model(robot_model) - self.arm_chain = self.kdl_tree.getChain('base_link', self.end_link) - self.joint_array = PyKDL.JntArray(self.arm_chain.getNrOfJoints()) - - # Forward kinematics - self.fk_p_kdl = PyKDL.ChainFkSolverPos_recursive(self.arm_chain) - # Inverse Kinematics - self.ik_v_kdl = PyKDL.ChainIkSolverVel_pinv(self.arm_chain) - self.ik_p_kdl = PyKDL.ChainIkSolverPos_NR(self.arm_chain, self.fk_p_kdl, self.ik_v_kdl) + # # Loading URDF and listing the internediate joints from base to gripper + # robot_model = URDF.from_xml_file(self.urdf_path) + # self.kdl_tree = kdl_tree_from_urdf_model(robot_model) + # self.arm_chain = self.kdl_tree.getChain('base_link', self.end_link) + # self.joint_array = PyKDL.JntArray(self.arm_chain.getNrOfJoints()) + + # # Forward kinematics + # self.fk_p_kdl = PyKDL.ChainFkSolverPos_recursive(self.arm_chain) + # # Inverse Kinematics + # self.ik_v_kdl = PyKDL.ChainIkSolverVel_pinv(self.arm_chain) + # self.ik_p_kdl = PyKDL.ChainIkSolverPos_NR(self.arm_chain, self.fk_p_kdl, self.ik_v_kdl) def set_end_link(self, link): @@ -183,22 +183,22 @@ def updateJoints(self): # Head Joints pan, tilt = self.robot.get_pan_tilt() - self.joints['joint_fake'] = origin_dist - self.joints['joint_lift'] = state[1] - armPos = state[2] - self.joints['3'] = armPos / 4.0 - self.joints['2'] = armPos / 4.0 - self.joints['1'] = armPos / 4.0 - self.joints['0'] = armPos / 4.0 - self.joints['joint_wrist_yaw'] = state[3] - self.joints['joint_wrist_roll'] = state[5] - self.joints['joint_wrist_pitch'] = OVERRIDE_STATES.get('wrist_pitch', state[4]) - - self.joints['joint_gripper_finger_left'] = 0 - - self.head_joints['joint_fake'] = origin_dist - self.head_joints['joint_head_pan'] = pan - self.head_joints['joint_head_tilt'] = tilt + # self.joints['joint_fake'] = origin_dist + # self.joints['joint_lift'] = state[1] + # armPos = state[2] + # self.joints['3'] = armPos / 4.0 + # self.joints['2'] = armPos / 4.0 + # self.joints['1'] = armPos / 4.0 + # self.joints['0'] = armPos / 4.0 + # self.joints['joint_wrist_yaw'] = state[3] + # self.joints['joint_wrist_roll'] = state[5] + # self.joints['joint_wrist_pitch'] = OVERRIDE_STATES.get('wrist_pitch', state[4]) + + # self.joints['joint_gripper_finger_left'] = 0 + + # self.head_joints['joint_fake'] = origin_dist + # self.head_joints['joint_head_pan'] = pan + # self.head_joints['joint_head_tilt'] = tilt self.joints_pin['joint_fake'] = origin_dist self.joints_pin['joint_lift'] = state[1] @@ -266,65 +266,65 @@ def get_joint_transform(self, node1, node2): print('joints pin', self.joints_pin) frame_pin = self.robot.get_frame_pose(self.joints_pin, node1, node2) - import PyKDL + # import PyKDL - # Intializing chain -> maintains list of nodes from base link to corresponding nodes - chain1 = self.kdl_tree.getChain('base_link', node1) - chain2 = self.kdl_tree.getChain('base_link', node2) + # # Intializing chain -> maintains list of nodes from base link to corresponding nodes + # chain1 = self.kdl_tree.getChain('base_link', node1) + # chain2 = self.kdl_tree.getChain('base_link', node2) - # Intializing corresponding joint array and forward chain solvers - joint_array1 = PyKDL.JntArray(chain1.getNrOfJoints()) - joint_array2 = PyKDL.JntArray(chain2.getNrOfJoints()) + # # Intializing corresponding joint array and forward chain solvers + # joint_array1 = PyKDL.JntArray(chain1.getNrOfJoints()) + # joint_array2 = 
PyKDL.JntArray(chain2.getNrOfJoints()) - fk_p_kdl1 = PyKDL.ChainFkSolverPos_recursive(chain1) - fk_p_kdl2 = PyKDL.ChainFkSolverPos_recursive(chain2) + # fk_p_kdl1 = PyKDL.ChainFkSolverPos_recursive(chain1) + # fk_p_kdl2 = PyKDL.ChainFkSolverPos_recursive(chain2) - if node1 == TOP_CAMERA_NODE: - ref_joints1 = self.head_joints - ref_joint1_list = self.head_joint_list - else: - ref_joints1 = self.joints - ref_joint1_list = self.joint_list + # if node1 == TOP_CAMERA_NODE: + # ref_joints1 = self.head_joints + # ref_joint1_list = self.head_joint_list + # else: + # ref_joints1 = self.joints + # ref_joint1_list = self.joint_list - # Updating the joint arrays from self.joints - for joint_index in range(joint_array1.rows()): - joint_array1[joint_index] = ref_joints1[ref_joint1_list[joint_index]] + # # Updating the joint arrays from self.joints + # for joint_index in range(joint_array1.rows()): + # joint_array1[joint_index] = ref_joints1[ref_joint1_list[joint_index]] - for joint_index in range(joint_array2.rows()): - joint_array2[joint_index] = self.joints[self.joint_list[joint_index]] + # for joint_index in range(joint_array2.rows()): + # joint_array2[joint_index] = self.joints[self.joint_list[joint_index]] - # Intializing frames corresponding to nodes - frame1 = PyKDL.Frame() - frame2 = PyKDL.Frame() - - # Calculating current frames of nodes - print('joint_array1', joint_array1) - print('joint_array2', joint_array2) - print('self.joints', self.joints) - print('self.head_joints', self.head_joints) - fk_p_kdl1.JntToCart(joint_array1, frame1) - fk_p_kdl2.JntToCart(joint_array2, frame2) + # # Intializing frames corresponding to nodes + # frame1 = PyKDL.Frame() + # frame2 = PyKDL.Frame() + + # # Calculating current frames of nodes + # print('joint_array1', joint_array1) + # print('joint_array2', joint_array2) + # print('self.joints', self.joints) + # print('self.head_joints', self.head_joints) + # fk_p_kdl1.JntToCart(joint_array1, frame1) + # fk_p_kdl2.JntToCart(joint_array2, frame2) - # This allows to transform a point in frame1 to frame2 - frame_transform = frame2.Inverse() * frame1 - print('pin', frame_pin) - print('pykdl', frame_transform) + # # This allows to transform a point in frame1 to frame2 + # frame_transform = frame2.Inverse() * frame1 + # print('pin', frame_pin) + # print('pykdl', frame_transform) return frame_pin - def debug_pose(self): - import PyKDL - self.updateJoints() - print(self.joints) - for joint_index in range(self.joint_array.rows()): - self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] - print(self.joint_array) - curr_pose = PyKDL.Frame() # Current pose of gripper in base frame - self.fk_p_kdl.JntToCart(self.joint_array, curr_pose) - print(f"cur pose {curr_pose}") - pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link) - pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] - pin_curr_pose = pin.SE3(pin_rotation, pin_translation) - print(f"pin curr pose {pin_curr_pose}") + # def debug_pose(self): + # import PyKDL + # self.updateJoints() + # print(self.joints) + # for joint_index in range(self.joint_array.rows()): + # self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] + # print(self.joint_array) + # curr_pose = PyKDL.Frame() # Current pose of gripper in base frame + # self.fk_p_kdl.JntToCart(self.joint_array, curr_pose) + # print(f"cur pose {curr_pose}") + # pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link) + # pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] + # 
pin_curr_pose = pin.SE3(pin_rotation, pin_translation) + # print(f"pin curr pose {pin_curr_pose}") def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode=0, velocities=None): """ @@ -336,39 +336,39 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode # print('translation and rotation', translation_tensor, rotational_tensor) self.updateJoints() - for joint_index in range(self.joint_array.rows()): - self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] - print('self.joints', self.joints) - print('self.joint_array', self.joint_array) + # for joint_index in range(self.joint_array.rows()): + # self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] + # print('self.joints', self.joints) + # print('self.joint_array', self.joint_array) - curr_pose = PyKDL.Frame() # Current pose of gripper in base frame - del_pose = PyKDL.Frame() # Relative Movement of gripper - self.fk_p_kdl.JntToCart(self.joint_array, curr_pose) + # curr_pose = PyKDL.Frame() # Current pose of gripper in base frame + # del_pose = PyKDL.Frame() # Relative Movement of gripper + # self.fk_p_kdl.JntToCart(self.joint_array, curr_pose) q = self.robot.get_joint_positions() q[HelloStretchIdx.WRIST_PITCH] = OVERRIDE_STATES.get('wrist_pitch', q[HelloStretchIdx.WRIST_PITCH]) pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link, q = q) pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] pin_curr_pose = pin.SE3(pin_rotation, pin_translation) - print(f"pykdl curr pose {curr_pose}") - print(f"pin curr pose {pin_curr_pose}") + # print(f"pykdl curr pose {curr_pose}") + # print(f"pin curr pose {pin_curr_pose}") rot_matrix = R.from_euler('xyz', rotation, degrees=False).as_matrix() - del_rot = PyKDL.Rotation(PyKDL.Vector(rot_matrix[0][0], rot_matrix[1][0], rot_matrix[2][0]), - PyKDL.Vector(rot_matrix[0][1], rot_matrix[1][1], rot_matrix[2][1]), - PyKDL.Vector(rot_matrix[0][2], rot_matrix[1][2], rot_matrix[2][2])) - del_trans = PyKDL.Vector(translation[0], translation[1], translation[2]) - del_pose.M = del_rot - del_pose.p = del_trans - goal_pose_new = curr_pose*del_pose + # del_rot = PyKDL.Rotation(PyKDL.Vector(rot_matrix[0][0], rot_matrix[1][0], rot_matrix[2][0]), + # PyKDL.Vector(rot_matrix[0][1], rot_matrix[1][1], rot_matrix[2][1]), + # PyKDL.Vector(rot_matrix[0][2], rot_matrix[1][2], rot_matrix[2][2])) + # del_trans = PyKDL.Vector(translation[0], translation[1], translation[2]) + # del_pose.M = del_rot + # del_pose.p = del_trans + # goal_pose_new = curr_pose*del_pose pin_del_pose = pin.SE3(np.array(rot_matrix), np.array(translation)) pin_goal_pose_new = pin_curr_pose * pin_del_pose - print(f"pykdl del psoe {del_pose}") - print(f"pykdl goal pose new {goal_pose_new}") - print(f"pin del psoe {pin_del_pose}") - print(f"pin goal pose new {pin_goal_pose_new}") + # print(f"pykdl del psoe {del_pose}") + # print(f"pykdl goal pose new {goal_pose_new}") + # print(f"pin del psoe {pin_del_pose}") + # print(f"pin goal pose new {pin_goal_pose_new}") final_pos = pin_goal_pose_new.translation.tolist() final_quat = pin.Quaternion(pin_goal_pose_new.rotation).coeffs().tolist() @@ -385,13 +385,13 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode transform_joint_pos = transform_joint_array(pin_joint_pos) # Ik to calculate the required joint movements to move the gripper to desired pose - seed_array = PyKDL.JntArray(self.arm_chain.getNrOfJoints()) - self.ik_p_kdl.CartToJnt(seed_array, goal_pose_new, 
self.joint_array) + # seed_array = PyKDL.JntArray(self.arm_chain.getNrOfJoints()) + # self.ik_p_kdl.CartToJnt(seed_array, goal_pose_new, self.joint_array) self.joint_array1 = transform_joint_pos - print(f"joint array {self.joint_array}") - print(f"pin joint pos {pin_joint_pos}") - print(f"transformed joint pos {transform_joint_pos}") + # print(f"joint array {self.joint_array}") + # print(f"pin joint pos {pin_joint_pos}") + # print(f"transformed joint pos {transform_joint_pos}") ik_joints = {} # for joint_index in range(self.joint_array.rows()): @@ -401,7 +401,7 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode # Actual Movement of joints self.move_to_joints(ik_joints, gripper, move_mode, velocities) - self.updateJoints() - # for joint_index in range(self.joint_array.rows()): - for joint_index in range(len(self.joint_array1)): - self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] + # self.updateJoints() + # # for joint_index in range(self.joint_array.rows()): + # for joint_index in range(len(self.joint_array1)): + # self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index 6d053d563..44151abaf 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -36,7 +36,7 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): elif (retry_flag !=0 and side_retries == 3): print("Tried in all angles but couldn't succed") time.sleep(2) - return None, None, None + return None, None, None, None elif (side_retries == 2 and tilt_retries == 3): hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) @@ -108,54 +108,54 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height 3. Move the base such gripper in line with the grasp 4. 
Gradually Move the gripper to the desired position """ - import PyKDL + # import PyKDL # Transforming the final point from Model camera frame to robot camera frame - point = PyKDL.Vector(-translation[1], -translation[0], translation[2]) + # point = PyKDL.Vector(-translation[1], -translation[0], translation[2]) pin_point = np.array([-translation[1], -translation[0], translation[2]]) # Rotation from Camera frame to Model frame - rotation1_bottom = PyKDL.Rotation(0.0000000, -1.0000000, 0.0000000, - -1.0000000, 0.0000000, 0.0000000, - 0.0000000, 0.0000000, 1.0000000) + # rotation1_bottom = PyKDL.Rotation(0.0000000, -1.0000000, 0.0000000, + # -1.0000000, 0.0000000, 0.0000000, + # 0.0000000, 0.0000000, 1.0000000) rotation1_bottom_mat = np.array([[0.0000000, -1.0000000, 0.0000000], [-1.0000000, 0.0000000, 0.0000000], [0.0000000, 0.0000000, 1.0000000]]) # Rotation from model frame to pose frame - rotation1 = PyKDL.Rotation(rotation[0][0], rotation[0][1], rotation[0][2], - rotation[1][0], rotation[1][1], rotation[1][2], - rotation[2][0], rotation[2][1], rotation[2][2]) + # rotation1 = PyKDL.Rotation(rotation[0][0], rotation[0][1], rotation[0][2], + # rotation[1][0], rotation[1][1], rotation[1][2], + # rotation[2][0], rotation[2][1], rotation[2][2]) rotation1_mat = np.array([[rotation[0][0], rotation[0][1], rotation[0][2]], [rotation[1][0], rotation[1][1], rotation[1][2]], [rotation[2][0], rotation[2][1], rotation[2][2]]]) # Rotation from camera frame to pose frame - rotation = rotation1_bottom * rotation1 + # rotation = rotation1_bottom * rotation1 pin_rotation = np.dot(rotation1_bottom_mat, rotation1_mat) - print(f"rotation {rotation}") + # print(f"rotation {rotation}") print(f"pin rotation{pin_rotation}") # Relative rotation and translation of grasping point relative to camera - dest_frame = PyKDL.Frame(rotation, point) + # dest_frame = PyKDL.Frame(rotation, point) pin_dest_frame = pin.SE3(np.array(pin_rotation), np.array(pin_point)) - print(f"dest frame {dest_frame}") + # print(f"dest frame {dest_frame}") print(f"pin dest frame {pin_dest_frame}") # Camera to gripper frame transformation # cam2gripper_transform, pin_cam2gripper_transform, _, _ = robot.get_joint_transform(base_node, gripper_node) pin_cam2gripper_transform = robot.get_joint_transform(base_node, gripper_node) - del_pose = PyKDL.Frame() - del_rot = PyKDL.Rotation(PyKDL.Vector(pin_cam2gripper_transform.rotation[0][0], pin_cam2gripper_transform.rotation[1][0], pin_cam2gripper_transform.rotation[2][0]), - PyKDL.Vector(pin_cam2gripper_transform.rotation[0][1], pin_cam2gripper_transform.rotation[1][1], pin_cam2gripper_transform.rotation[2][1]), - PyKDL.Vector(pin_cam2gripper_transform.rotation[0][2], pin_cam2gripper_transform.rotation[1][2], pin_cam2gripper_transform.rotation[2][2])) - del_trans = PyKDL.Vector(pin_cam2gripper_transform.translation[0], pin_cam2gripper_transform.translation[1], pin_cam2gripper_transform.translation[2]) - del_pose.M = del_rot - del_pose.p = del_trans + # del_pose = PyKDL.Frame() + # del_rot = PyKDL.Rotation(PyKDL.Vector(pin_cam2gripper_transform.rotation[0][0], pin_cam2gripper_transform.rotation[1][0], pin_cam2gripper_transform.rotation[2][0]), + # PyKDL.Vector(pin_cam2gripper_transform.rotation[0][1], pin_cam2gripper_transform.rotation[1][1], pin_cam2gripper_transform.rotation[2][1]), + # PyKDL.Vector(pin_cam2gripper_transform.rotation[0][2], pin_cam2gripper_transform.rotation[1][2], pin_cam2gripper_transform.rotation[2][2])) + # del_trans = PyKDL.Vector(pin_cam2gripper_transform.translation[0], 
pin_cam2gripper_transform.translation[1], pin_cam2gripper_transform.translation[2]) + # del_pose.M = del_rot + # del_pose.p = del_trans - transformed_frame = del_pose * dest_frame + # transformed_frame = del_pose * dest_frame pin_transformed_frame = pin_cam2gripper_transform * pin_dest_frame - print(f"transformed frame {transformed_frame}") + # print(f"transformed frame {transformed_frame}") print(f"pin_transformed frame {pin_transformed_frame}") @@ -164,16 +164,16 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # time.sleep(2) # Rotation for aligning Robot gripper frame to Model gripper frame - rotation2_top = PyKDL.Rotation(0, 0, 1, 1, 0, 0, 0, -1, 0) + # rotation2_top = PyKDL.Rotation(0, 0, 1, 1, 0, 0, 0, -1, 0) rotation2_top_mat = np.array([[0, 0, 1], [1, 0, 0], [0, -1, 0]]) # final Rotation of gripper to hold the objet - final_rotation = transformed_frame.M * rotation2_top + # final_rotation = transformed_frame.M * rotation2_top pin_final_rotation = np.dot(pin_transformed_frame.rotation, rotation2_top_mat) - print(f"final rotation - {final_rotation}") + # print(f"final rotation - {final_rotation}") print(f"pin final rotation {pin_final_rotation}") print("Rotation wrist with sleep of 2s") @@ -183,44 +183,44 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # [1], # ) rpy_angles = pin.rpy.matrixToRpy(pin_final_rotation) - print(f"rpy angles {final_rotation.GetRPY}") - print(f"pin rpy angles {rpy_angles}") - robot.move_to_position(gripper_pos = gripper_width) + # print(f"rpy angles {final_rotation.GetRPY}") + # print(f"pin rpy angles {rpy_angles}") + # robot.move_to_position(gripper_pos = gripper_width) robot.move_to_pose( [0, 0, 0], [rpy_angles[0], rpy_angles[1], rpy_angles[2]], [1], ) - time.sleep(1) + # time.sleep(1) # Final grasping point relative to camera # cam2gripper_transform, pin_cam2gripper_transform, _, _ = robot.get_joint_transform(base_node, gripper_node) pin_cam2gripper_transform = robot.get_joint_transform(base_node, gripper_node) - del_pose = PyKDL.Frame() - del_rot = PyKDL.Rotation(PyKDL.Vector(pin_cam2gripper_transform.rotation[0][0], pin_cam2gripper_transform.rotation[1][0], pin_cam2gripper_transform.rotation[2][0]), - PyKDL.Vector(pin_cam2gripper_transform.rotation[0][1], pin_cam2gripper_transform.rotation[1][1], pin_cam2gripper_transform.rotation[2][1]), - PyKDL.Vector(pin_cam2gripper_transform.rotation[0][2], pin_cam2gripper_transform.rotation[1][2], pin_cam2gripper_transform.rotation[2][2])) - del_trans = PyKDL.Vector(pin_cam2gripper_transform.translation[0], pin_cam2gripper_transform.translation[1], pin_cam2gripper_transform.translation[2]) - del_pose.M = del_rot - del_pose.p = del_trans - transformed_point1 = del_pose * point + # del_pose = PyKDL.Frame() + # del_rot = PyKDL.Rotation(PyKDL.Vector(pin_cam2gripper_transform.rotation[0][0], pin_cam2gripper_transform.rotation[1][0], pin_cam2gripper_transform.rotation[2][0]), + # PyKDL.Vector(pin_cam2gripper_transform.rotation[0][1], pin_cam2gripper_transform.rotation[1][1], pin_cam2gripper_transform.rotation[2][1]), + # PyKDL.Vector(pin_cam2gripper_transform.rotation[0][2], pin_cam2gripper_transform.rotation[1][2], pin_cam2gripper_transform.rotation[2][2])) + # del_trans = PyKDL.Vector(pin_cam2gripper_transform.translation[0], pin_cam2gripper_transform.translation[1], pin_cam2gripper_transform.translation[2]) + # del_pose.M = del_rot + # del_pose.p = del_trans + # transformed_point1 = del_pose * point pin_transformed_point1 = 
apply_se3_transform(pin_cam2gripper_transform, pin_point) - print(f"transformed point1 {transformed_point1}") + # print(f"transformed point1 {transformed_point1}") print(f"pin transformed point1 {pin_transformed_point1}") # Final grasping point relative to base # cam2base_transform, pin_cam2base_transform, _, _ = robot.get_joint_transform(base_node, 'base_link') pin_cam2base_transform = robot.get_joint_transform(base_node, 'base_link') - del_pose = PyKDL.Frame() - del_rot = PyKDL.Rotation(PyKDL.Vector(pin_cam2base_transform.rotation[0][0], pin_cam2base_transform.rotation[1][0], pin_cam2base_transform.rotation[2][0]), - PyKDL.Vector(pin_cam2base_transform.rotation[0][1], pin_cam2base_transform.rotation[1][1], pin_cam2base_transform.rotation[2][1]), - PyKDL.Vector(pin_cam2base_transform.rotation[0][2], pin_cam2base_transform.rotation[1][2], pin_cam2base_transform.rotation[2][2])) - del_trans = PyKDL.Vector(pin_cam2base_transform.translation[0], pin_cam2base_transform.translation[1], pin_cam2base_transform.translation[2]) - del_pose.M = del_rot - del_pose.p = del_trans - base_point = del_pose * point + # del_pose = PyKDL.Frame() + # del_rot = PyKDL.Rotation(PyKDL.Vector(pin_cam2base_transform.rotation[0][0], pin_cam2base_transform.rotation[1][0], pin_cam2base_transform.rotation[2][0]), + # PyKDL.Vector(pin_cam2base_transform.rotation[0][1], pin_cam2base_transform.rotation[1][1], pin_cam2base_transform.rotation[2][1]), + # PyKDL.Vector(pin_cam2base_transform.rotation[0][2], pin_cam2base_transform.rotation[1][2], pin_cam2base_transform.rotation[2][2])) + # del_trans = PyKDL.Vector(pin_cam2base_transform.translation[0], pin_cam2base_transform.translation[1], pin_cam2base_transform.translation[2]) + # del_pose.M = del_rot + # del_pose.p = del_trans + # base_point = del_pose * point pin_base_point = apply_se3_transform(pin_cam2base_transform, pin_point) - print(f"base point {base_point}") + # print(f"base point {base_point}") print(f"pin base point {pin_base_point}") diff_value = (0.228 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip @@ -250,7 +250,7 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # transformed_point2 = base2gripper_transform * base_point pin_transformed_point2 = apply_se3_transform(pin_base2gripper_transform, pin_base_point) # print(f"transformed point2 : {transformed_point2}") - print(f"pin transformed point2 : {pin_transformed_point2}") + # print(f"pin transformed point2 : {pin_transformed_point2}") # curr_diff = transformed_point2.z() curr_diff = pin_transformed_point2[2] @@ -260,9 +260,8 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height velocities = [1.0]*8 velocities[5:] = [0.03, 0.03, 0.03, 0.03] velocities[0] = 0.03 - if diff > 0.08: - dist = diff - 0.08 - print("Move to intermediate point with sleep 2s") + if diff > 0.06: + dist = diff - 0.06 robot.move_to_pose( [0, 0, dist], [0, 0, 0], From cd4001ce58871ec28bd5b0a4277d8c4901a617c3 Mon Sep 17 00:00:00 2001 From: stretch-se3-3055 Date: Thu, 29 Aug 2024 17:45:45 -0700 Subject: [PATCH 007/410] remove comments --- src/stretch/dynav/ok_robot_hw/robot.py | 155 +----------------- .../dynav/ok_robot_hw/utils/__init__.py | 1 - .../dynav/ok_robot_hw/utils/grasper_utils.py | 113 +------------ .../dynav/ok_robot_hw/utils/urdf_utils.py | 81 --------- src/stretch/dynav/robot_agent_manip_mdp.py | 4 +- 5 files changed, 15 insertions(+), 339 deletions(-) delete mode 100644 
src/stretch/dynav/ok_robot_hw/utils/urdf_utils.py diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index 6accca627..fe5bd395a 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -10,11 +10,9 @@ import random import os -from stretch.dynav.ok_robot_hw.utils import kdl_tree_from_urdf_model, transform_joint_array +from stretch.dynav.ok_robot_hw.utils import transform_joint_array from stretch.dynav.ok_robot_hw.global_parameters import * -# from stretch.utils.geometry import angle_difference -# import timeit import pinocchio as pin from stretch.motion.kinematics import HelloStretchIdx @@ -45,39 +43,13 @@ def __init__( self.end_link = end_link self.set_end_link(end_link) - # Initialize StretchClient controller (from home_robot/src/home_robot_hw/home_robot_hw/remote/api.py) - # self.robot = StretchClient(urdf_path = stretch_client_urdf_file) + # Initialize StretchClient controller self.robot = robot self.robot.switch_to_manipulation_mode() # time.sleep(2) # Constraining the robots movement - self.clamp = lambda n, minn, maxn: max(min(maxn, n), minn) - - # Joint dictionary for Kinematics - # self.setup_kdl() - - - # def setup_kdl(self): - # """ - # Kdl Setup for forward and Inverse Kinematics - # """ - # import PyKDL - # from urdf_parser_py.urdf import URDF - # self.joints = {'joint_fake':0} - # self.head_joints = {'joint_fake':0} - - # # Loading URDF and listing the internediate joints from base to gripper - # robot_model = URDF.from_xml_file(self.urdf_path) - # self.kdl_tree = kdl_tree_from_urdf_model(robot_model) - # self.arm_chain = self.kdl_tree.getChain('base_link', self.end_link) - # self.joint_array = PyKDL.JntArray(self.arm_chain.getNrOfJoints()) - - # # Forward kinematics - # self.fk_p_kdl = PyKDL.ChainFkSolverPos_recursive(self.arm_chain) - # # Inverse Kinematics - # self.ik_v_kdl = PyKDL.ChainIkSolverVel_pinv(self.arm_chain) - # self.ik_p_kdl = PyKDL.ChainIkSolverPos_NR(self.arm_chain, self.fk_p_kdl, self.ik_v_kdl) + self.clamp = lambda n, minn, maxn: max(min(maxn, n), minn) def set_end_link(self, link): @@ -182,23 +154,6 @@ def updateJoints(self): # Head Joints pan, tilt = self.robot.get_pan_tilt() - - # self.joints['joint_fake'] = origin_dist - # self.joints['joint_lift'] = state[1] - # armPos = state[2] - # self.joints['3'] = armPos / 4.0 - # self.joints['2'] = armPos / 4.0 - # self.joints['1'] = armPos / 4.0 - # self.joints['0'] = armPos / 4.0 - # self.joints['joint_wrist_yaw'] = state[3] - # self.joints['joint_wrist_roll'] = state[5] - # self.joints['joint_wrist_pitch'] = OVERRIDE_STATES.get('wrist_pitch', state[4]) - - # self.joints['joint_gripper_finger_left'] = 0 - - # self.head_joints['joint_fake'] = origin_dist - # self.head_joints['joint_head_pan'] = pan - # self.head_joints['joint_head_tilt'] = tilt self.joints_pin['joint_fake'] = origin_dist self.joints_pin['joint_lift'] = state[1] @@ -216,7 +171,7 @@ def updateJoints(self): self.joints_pin['joint_head_tilt'] = tilt # following function is used to move the robot to a desired joint configuration - def move_to_joints(self, joints, gripper, mode=0, velocities = None): + def move_to_joints(self, joints, gripper, mode=0): """ Given the desrired joints movement this fucntion will the joints accordingly """ @@ -244,7 +199,6 @@ def move_to_joints(self, joints, gripper, mode=0, velocities = None): target1[1] = target_state[1] self.robot.arm_to(target1, blocking = True) - # self.robot.arm_to(target_state, velocities = velocities, blocking = 
True) self.robot.arm_to(target_state, blocking = True) print(f"current state {self.robot.get_six_joints()}") print(f"target state {target_state}") @@ -263,116 +217,34 @@ def get_joint_transform(self, node1, node2): # return frame_transform, frame2, frame1 self.updateJoints() - print('joints pin', self.joints_pin) frame_pin = self.robot.get_frame_pose(self.joints_pin, node1, node2) - # import PyKDL - - # # Intializing chain -> maintains list of nodes from base link to corresponding nodes - # chain1 = self.kdl_tree.getChain('base_link', node1) - # chain2 = self.kdl_tree.getChain('base_link', node2) - - # # Intializing corresponding joint array and forward chain solvers - # joint_array1 = PyKDL.JntArray(chain1.getNrOfJoints()) - # joint_array2 = PyKDL.JntArray(chain2.getNrOfJoints()) - - # fk_p_kdl1 = PyKDL.ChainFkSolverPos_recursive(chain1) - # fk_p_kdl2 = PyKDL.ChainFkSolverPos_recursive(chain2) - - # if node1 == TOP_CAMERA_NODE: - # ref_joints1 = self.head_joints - # ref_joint1_list = self.head_joint_list - # else: - # ref_joints1 = self.joints - # ref_joint1_list = self.joint_list - - # # Updating the joint arrays from self.joints - # for joint_index in range(joint_array1.rows()): - # joint_array1[joint_index] = ref_joints1[ref_joint1_list[joint_index]] - - # for joint_index in range(joint_array2.rows()): - # joint_array2[joint_index] = self.joints[self.joint_list[joint_index]] - - # # Intializing frames corresponding to nodes - # frame1 = PyKDL.Frame() - # frame2 = PyKDL.Frame() - - # # Calculating current frames of nodes - # print('joint_array1', joint_array1) - # print('joint_array2', joint_array2) - # print('self.joints', self.joints) - # print('self.head_joints', self.head_joints) - # fk_p_kdl1.JntToCart(joint_array1, frame1) - # fk_p_kdl2.JntToCart(joint_array2, frame2) - - # # This allows to transform a point in frame1 to frame2 - # frame_transform = frame2.Inverse() * frame1 - # print('pin', frame_pin) - # print('pykdl', frame_transform) return frame_pin - - # def debug_pose(self): - # import PyKDL - # self.updateJoints() - # print(self.joints) - # for joint_index in range(self.joint_array.rows()): - # self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] - # print(self.joint_array) - # curr_pose = PyKDL.Frame() # Current pose of gripper in base frame - # self.fk_p_kdl.JntToCart(self.joint_array, curr_pose) - # print(f"cur pose {curr_pose}") - # pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link) - # pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] - # pin_curr_pose = pin.SE3(pin_rotation, pin_translation) - # print(f"pin curr pose {pin_curr_pose}") - def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode=0, velocities=None): + def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode=0): """ Function to move the gripper to a desired translation and rotation """ import PyKDL translation = [translation_tensor[0], translation_tensor[1], translation_tensor[2]] rotation = rotational_tensor - # print('translation and rotation', translation_tensor, rotational_tensor) self.updateJoints() - # for joint_index in range(self.joint_array.rows()): - # self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] - # print('self.joints', self.joints) - # print('self.joint_array', self.joint_array) - - # curr_pose = PyKDL.Frame() # Current pose of gripper in base frame - # del_pose = PyKDL.Frame() # Relative Movement of gripper - # self.fk_p_kdl.JntToCart(self.joint_array, 
curr_pose) q = self.robot.get_joint_positions() q[HelloStretchIdx.WRIST_PITCH] = OVERRIDE_STATES.get('wrist_pitch', q[HelloStretchIdx.WRIST_PITCH]) pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link, q = q) pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] pin_curr_pose = pin.SE3(pin_rotation, pin_translation) - # print(f"pykdl curr pose {curr_pose}") - # print(f"pin curr pose {pin_curr_pose}") rot_matrix = R.from_euler('xyz', rotation, degrees=False).as_matrix() - # del_rot = PyKDL.Rotation(PyKDL.Vector(rot_matrix[0][0], rot_matrix[1][0], rot_matrix[2][0]), - # PyKDL.Vector(rot_matrix[0][1], rot_matrix[1][1], rot_matrix[2][1]), - # PyKDL.Vector(rot_matrix[0][2], rot_matrix[1][2], rot_matrix[2][2])) - # del_trans = PyKDL.Vector(translation[0], translation[1], translation[2]) - # del_pose.M = del_rot - # del_pose.p = del_trans - # goal_pose_new = curr_pose*del_pose - pin_del_pose = pin.SE3(np.array(rot_matrix), np.array(translation)) pin_goal_pose_new = pin_curr_pose * pin_del_pose - # print(f"pykdl del psoe {del_pose}") - # print(f"pykdl goal pose new {goal_pose_new}") - # print(f"pin del psoe {pin_del_pose}") - # print(f"pin goal pose new {pin_goal_pose_new}") final_pos = pin_goal_pose_new.translation.tolist() final_quat = pin.Quaternion(pin_goal_pose_new.rotation).coeffs().tolist() - print(f"final pos and quat {final_pos}\n {final_quat}") + # print(f"final pos and quat {final_pos}\n {final_quat}") full_body_cfg = self.robot.solve_ik( final_pos, final_quat, False, None, False, node_name = self.end_link @@ -384,24 +256,11 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode pin_joint_pos = self.robot._extract_joint_pos(full_body_cfg) transform_joint_pos = transform_joint_array(pin_joint_pos) - # Ik to calculate the required joint movements to move the gripper to desired pose - # seed_array = PyKDL.JntArray(self.arm_chain.getNrOfJoints()) - # self.ik_p_kdl.CartToJnt(seed_array, goal_pose_new, self.joint_array) - self.joint_array1 = transform_joint_pos - # print(f"joint array {self.joint_array}") - # print(f"pin joint pos {pin_joint_pos}") - # print(f"transformed joint pos {transform_joint_pos}") ik_joints = {} - # for joint_index in range(self.joint_array.rows()): for joint_index in range(len(self.joint_array1)): ik_joints[self.joint_list[joint_index]] = self.joint_array1[joint_index] # Actual Movement of joints - self.move_to_joints(ik_joints, gripper, move_mode, velocities) - - # self.updateJoints() - # # for joint_index in range(self.joint_array.rows()): - # for joint_index in range(len(self.joint_array1)): - # self.joint_array[joint_index] = self.joints[self.joint_list[joint_index]] + self.move_to_joints(ik_joints, gripper, move_mode) diff --git a/src/stretch/dynav/ok_robot_hw/utils/__init__.py b/src/stretch/dynav/ok_robot_hw/utils/__init__.py index 60bbf771c..c7a2ffae2 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/__init__.py +++ b/src/stretch/dynav/ok_robot_hw/utils/__init__.py @@ -1,4 +1,3 @@ from .grasper_utils import * -from .urdf_utils import * from .communication_utils import * from .utils import * \ No newline at end of file diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index 44151abaf..193b2fd30 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -71,15 +71,11 @@ def move_to_point(robot, point, base_node, gripper_node, move_mode=1, pitch_rota """ Function for 
moving the gripper to a specific point """ - # import PyKDL - # rotation = PyKDL.Rotation(1, 0, 0, 0, 1, 0, 0, 0, 1) rotation = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) - # dest_frame = PyKDL.Frame(rotation, point) dest_frame = pin.SE3(rotation, point) - # transform, _, _ = robot.get_joint_transform(base_node, gripper_node) transform = robot.get_joint_transform(base_node, gripper_node) # Rotation from gripper frame frame to gripper frame @@ -108,158 +104,86 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height 3. Move the base such gripper in line with the grasp 4. Gradually Move the gripper to the desired position """ - # import PyKDL # Transforming the final point from Model camera frame to robot camera frame - # point = PyKDL.Vector(-translation[1], -translation[0], translation[2]) pin_point = np.array([-translation[1], -translation[0], translation[2]]) # Rotation from Camera frame to Model frame - # rotation1_bottom = PyKDL.Rotation(0.0000000, -1.0000000, 0.0000000, - # -1.0000000, 0.0000000, 0.0000000, - # 0.0000000, 0.0000000, 1.0000000) rotation1_bottom_mat = np.array([[0.0000000, -1.0000000, 0.0000000], [-1.0000000, 0.0000000, 0.0000000], [0.0000000, 0.0000000, 1.0000000]]) # Rotation from model frame to pose frame - # rotation1 = PyKDL.Rotation(rotation[0][0], rotation[0][1], rotation[0][2], - # rotation[1][0], rotation[1][1], rotation[1][2], - # rotation[2][0], rotation[2][1], rotation[2][2]) rotation1_mat = np.array([[rotation[0][0], rotation[0][1], rotation[0][2]], [rotation[1][0], rotation[1][1], rotation[1][2]], [rotation[2][0], rotation[2][1], rotation[2][2]]]) # Rotation from camera frame to pose frame - # rotation = rotation1_bottom * rotation1 pin_rotation = np.dot(rotation1_bottom_mat, rotation1_mat) - # print(f"rotation {rotation}") - print(f"pin rotation{pin_rotation}") + # print(f"pin rotation{pin_rotation}") # Relative rotation and translation of grasping point relative to camera # dest_frame = PyKDL.Frame(rotation, point) pin_dest_frame = pin.SE3(np.array(pin_rotation), np.array(pin_point)) - # print(f"dest frame {dest_frame}") - print(f"pin dest frame {pin_dest_frame}") + # print(f"pin dest frame {pin_dest_frame}") # Camera to gripper frame transformation - # cam2gripper_transform, pin_cam2gripper_transform, _, _ = robot.get_joint_transform(base_node, gripper_node) pin_cam2gripper_transform = robot.get_joint_transform(base_node, gripper_node) - # del_pose = PyKDL.Frame() - # del_rot = PyKDL.Rotation(PyKDL.Vector(pin_cam2gripper_transform.rotation[0][0], pin_cam2gripper_transform.rotation[1][0], pin_cam2gripper_transform.rotation[2][0]), - # PyKDL.Vector(pin_cam2gripper_transform.rotation[0][1], pin_cam2gripper_transform.rotation[1][1], pin_cam2gripper_transform.rotation[2][1]), - # PyKDL.Vector(pin_cam2gripper_transform.rotation[0][2], pin_cam2gripper_transform.rotation[1][2], pin_cam2gripper_transform.rotation[2][2])) - # del_trans = PyKDL.Vector(pin_cam2gripper_transform.translation[0], pin_cam2gripper_transform.translation[1], pin_cam2gripper_transform.translation[2]) - # del_pose.M = del_rot - # del_pose.p = del_trans - # transformed_frame = del_pose * dest_frame pin_transformed_frame = pin_cam2gripper_transform * pin_dest_frame - # print(f"transformed frame {transformed_frame}") - print(f"pin_transformed frame {pin_transformed_frame}") - + # print(f"pin_transformed frame {pin_transformed_frame}") # Lifting the arm to high position as part of pregrasping position robot.move_to_position(lift_pos = 1.05, gripper_pos = 
gripper_width, head_pan = None, head_tilt = None) - # time.sleep(2) # Rotation for aligning Robot gripper frame to Model gripper frame - # rotation2_top = PyKDL.Rotation(0, 0, 1, 1, 0, 0, 0, -1, 0) rotation2_top_mat = np.array([[0, 0, 1], [1, 0, 0], [0, -1, 0]]) # final Rotation of gripper to hold the objet - # final_rotation = transformed_frame.M * rotation2_top pin_final_rotation = np.dot(pin_transformed_frame.rotation, rotation2_top_mat) - # print(f"final rotation - {final_rotation}") print(f"pin final rotation {pin_final_rotation}") - print("Rotation wrist with sleep of 2s") - # robot.move_to_pose( - # [0, 0, 0], - # [final_rotation.GetRPY()[0], final_rotation.GetRPY()[1], final_rotation.GetRPY()[2]], - # [1], - # ) rpy_angles = pin.rpy.matrixToRpy(pin_final_rotation) - # print(f"rpy angles {final_rotation.GetRPY}") - # print(f"pin rpy angles {rpy_angles}") - # robot.move_to_position(gripper_pos = gripper_width) robot.move_to_pose( [0, 0, 0], [rpy_angles[0], rpy_angles[1], rpy_angles[2]], [1], ) - # time.sleep(1) # Final grasping point relative to camera - # cam2gripper_transform, pin_cam2gripper_transform, _, _ = robot.get_joint_transform(base_node, gripper_node) pin_cam2gripper_transform = robot.get_joint_transform(base_node, gripper_node) - # del_pose = PyKDL.Frame() - # del_rot = PyKDL.Rotation(PyKDL.Vector(pin_cam2gripper_transform.rotation[0][0], pin_cam2gripper_transform.rotation[1][0], pin_cam2gripper_transform.rotation[2][0]), - # PyKDL.Vector(pin_cam2gripper_transform.rotation[0][1], pin_cam2gripper_transform.rotation[1][1], pin_cam2gripper_transform.rotation[2][1]), - # PyKDL.Vector(pin_cam2gripper_transform.rotation[0][2], pin_cam2gripper_transform.rotation[1][2], pin_cam2gripper_transform.rotation[2][2])) - # del_trans = PyKDL.Vector(pin_cam2gripper_transform.translation[0], pin_cam2gripper_transform.translation[1], pin_cam2gripper_transform.translation[2]) - # del_pose.M = del_rot - # del_pose.p = del_trans - # transformed_point1 = del_pose * point pin_transformed_point1 = apply_se3_transform(pin_cam2gripper_transform, pin_point) - # print(f"transformed point1 {transformed_point1}") - print(f"pin transformed point1 {pin_transformed_point1}") + # print(f"pin transformed point1 {pin_transformed_point1}") # Final grasping point relative to base - # cam2base_transform, pin_cam2base_transform, _, _ = robot.get_joint_transform(base_node, 'base_link') pin_cam2base_transform = robot.get_joint_transform(base_node, 'base_link') - # del_pose = PyKDL.Frame() - # del_rot = PyKDL.Rotation(PyKDL.Vector(pin_cam2base_transform.rotation[0][0], pin_cam2base_transform.rotation[1][0], pin_cam2base_transform.rotation[2][0]), - # PyKDL.Vector(pin_cam2base_transform.rotation[0][1], pin_cam2base_transform.rotation[1][1], pin_cam2base_transform.rotation[2][1]), - # PyKDL.Vector(pin_cam2base_transform.rotation[0][2], pin_cam2base_transform.rotation[1][2], pin_cam2base_transform.rotation[2][2])) - # del_trans = PyKDL.Vector(pin_cam2base_transform.translation[0], pin_cam2base_transform.translation[1], pin_cam2base_transform.translation[2]) - # del_pose.M = del_rot - # del_pose.p = del_trans - # base_point = del_pose * point pin_base_point = apply_se3_transform(pin_cam2base_transform, pin_point) - # print(f"base point {base_point}") - print(f"pin base point {pin_base_point}") + # print(f"pin base point {pin_base_point}") diff_value = (0.228 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip pin_transformed_point1[2] -= (diff_value) ref_diff 
= (diff_value) # Moving gripper to a point that is 0.2m away from the pose center in the line of gripper - print("Moving to 0.2m away point with 4s sleep") - # robot.move_to_pose( - # [transformed_point1.x(), transformed_point1.y(), transformed_point1.z() - 0.2], - # [0, 0, 0], - # [1], - # move_mode = 1 - # ) robot.move_to_pose( [pin_transformed_point1[0], pin_transformed_point1[1], pin_transformed_point1[2] - 0.2], [0, 0, 0], [1], move_mode = 1 ) - # time.sleep(4) # Z-Axis of link_straight_gripper points in line of gripper # So, the z co-ordiante of point w.r.t gripper gives the distance of point from gripper - # base2gripper_transform, pin_base2gripper_transform, _, _ = robot.get_joint_transform('base_link', gripper_node) pin_base2gripper_transform = robot.get_joint_transform('base_link', gripper_node) - # transformed_point2 = base2gripper_transform * base_point pin_transformed_point2 = apply_se3_transform(pin_base2gripper_transform, pin_base_point) - # print(f"transformed point2 : {transformed_point2}") - # print(f"pin transformed point2 : {pin_transformed_point2}") - # curr_diff = transformed_point2.z() curr_diff = pin_transformed_point2[2] # The distance between gripper and point is covered gradullay to allow for velocity control when it approaches the object # Lower velocity helps is not topping the light objects diff = abs(curr_diff - ref_diff) - velocities = [1.0]*8 - velocities[5:] = [0.03, 0.03, 0.03, 0.03] - velocities[0] = 0.03 if diff > 0.06: dist = diff - 0.06 robot.move_to_pose( @@ -267,9 +191,6 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height [0, 0, 0], [1] ) - # time.sleep(2) - # base2gripper_transform, _, _ = robot.get_joint_transform('base_link', gripper_node) - # print(f"transformed point3 : {base2gripper_transform * base_point}") diff = diff - dist while diff > 0.01: @@ -278,38 +199,18 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height robot.move_to_pose( [0, 0, dist], [0, 0, 0], - [1], - velocities=velocities + [1] ) - # time.sleep(2) - # base2gripper_transform, _, _ = robot.get_joint_transform('base_link', gripper_node) - # print(f"transformed point3 : {base2gripper_transform * base_point}") diff = diff - dist # Now the gripper reached the grasping point and starts picking procedure robot.pickup(gripper_width) # Lifts the arm - robot.move_to_position(lift_pos = robot.robot.get_six_joints()[1] + 0.2) - # time.sleep(1) + robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1] + 0.2, 1.1)) # Tucks the gripper so that while moving to place it wont collide with any obstacles robot.move_to_position(arm_pos = 0.01) - # time.sleep(1) robot.move_to_position(wrist_pitch = 0.0) - # time.sleep(1) robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.9), wrist_yaw = 2.5) robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.6)) - # time.sleep(1) - - # rotate the arm wrist onto the base - # if abs(robot.robot.get_six_joints()[3] - 2.0) > 0.1: - # robot.move_to_position(wrist_yaw = -2.0) - # # time.sleep(1) - - # # Put down the arm - # robot.move_to_position(lift_pos = 0.85) - # if abs(robot.robot.get_six_joints()[3] - 2.0) < 0.1: - # robot.move_to_position(wrist_yaw = 2.5) - # robot.move_to_position(lift_pos = 0.6) - # time.sleep(1) diff --git a/src/stretch/dynav/ok_robot_hw/utils/urdf_utils.py b/src/stretch/dynav/ok_robot_hw/utils/urdf_utils.py deleted file mode 100644 index 7cad7f231..000000000 --- a/src/stretch/dynav/ok_robot_hw/utils/urdf_utils.py +++ 
/dev/null @@ -1,81 +0,0 @@ -import numpy as np -# import PyKDL - -def euler_to_quat(r, p, y): - sr, sp, sy = np.sin(r/2.0), np.sin(p/2.0), np.sin(y/2.0) - cr, cp, cy = np.cos(r/2.0), np.cos(p/2.0), np.cos(y/2.0) - return [sr*cp*cy - cr*sp*sy, - cr*sp*cy + sr*cp*sy, - cr*cp*sy - sr*sp*cy, - cr*cp*cy + sr*sp*sy] - -def urdf_joint_to_kdl_joint(jnt): - import PyKDL as kdl - origin_frame = urdf_pose_to_kdl_frame(jnt.origin) - if jnt.joint_type == 'fixed': - if hasattr(kdl.Joint, 'Fixed'): - return kdl.Joint(jnt.name, kdl.Joint.Fixed) - else: - return kdl.Joint(jnt.name, getattr(kdl.Joint, 'None')) - axis = kdl.Vector(*jnt.axis) - if jnt.joint_type == 'revolute': - return kdl.Joint(jnt.name, origin_frame.p, - origin_frame.M * axis, kdl.Joint.RotAxis) - if jnt.joint_type == 'continuous': - return kdl.Joint(jnt.name, origin_frame.p, - origin_frame.M * axis, kdl.Joint.RotAxis) - if jnt.joint_type == 'prismatic': - return kdl.Joint(jnt.name, origin_frame.p, - origin_frame.M * axis, kdl.Joint.TransAxis) - print("Unknown joint type: %s." % jnt.joint_type) - return kdl.Joint(jnt.name, kdl.Joint.Fixed) - -def urdf_pose_to_kdl_frame(pose): - import PyKDL as kdl - pos = [0., 0., 0.] - rot = [0., 0., 0.] - if pose is not None: - if pose.position is not None: - pos = pose.position - if pose.rotation is not None: - rot = pose.rotation - return kdl.Frame(kdl.Rotation.Quaternion(*euler_to_quat(*rot)), - kdl.Vector(*pos)) - -def urdf_inertial_to_kdl_rbi(i): - import PyKDL as kdl - origin = urdf_pose_to_kdl_frame(i.origin) - rbi = kdl.RigidBodyInertia(i.mass, origin.p, - kdl.RotationalInertia(i.inertia.ixx, - i.inertia.iyy, - i.inertia.izz, - i.inertia.ixy, - i.inertia.ixz, - i.inertia.iyz)) - return origin.M * rbi - -## -# Returns a PyKDL.Tree generated from a urdf_parser_py.urdf.URDF object. 
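# Illustrative check (not from the patch): the euler_to_quat helper deleted above
# returns a quaternion in (x, y, z, w) order for roll-pitch-yaw input. For reference,
# scipy's Rotation reproduces the same convention (up to the usual q / -q sign
# ambiguity), so nothing KDL-specific is lost by removing this file.
import numpy as np
from scipy.spatial.transform import Rotation as R

def euler_to_quat(r, p, y):
    sr, sp, sy = np.sin(r / 2.0), np.sin(p / 2.0), np.sin(y / 2.0)
    cr, cp, cy = np.cos(r / 2.0), np.cos(p / 2.0), np.cos(y / 2.0)
    return np.array([sr * cp * cy - cr * sp * sy,
                     cr * sp * cy + sr * cp * sy,
                     cr * cp * sy - sr * sp * cy,
                     cr * cp * cy + sr * sp * sy])

rpy = (0.1, -0.2, 0.3)
q_manual = euler_to_quat(*rpy)
q_scipy = R.from_euler("xyz", rpy).as_quat()   # extrinsic x-y-z == Rz(yaw) @ Ry(pitch) @ Rx(roll)
assert np.allclose(q_manual, q_scipy) or np.allclose(q_manual, -q_scipy)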
-def kdl_tree_from_urdf_model(urdf): - import PyKDL as kdl - root = urdf.get_root() - print(f"root -> {root}") - tree = kdl.Tree(root) - def add_children_to_tree(parent): - if parent in urdf.child_map: - for joint, child_name in urdf.child_map[parent]: - child = urdf.link_map[child_name] - if child.inertial is not None: - kdl_inert = urdf_inertial_to_kdl_rbi(child.inertial) - else: - kdl_inert = kdl.RigidBodyInertia() - kdl_jnt = urdf_joint_to_kdl_joint(urdf.joint_map[joint]) - kdl_origin = urdf_pose_to_kdl_frame(urdf.joint_map[joint].origin) - kdl_sgm = kdl.Segment(child_name, kdl_jnt, - kdl_origin, kdl_inert) - tree.addSegment(kdl_sgm, parent) - add_children_to_tree(child_name) - # print(f"{parent} -> {child_name}") - add_children_to_tree(root) - # print("\n\n\n") - return tree diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 865b0bfb5..1bc8bf2c9 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -213,7 +213,6 @@ def place(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): head_tilt=init_tilt) camera = RealSenseCamera(self.robot) - # time.sleep(2) rotation, translation = capture_and_process_image( camera = camera, mode = 'place', @@ -223,7 +222,6 @@ def place(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): if rotation is None: return False - print(rotation) # lift arm to the top before the robot extends the arm, prepare the pre-placing gripper pose self.manip_wrapper.move_to_position(lift_pos=1.05) @@ -235,7 +233,7 @@ def place(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): self.manip_wrapper.move_to_position(gripper_pos=1, blocking = True) # Lift the arm a little bit, and rotate the wrist roll of the robot in case the object attached on the gripper - self.manip_wrapper.move_to_position(lift_pos = self.manip_wrapper.robot.get_six_joints()[1] + 0.3) + self.manip_wrapper.move_to_position(lift_pos = min(self.manip_wrapper.robot.get_six_joints()[1] + 0.3, 1.1)) self.manip_wrapper.move_to_position(wrist_roll = 2.5, blocking = True) self.manip_wrapper.move_to_position(wrist_roll = -2.5, blocking = True) From 503c83d75e40a08c21c63ae61fcaac781405ff1f Mon Sep 17 00:00:00 2001 From: stretch-se3-3055 Date: Tue, 3 Sep 2024 15:45:28 -0700 Subject: [PATCH 008/410] change --- src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py | 6 +++--- src/stretch_ros2_bridge/launch/cameras.launch.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index 193b2fd30..93f818058 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -14,7 +14,7 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): image_publisher = ImagePublisher(camera, socket) # Centering the object - head_tilt_angles = [0, -0.1, 0.1] + head_tilt_angles = [0, -0.1] tilt_retries, side_retries = 1, 0 retry_flag = True head_tilt = INIT_HEAD_TILT @@ -43,7 +43,7 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): side_retries = 3 elif retry_flag == 2: - if (tilt_retries == 3): + if (tilt_retries == 2): if (side_retries == 0): hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) side_retries = 1 @@ -163,7 +163,7 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height pin_base_point = 
apply_se3_transform(pin_cam2base_transform, pin_point) # print(f"pin base point {pin_base_point}") - diff_value = (0.228 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip + diff_value = (0.227 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip pin_transformed_point1[2] -= (diff_value) ref_diff = (diff_value) diff --git a/src/stretch_ros2_bridge/launch/cameras.launch.py b/src/stretch_ros2_bridge/launch/cameras.launch.py index 6e7ab248a..e7d308e49 100644 --- a/src/stretch_ros2_bridge/launch/cameras.launch.py +++ b/src/stretch_ros2_bridge/launch/cameras.launch.py @@ -26,9 +26,9 @@ def generate_launch_description(): # "temporal_filter.enable": "True", # "disparity_filter.enable": "False", "device_type": "d435i", - "rgb_camera.color_profile": "640x480x30", - "depth_module.depth_profile": "640x480x30", - "depth_module.infra_profile": "640x480x30", + "rgb_camera.color_profile": "424x240x30", + "depth_module.depth_profile": "424x240x30", + "depth_module.infra_profile": "424x240x30", "enable_gyro": "true", "enable_accel": "true", "gyro_fps": "200", @@ -50,9 +50,9 @@ def generate_launch_description(): # "temporal_filter.enable": "True", # "disparity_filter.enable": "True", "device_type": "d405", - "rgb_camera.color_profile": "640x480x15", - "depth_module.depth_profile": "640x480x15", - "depth_module.color_profile": "640X480X15", + "rgb_camera.color_profile": "480x270x15", + "depth_module.depth_profile": "480x270x15", + "depth_module.color_profile": "480X270X15", # "rgb_camera.profile": "480x270x30", # "depth_module.profile": "480x270x30", "rgb_camera.enable_auto_exposure": "true", From b4ae7a81aed72d6745d47ff88ad178f0a73c8a08 Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Tue, 3 Sep 2024 18:53:12 -0400 Subject: [PATCH 009/410] update --- .gitignore | 4 +- src/stretch/agent/zmq_client.py | 2 +- src/stretch/config/default_planner.yaml | 4 +- src/stretch/config/dynav_config.yaml | 2 +- src/stretch/dynav/.gitignore | 36 + src/stretch/dynav/communication_util.py | 73 ++ src/stretch/dynav/config.yaml | 12 + src/stretch/dynav/mapping_utils/__init__.py | 8 + src/stretch/dynav/mapping_utils/a_star.py | 316 ++++++ src/stretch/dynav/mapping_utils/voxel.py | 1002 +++++++++++++++++ src/stretch/dynav/mapping_utils/voxel_map.py | 815 ++++++++++++++ .../dynav/mapping_utils/voxelized_pcd.py | 605 ++++++++++ src/stretch/dynav/ok_robot_hw/robot.py | 2 - .../dynav/ok_robot_hw/utils/grasper_utils.py | 1 - src/stretch/dynav/robot_agent_manip_mdp.py | 29 +- src/stretch/dynav/run_manip.py | 20 +- src/stretch/dynav/run_ok_nav.py | 19 +- src/stretch/dynav/scannet.py | 209 ++++ src/stretch/dynav/voxel_map_localizer.py | 233 ++++ src/stretch/dynav/voxel_map_server.py | 849 ++++++++++++++ src/stretch/requirements.txt | 66 ++ 21 files changed, 4256 insertions(+), 51 deletions(-) create mode 100644 src/stretch/dynav/.gitignore create mode 100644 src/stretch/dynav/communication_util.py create mode 100644 src/stretch/dynav/config.yaml create mode 100644 src/stretch/dynav/mapping_utils/__init__.py create mode 100644 src/stretch/dynav/mapping_utils/a_star.py create mode 100644 src/stretch/dynav/mapping_utils/voxel.py create mode 100644 src/stretch/dynav/mapping_utils/voxel_map.py create mode 100644 src/stretch/dynav/mapping_utils/voxelized_pcd.py create mode 100644 src/stretch/dynav/scannet.py create mode 100644 src/stretch/dynav/voxel_map_localizer.py create mode 100644 src/stretch/dynav/voxel_map_server.py 
create mode 100644 src/stretch/requirements.txt diff --git a/.gitignore b/.gitignore index 04f59a9ef..67db76c46 100644 --- a/.gitignore +++ b/.gitignore @@ -135,4 +135,6 @@ data/ log/ # install folder -install/ \ No newline at end of file +install/ + +debug*/ \ No newline at end of file diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index a21f44f98..141ec12b6 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -380,7 +380,7 @@ def head_to( logger.warning( f"Head tilt is restricted to be between {self._head_tilt_min} and {self._head_tilt_max} for safety: was{head_tilt}" ) - head_pan = np.clip(head_pan, -np.pi, 0) + head_pan = np.clip(head_pan, self._head_pan_min, self._head_pan_max) head_tilt = np.clip(head_tilt, -np.pi / 2, 0) next_action = {"head_to": [float(head_pan), float(head_tilt)], "manip_blocking": blocking} self.send_action(next_action, timeout=timeout) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index e06d837dd..bb611c02e 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -61,8 +61,8 @@ motion: # wrist_roll: 0.25 # wrist_pitch: 0.25 # wrist_yaw: 0.05 - head_pan: 0.01 - head_tilt: 0.01 + head_pan: 0.1 + head_tilt: 0.1 joint_thresholds: head_not_moving_tolerance: 1.0e-4 gripper_open_threshold: 0.3 diff --git a/src/stretch/config/dynav_config.yaml b/src/stretch/config/dynav_config.yaml index 1d567b5fe..57dd6a808 100644 --- a/src/stretch/config/dynav_config.yaml +++ b/src/stretch/config/dynav_config.yaml @@ -9,7 +9,7 @@ voxel_size: 0.1 # 0.05 seems too low obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.5 # Ignore things over this height (eg ceilings) -obs_min_density: 10 # This many points makes it an obstacle +obs_min_density: 1 # This many points makes it an obstacle exp_min_density: 1 # Padding diff --git a/src/stretch/dynav/.gitignore b/src/stretch/dynav/.gitignore new file mode 100644 index 000000000..79d0c80bb --- /dev/null +++ b/src/stretch/dynav/.gitignore @@ -0,0 +1,36 @@ +*.tar +*.pyc +/ok-robot-navigation/outputs/* +/ok-robot-navigation/sample_debug/* +/ok-robot-navigation/test/* +/outputs/* +*.pt +*.ply +**/*.egg-info/ +*.o +*.egg +.DS_Store +.TimeRecord +license +ok-robot-manipulation/src/checkpoints/* +ok-robot-manipulation/src/example_data/* +/src/sample_save_directory/* +*.pth +ok-robot-manipulation/pointnet2/* +ok-robot-hw/hab_stretch/* +ok-robot-hw/*.npy +ok-robot-hw/*.png + +# Ignore python build files. 
+build +dist +example_data + +# vim files +*.swp +*.swo +*.swn + +# Data files +*.r3d +*.npy \ No newline at end of file diff --git a/src/stretch/dynav/communication_util.py b/src/stretch/dynav/communication_util.py new file mode 100644 index 000000000..7680ed147 --- /dev/null +++ b/src/stretch/dynav/communication_util.py @@ -0,0 +1,73 @@ +import zmq +import numpy as np +import cv2 + +def load_socket(port_number): + context = zmq.Context() + socket = context.socket(zmq.REP) + socket.bind("tcp://*:" + str(port_number)) + + return socket + +def send_array(socket, A, flags=0, copy=True, track=False): + """send a numpy array with metadata""" + A = np.array(A) + md = dict( + dtype = str(A.dtype), + shape = A.shape, + ) + socket.send_json(md, flags|zmq.SNDMORE) + return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + +def recv_array(socket, flags=0, copy=True, track=False): + """recv a numpy array""" + md = socket.recv_json(flags=flags) + msg = socket.recv(flags=flags, copy=copy, track=track) + A = np.frombuffer(msg, dtype=md['dtype']) + return A.reshape(md['shape']) + +def send_rgb_img(socket, img): + img = img.astype(np.uint8) + encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] + _, img_encoded = cv2.imencode('.jpg', img, encode_param) + socket.send(img_encoded.tobytes()) + +def recv_rgb_img(socket): + img = socket.recv() + img = np.frombuffer(img, dtype=np.uint8) + img = cv2.imdecode(img, cv2.IMREAD_COLOR) + return img + +def send_depth_img(socket, depth_img): + depth_img = (depth_img * 1000).astype(np.uint16) + encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 3] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode('.png', depth_img, encode_param) + socket.send(depth_img_encoded.tobytes()) + +def recv_depth_img(socket): + depth_img = socket.recv() + depth_img = np.frombuffer(depth_img, dtype=np.uint8) + depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) + depth_img = (depth_img / 1000.) + return depth_img + +def send_everything(socket, rgb, depth, intrinsics, pose): + send_rgb_img(socket, rgb) + socket.recv_string() + send_depth_img(socket, depth) + socket.recv_string() + send_array(socket, intrinsics) + socket.recv_string() + send_array(socket, pose) + socket.recv_string() + +def recv_everything(socket): + rgb = recv_rgb_img(socket) + socket.send_string('') + depth = recv_depth_img(socket) + socket.send_string('') + intrinsics = recv_array(socket) + socket.send_string('') + pose = recv_array(socket) + socket.send_string('') + return rgb, depth, intrinsics, pose \ No newline at end of file diff --git a/src/stretch/dynav/config.yaml b/src/stretch/dynav/config.yaml new file mode 100644 index 000000000..33915d9e8 --- /dev/null +++ b/src/stretch/dynav/config.yaml @@ -0,0 +1,12 @@ +# pickle_file_name: debug/debug_2024-07-23_11-28-58.pkl +pickle_file_name: null + +rerun: true +open_communication: true +static: false + +min_depth: 0.25 +max_depth: 2.5 + +# Add this key +ui_server_address: "http://localhost:8080" diff --git a/src/stretch/dynav/mapping_utils/__init__.py b/src/stretch/dynav/mapping_utils/__init__.py new file mode 100644 index 000000000..5b9355132 --- /dev/null +++ b/src/stretch/dynav/mapping_utils/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
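# Usage sketch (not from the patch) for the ZMQ helpers in communication_util.py
# above: load_socket() binds a REP socket, so recv_everything() runs on the server
# side, while a client pairs send_everything() with a REQ socket that alternates
# send/recv in the same order. The port and array shapes below are placeholders.
import numpy as np
import zmq

def connect_socket(ip: str = "127.0.0.1", port: int = 5555) -> zmq.Socket:
    context = zmq.Context()
    sock = context.socket(zmq.REQ)     # counterpart of the REP socket from load_socket()
    sock.connect(f"tcp://{ip}:{port}")
    return sock

rgb = np.zeros((240, 424, 3), dtype=np.uint8)       # placeholder camera frame
depth = np.ones((240, 424), dtype=np.float32)       # meters; sent as uint16 millimeters
intrinsics = np.eye(3)
pose = np.eye(4)

# Assuming the stretch package from this repo is installed, the client side would be:
# from stretch.dynav.communication_util import send_everything
# client = connect_socket()
# send_everything(client, rgb, depth, intrinsics, pose)
# ...and the server side: rgb, depth, K, pose = recv_everything(load_socket(5555))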
+from .voxelized_pcd import VoxelizedPointcloud, scatter3d +from .voxel import SparseVoxelMap +from .voxel_map import SparseVoxelMapNavigationSpace +from .a_star import AStar diff --git a/src/stretch/dynav/mapping_utils/a_star.py b/src/stretch/dynav/mapping_utils/a_star.py new file mode 100644 index 000000000..853401b53 --- /dev/null +++ b/src/stretch/dynav/mapping_utils/a_star.py @@ -0,0 +1,316 @@ +import time +from random import random +from typing import Callable, List, Optional, Tuple, Set + +import numpy as np + +from stretch.motion import PlanResult +from stretch.dynav.mapping_utils import SparseVoxelMapNavigationSpace + +import heapq +import math +import torch + +class Node(): + """Stores an individual spot in the tree""" + + def __init__(self, state): + self.state = state + +def neighbors(pt: Tuple[int, int]) -> List[Tuple[int, int]]: + return [(pt[0] + dx, pt[1] + dy) for dx in range(-1, 2) for dy in range(-1, 2) if (dx, dy) != (0, 0)] + +class AStar(): + """Define RRT planning problem and parameters""" + + def __init__( + self, + space: SparseVoxelMapNavigationSpace, + ): + """Create RRT planner with configuration""" + self.space = space + self.reset() + + def compute_theta(self, cur_x, cur_y, end_x, end_y): + theta = 0 + if end_x == cur_x and end_y >= cur_y: + theta = np.pi / 2 + elif end_x == cur_x and end_y < cur_y: + theta = -np.pi / 2 + else: + theta = np.arctan((end_y - cur_y) / (end_x - cur_x)) + if end_x < cur_x: + theta = theta + np.pi + # move theta into [-pi, pi] range, for this function specifically, + # (theta -= 2 * pi) or (theta += 2 * pi) is enough + if theta > np.pi: + theta = theta - 2 * np.pi + if theta < np.pi: + theta = theta + 2 * np.pi + return theta + + def reset(self): + print('loading the up to date navigable map') + print('Wait') + obs, exp = self.space.voxel_map.get_2d_map() + print('up to date navigable map loaded') + self._navigable = ~obs & exp + self.start_time = time.time() + + def point_is_occupied(self, x: int, y: int) -> bool: + return not bool(self._navigable[x][y]) + + def to_pt(self, xy: Tuple[float, float]) -> Tuple[int, int]: + xy = torch.tensor([xy[0], xy[1]]) + pt = self.space.voxel_map.xy_to_grid_coords(xy) + return tuple(pt.tolist()) + + def to_xy(self, pt: Tuple[int, int]) -> Tuple[float, float]: + pt = torch.tensor([pt[0], pt[1]]) + xy = self.space.voxel_map.grid_coords_to_xy(pt) + return tuple(xy.tolist()) + + def compute_dis(self, a: Tuple[int, int], b: Tuple[int, int]): + return ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5 + + def compute_obstacle_punishment(self, a: Tuple[int, int], weight: int, avoid: int) -> float: + obstacle_punishment = 0 + # it is a trick, if we compute euclidean dis between a and every obstacle, + # this single compute_obstacle_punishment will be O(n) complexity + # so we just check a square of size (2*avoid) * (2*avoid) + # centered at a, which is O(1) complexity + for i in range(-avoid, avoid + 1): + for j in range(-avoid, avoid + 1): + if self.point_is_occupied(a[0] + i, a[1] + j): + b = [a[0] + i, a[1] + j] + obs_dis = ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5 + obstacle_punishment = max((weight / max(obs_dis, 1)), obstacle_punishment) + return obstacle_punishment + + # A* heuristic + def compute_heuristic(self, a: Tuple[int, int], b: Tuple[int, int], weight = 12, avoid = 3) -> float: + return self.compute_dis(a, b) + weight * self.compute_obstacle_punishment(a, weight, avoid)\ + + self.compute_obstacle_punishment(b, weight, avoid) + + def is_in_line_of_sight(self, start_pt: Tuple[int, int], 
end_pt: Tuple[int, int]) -> bool: + """Checks if there is a line-of-sight between two points. + + Implements using Bresenham's line algorithm. + + Args: + start_pt: The starting point. + end_pt: The ending point. + + Returns: + Whether there is a line-of-sight between the two points. + """ + + dx = end_pt[0] - start_pt[0] + dy = end_pt[1] - start_pt[1] + + if abs(dx) > abs(dy): + if dx < 0: + start_pt, end_pt = end_pt, start_pt + for x in range(start_pt[0], end_pt[0] + 1): + yf = start_pt[1] + (x - start_pt[0]) / dx * dy + for y in list({math.floor(yf), math.ceil(yf)}): + if self.point_is_occupied(x, y): + return False + + else: + if dy < 0: + start_pt, end_pt = end_pt, start_pt + for y in range(start_pt[1], end_pt[1] + 1): + xf = start_pt[0] + (y - start_pt[1]) / dy * dx + for x in list({math.floor(xf), math.ceil(xf)}): + if self.point_is_occupied(x, y): + return False + + return True + + def is_a_line(self, a, b, c): + if a[0] == b[0]: + return c[0] == a[0] + if b[0] == c[0]: + return False + return ((c[1] - b[1]) / (c[0] - b[0])) == ((b[1] - a[1]) / (b[0] - a[0])) + + def clean_path_for_xy(self, waypoints): + goal = waypoints[-1] + waypts = [self.to_pt(waypoint) for waypoint in waypoints] + waypts = self.clean_path(waypts) + waypoints = [self.to_xy(waypt) for waypt in waypts] + traj = [] + for i in range(len(waypoints) - 1): + theta = self.compute_theta(waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1]) + traj.append([waypoints[i][0], waypoints[i][1], float(theta)]) + traj.append([waypoints[-1][0], waypoints[-1][1], goal[-1]]) + return traj + + def clean_path(self, path) -> List[Tuple[int, int]]: + """Cleans up the final path. + + This implements a simple algorithm where, given some current position, + we find the last point in the path that is in line-of-sight, and then + we set the current position to that point. This is repeated until we + reach the end of the path. This is not particularly efficient, but + it's simple and works well enough. + + Args: + path: The path to clean up. + + Returns: + The cleaned up path. + """ + cleaned_path = [path[0]] + i = 0 + while i < len(path) - 1: + for j in range(len(path) - 1, i, -1): + if self.is_in_line_of_sight(path[i][:2], path[j][:2]): + break + else: + j = i + 1 + # Include the mid waypoint to avoid the collision + # if j - i >= 2: + # cleaned_path.append(path[(i + j) // 2]) + cleaned_path.append(path[j]) + i = j + return cleaned_path + + def get_unoccupied_neighbor(self, pt: Tuple[int, int], goal_pt = None) -> Tuple[int, int]: + if not self.point_is_occupied(*pt): + return pt + + # This is a sort of hack to deal with points that are close to an edge. + # If the start point is occupied, we check adjacent points until we get + # one which is unoccupied. If we can't find one, we throw an error. + neighbor_pts = neighbors(pt) + if goal_pt is not None: + goal_pt_non_null = goal_pt + neighbor_pts.sort(key=lambda n: self.compute_heuristic(n, goal_pt_non_null)) + for neighbor_pt in neighbor_pts: + if not self.point_is_occupied(*neighbor_pt): + return neighbor_pt + print("The robot might stand on a non navigable point, so check obstacle map and restart roslaunch") + return None + # raise ValueError("The robot might stand on a non navigable point, so check obstacle map and restart roslaunch") + + def get_reachable_points(self, start_pt: Tuple[int, int]) -> Set[Tuple[int, int]]: + """Gets all reachable points from a given starting point. 
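# Illustrative toy (not from the patch): the A* cost above is plain euclidean
# distance plus an obstacle-proximity penalty, found by scanning only a
# (2*avoid+1)^2 window around the point rather than every obstacle. A standalone
# numpy version on a made-up 10x10 grid, just to show the shape of the cost:
import numpy as np

occupied = np.zeros((10, 10), dtype=bool)
occupied[4:6, 4:6] = True                      # a 2x2 obstacle block

def obstacle_punishment(pt, weight=12, avoid=3):
    worst = 0.0
    for dx in range(-avoid, avoid + 1):
        for dy in range(-avoid, avoid + 1):
            x, y = pt[0] + dx, pt[1] + dy
            if 0 <= x < occupied.shape[0] and 0 <= y < occupied.shape[1] and occupied[x, y]:
                worst = max(worst, weight / max(np.hypot(dx, dy), 1))
    return worst

def heuristic(a, b, weight=12):
    euclid = np.hypot(a[0] - b[0], a[1] - b[1])
    return float(euclid + weight * obstacle_punishment(a) + obstacle_punishment(b))

print(heuristic((2, 2), (8, 8)))   # much larger than the bare euclidean distance (~8.5),
                                   # since (2, 2) sits within 3 cells of the obstacle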
+ + Args: + start_pt: The starting point + + Returns: + The set of all reachable points + """ + + self.reset() + start_pt = self.get_unoccupied_neighbor(start_pt) + if start_pt is None: + return set() + + reachable_points: set[Tuple[int, int]] = set() + to_visit = [start_pt] + while to_visit: + pt = to_visit.pop() + if pt in reachable_points: + continue + reachable_points.add(pt) + for new_pt in neighbors(pt): + if new_pt in reachable_points: + continue + if self.point_is_occupied(new_pt[0], new_pt[1]): + continue + to_visit.append(new_pt) + return reachable_points + + def run_astar( + self, + start_xy: Tuple[float, float], + end_xy: Tuple[float, float], + remove_line_of_sight_points: bool = False, + ) -> List[Tuple[float, float]]: + + start_pt, end_pt = self.to_pt(start_xy), self.to_pt(end_xy) + + # Checks that both points are unoccupied. + # start_pt = self.get_unoccupied_neighbor(start_pt, end_pt) + start_pt = self.get_unoccupied_neighbor(start_pt) + end_pt = self.get_unoccupied_neighbor(end_pt, start_pt) + print('A* formally starts ', time.time() - self.start_time, ' seconds after path planning starts') + if start_pt is None or end_pt is None: + return None + + # Implements A* search. + q = [(0, start_pt)] + came_from: dict = {start_pt: None} + cost_so_far: dict[Tuple[int, int], float] = {start_pt: 0.0} + while q: + _, current = heapq.heappop(q) + + if current == end_pt: + break + + for nxt in neighbors(current): + if self.point_is_occupied(nxt[0], nxt[1]): + continue + new_cost = cost_so_far[current] + self.compute_heuristic(current, nxt) + if nxt not in cost_so_far or new_cost < cost_so_far[nxt]: + cost_so_far[nxt] = new_cost + priority = new_cost + self.compute_heuristic(end_pt, nxt) + heapq.heappush(q, (priority, nxt)) # type: ignore + came_from[nxt] = current + + else: + return None + + # Reconstructs the path. + path = [] + current = end_pt + while current != start_pt: + path.append(current) + prev = came_from[current] + if prev is None: + break + current = prev + path.append(start_pt) + path.reverse() + + # Clean up the path. + if remove_line_of_sight_points: + path = self.clean_path(path) + + # return [start_xy] + [self.to_xy(pt) for pt in path[1:-1]] + [end_xy] + return [start_xy] + [self.to_xy(pt) for pt in path[1:]] + + def plan(self, start, goal, verbose: bool = True) -> PlanResult: + """plan from start to goal. creates a new tree. 
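# Illustrative sketch (not from the patch): plan() turns the A* waypoints into
# (x, y, theta) states by pointing each waypoint at its successor, which is what
# compute_theta does with arctan plus quadrant fixes; np.arctan2 gives the same
# heading directly. (Note that the final wrap-around in compute_theta checks
# `theta < np.pi` where `theta < -np.pi` appears to be intended; arctan2 avoids
# the issue.) The waypoints and goal heading below are made-up values.
import numpy as np

def heading(p, q):
    return float(np.arctan2(q[1] - p[1], q[0] - p[0]))

waypoints = [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0)]     # x, y in meters
goal_theta = np.pi / 2                               # desired final orientation
states = [[*waypoints[i], heading(waypoints[i], waypoints[i + 1])]
          for i in range(len(waypoints) - 1)]
states.append([*waypoints[-1], goal_theta])
print(states)   # [[0, 0, 0.0], [1, 0, pi/2], [1, 1, pi/2]]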
+ + Based on Caelan Garrett's code (MIT licensed): + https://github.com/caelan/motion-planners/blob/master/motion_planners/rrt.py + """ + # assert len(start) == self.space.dof, "invalid start dimensions" + # assert len(goal) == self.space.dof, "invalid goal dimensions" + # self.start_time = time.time() + self.reset() + if not self.space.is_valid(goal): + if verbose: + print("[Planner] invalid goal") + return PlanResult(False, reason = "[Planner] invalid goal") + # Add start to the tree + print('Start running A* ', time.time() - self.start_time, ' seconds after path planning starts') + waypoints = self.run_astar(start[:2], goal[:2]) + print('Finish running A* ', time.time() - self.start_time, ' seconds after path planning starts') + + if waypoints is None: + if verbose: + print("A* fails, check obstacle map") + return PlanResult(False, reason = "A* fails, check obstacle map") + trajectory = [] + for i in range(len(waypoints) - 1): + theta = self.compute_theta(waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1]) + trajectory.append(Node([waypoints[i][0], waypoints[i][1], float(theta)])) + trajectory.append(Node([waypoints[-1][0], waypoints[-1][1], goal[-1]])) + print('Finish computing theta ', time.time() - self.start_time, ' seconds after path planning starts') + return PlanResult(True, trajectory = trajectory) \ No newline at end of file diff --git a/src/stretch/dynav/mapping_utils/voxel.py b/src/stretch/dynav/mapping_utils/voxel.py new file mode 100644 index 000000000..b06733eb1 --- /dev/null +++ b/src/stretch/dynav/mapping_utils/voxel.py @@ -0,0 +1,1002 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import copy +import logging +import math +import pickle +from collections import namedtuple +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import open3d as open3d +import scipy +import skimage +import torch +import trimesh +from torch import Tensor + +from stretch.core.interfaces import Observations +from stretch.motion import Footprint, PlanResult, RobotModel +from stretch.utils.data_tools.dict import update +from stretch.utils.morphology import binary_dilation, binary_erosion, get_edges +from stretch.utils.point_cloud import create_visualization_geometries, numpy_to_pcd +from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates +from stretch.utils.visualization import create_disk +from stretch.dynav.mapping_utils import VoxelizedPointcloud, scatter3d + +import torch.nn.functional as F +import torchvision.transforms.functional as V +from torchvision import transforms +import os +import cv2 +import time + +from scipy.ndimage import gaussian_filter, maximum_filter + +Frame = namedtuple( + "Frame", + [ + "camera_pose", + "camera_K", + "xyz", + "rgb", + "feats", + "depth", + "base_pose", + "info", + "full_world_xyz", + "xyz_frame", + ], +) + +VALID_FRAMES = ["camera", "world"] + +DEFAULT_GRID_SIZE = [200, 200] + +logger = logging.getLogger(__name__) + + +def ensure_tensor(arr): + if isinstance(arr, np.ndarray): + return Tensor(arr) + if not isinstance(arr, Tensor): + raise ValueError(f"arr of unknown type ({type(arr)}) cannot be cast to Tensor") + + +class SparseVoxelMap(object): + """Create a voxel map object which captures 3d information. + + This class represents a 3D voxel map used for capturing environmental information. 
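# The heading attached to each waypoint in plan() is simply "face the next waypoint",
# with the requested goal orientation used on the last one. A minimal sketch using
# math.atan2, which is equivalent to the hand-rolled compute_theta():
import math

def attach_headings(waypoints, final_theta):
    traj = []
    for (x0, y0), (x1, y1) in zip(waypoints[:-1], waypoints[1:]):
        traj.append((x0, y0, math.atan2(y1 - y0, x1 - x0)))
    traj.append((waypoints[-1][0], waypoints[-1][1], final_theta))
    return traj

print(attach_headings([(0.0, 0.0), (1.0, 0.0), (1.0, 1.0)], final_theta=math.pi / 2))
# [(0.0, 0.0, 0.0), (1.0, 0.0, 1.570796...), (1.0, 1.0, 1.570796...)]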
It provides various parameters + for configuring the map's properties, such as resolution, feature dimensions. + + Attributes: + resolution (float): The size of a voxel in meters. + feature_dim (int): The size of feature embeddings to capture per-voxel point. + grid_size (Tuple[int, int]): The dimensions of the voxel grid (optional). + grid_resolution (float): The resolution of the grid (optional). + obs_min_height (float): The minimum height for observations. + obs_max_height (float): The maximum height for observations. + obs_min_density (float): The minimum density for observations. + smooth_kernel_size (int): The size of the smoothing kernel. + add_local_radius_points (bool): Whether to add local radius points. + remove_visited_from_obstacles(bool): subtract out observations potentially of the robot + local_radius (float): The radius for local points. + min_depth (float): The minimum depth for observations. + max_depth (float): The maximum depth for observations. + pad_obstacles (int): Padding for obstacles. + voxel_kwargs (Dict[str, Any]): Additional voxel configuration. + map_2d_device (str): The device for 2D mapping. + """ + + DEFAULT_INSTANCE_MAP_KWARGS = dict( + du_scale=1, + instance_association="bbox_iou", + log_dir_overwrite_ok=True, + mask_cropped_instances="False", + ) + + def __init__( + self, + resolution: float = 0.1, + feature_dim: int = 3, + grid_size: Tuple[int, int] = None, + grid_resolution: float = 0.1, + obs_min_height: float = 0.1, + obs_max_height: float = 1.5, + obs_min_density: float = 50, + exp_min_density: float = 5, + smooth_kernel_size: int = 2, + add_local_radius_points: bool = True, + remove_visited_from_obstacles: bool = False, + local_radius: float = 0.8, + min_depth: float = 0.25, + max_depth: float = 2.0, + pad_obstacles: int = 0, + voxel_kwargs: Dict[str, Any] = {}, + map_2d_device: str = "cpu", + use_median_filter: bool = False, + median_filter_size: int = 5, + median_filter_max_error: float = 0.01, + use_derivative_filter: bool = False, + derivative_filter_threshold: float = 0.5, + point_update_threshold: float = 0.9, + ): + """ + Args: + resolution(float): in meters, size of a voxel + feature_dim(int): size of feature embeddings to capture per-voxel point + """ + print('------------------------YOU ARE NOW RUNNING PEIQI VOXEL MAP CODES V3-----------------') + # TODO: We an use fastai.store_attr() to get rid of this boilerplate code + self.resolution = resolution + self.feature_dim = feature_dim + self.obs_min_height = obs_min_height + self.obs_max_height = obs_max_height + self.obs_min_density = obs_min_density + self.exp_min_density = exp_min_density + + # Smoothing kernel params + self.smooth_kernel_size = smooth_kernel_size + if self.smooth_kernel_size > 0: + self.smooth_kernel = torch.nn.Parameter( + torch.from_numpy(skimage.morphology.disk(smooth_kernel_size)) + .unsqueeze(0) + .unsqueeze(0) + .float(), + requires_grad=False, + ) + else: + self.smooth_kernel = None + + # Median filter params + self.median_filter_size = median_filter_size + self.use_median_filter = use_median_filter + self.median_filter_max_error = median_filter_max_error + + # Derivative filter params + self.use_derivative_filter = use_derivative_filter + self.derivative_filter_threshold = derivative_filter_threshold + self.point_update_threshold = point_update_threshold + + self.grid_resolution = grid_resolution + self.voxel_resolution = resolution + self.min_depth = min_depth + self.max_depth = max_depth + self.pad_obstacles = int(pad_obstacles) + + self.voxel_kwargs = 
voxel_kwargs + self.map_2d_device = map_2d_device + + if self.pad_obstacles > 0: + self.dilate_obstacles_kernel = torch.nn.Parameter( + torch.from_numpy(skimage.morphology.disk(self.pad_obstacles)) + .unsqueeze(0) + .unsqueeze(0) + .float(), + requires_grad=False, + ) + else: + self.dilate_obstacles_kernel = None + + # Add points with local_radius to the voxel map at (0,0,0) unless we receive lidar points + self._add_local_radius_points = add_local_radius_points + self._remove_visited_from_obstacles = remove_visited_from_obstacles + self.local_radius = local_radius + + # Create disk for mapping explored areas near the robot - since camera can't always see it + self._disk_size = np.ceil(self.local_radius / self.grid_resolution) + + self._visited_disk = torch.from_numpy( + create_disk(self._disk_size, (2 * self._disk_size) + 1) + ).to(map_2d_device) + + if grid_size is not None: + self.grid_size = [grid_size[0], grid_size[1]] + else: + self.grid_size = DEFAULT_GRID_SIZE + # Track the center of the grid - (0, 0) in our coordinate system + # We then just need to update everything when we want to track obstacles + self.grid_origin = Tensor(self.grid_size + [0], device=map_2d_device) // 2 + # Used to track the offset from our observations so maps dont use too much space + + # Used for tensorized bounds checks + self._grid_size_t = Tensor(self.grid_size, device=map_2d_device) + + # Init variables + self.reset() + + def reset(self) -> None: + """Clear out the entire voxel map.""" + self.observations = [] + # Create voxelized pointcloud + self.voxel_pcd = VoxelizedPointcloud( + voxel_size=self.voxel_resolution, + dim_mins=None, + dim_maxs=None, + feature_pool_method="mean", + **self.voxel_kwargs, + ) + + self._seq = 0 + self._2d_last_updated = -1 + # Create map here - just reset *some* variables + self.reset_cache() + + def reset_cache(self): + """Clear some tracked things""" + # Stores points in 2d coords where robot has been + self._visited = torch.zeros(self.grid_size, device=self.map_2d_device) + + self.voxel_pcd.reset() + + # Store 2d map information + # This is computed from our various point clouds + self._map2d = None + + def add_obs( + self, + obs: Observations, + camera_K: Optional[torch.Tensor] = None, + *args, + **kwargs, + ): + """Unpack an observation and convert it properly, then add to memory. Pass all other inputs into the add() function as provided.""" + rgb = self.fix_data_type(obs.rgb) + depth = self.fix_data_type(obs.depth) + xyz = self.fix_data_type(obs.xyz) + camera_pose = self.fix_data_type(obs.camera_pose) + base_pose = torch.from_numpy( + np.array([obs.gps[0], obs.gps[1], obs.compass[0]]) + ).float() + K = self.fix_data_type(obs.camera_K) if camera_K is None else camera_K + + self.add( + camera_pose=camera_pose, + xyz=xyz, + rgb=rgb, + depth=depth, + base_pose=base_pose, + camera_K=K, + *args, + **kwargs, + ) + + def add( + self, + camera_pose: Tensor, + rgb: Tensor, + xyz: Optional[Tensor] = None, + camera_K: Optional[Tensor] = None, + feats: Optional[Tensor] = None, + depth: Optional[Tensor] = None, + base_pose: Optional[Tensor] = None, + instance_image: Optional[Tensor] = None, + instance_classes: Optional[Tensor] = None, + instance_scores: Optional[Tensor] = None, + obs: Optional[Observations] = None, + xyz_frame: str = "camera", + **info, + ): + """Add this to our history of observations. Also update the current running map. 
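# With the defaults above (200x200 grid, 0.1 m resolution, origin at the grid center),
# metric coordinates and grid indices convert back and forth like this. A simplified
# 2D sketch of the grid_origin / grid_resolution mapping:
import torch

grid_size = [200, 200]       # cells
grid_resolution = 0.1        # meters per cell
grid_origin = torch.tensor(grid_size, dtype=torch.float32) // 2   # world (0, 0) sits at the grid center

def xy_to_grid(xy):
    return ((torch.as_tensor(xy, dtype=torch.float32) / grid_resolution) + grid_origin).int()

def grid_to_xy(ij):
    return (torch.as_tensor(ij, dtype=torch.float32) - grid_origin) * grid_resolution

print(xy_to_grid([1.25, -0.5]))   # tensor([112,  95], dtype=torch.int32)
print(grid_to_xy([112, 95]))      # tensor([ 1.2000, -0.5000])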
+ + Parameters: + camera_pose(Tensor): [4,4] cam_to_world matrix + rgb(Tensor): N x 3 color points + camera_K(Tensor): [3,3] camera instrinsics matrix -- usually pinhole model + xyz(Tensor): N x 3 point cloud points in camera coordinates + feats(Tensor): N x D point cloud features; D == 3 for RGB is most common + base_pose(Tensor): optional location of robot base + instance_image(Tensor): [H,W] image of ints where values at a pixel correspond to instance_id + instance_classes(Tensor): [K] tensor of ints where class = instance_classes[instance_id] + instance_scores: [K] of detection confidence score = instance_scores[instance_id] + # obs: observations + """ + # TODO: we should remove the xyz/feats maybe? just use observations as input? + # TODO: switch to using just Obs struct? + # Shape checking + assert rgb.ndim == 3 or rgb.ndim == 2, f"{rgb.ndim=}: must be 2 or 3" + if isinstance(rgb, np.ndarray): + rgb = torch.from_numpy(rgb) + if isinstance(camera_pose, np.ndarray): + camera_pose = torch.from_numpy(camera_pose) + if depth is not None: + assert ( + rgb.shape[:-1] == depth.shape + ), f"depth and rgb image sizes must match; got {rgb.shape=} {depth.shape=}" + assert xyz is not None or (camera_K is not None and depth is not None) + if xyz is not None: + assert ( + xyz.shape[-1] == 3 + ), "xyz must have last dimension = 3 for x, y, z position of points" + assert rgb.shape == xyz.shape, "rgb shape must match xyz" + # Make sure shape is correct here for xyz and any passed-in features + if feats is not None: + assert ( + feats.shape[-1] == self.feature_dim + ), f"features must match voxel feature dimenstionality of {self.feature_dim}" + assert ( + xyz.shape[0] == feats.shape[0] + ), "features must be available for each point" + else: + pass + if isinstance(xyz, np.ndarray): + xyz = torch.from_numpy(xyz) + if depth is not None: + assert depth.ndim == 2 or xyz_frame == "world" + if camera_K is not None: + assert camera_K.ndim == 2, "camera intrinsics K must be a 3x3 matrix" + assert ( + camera_pose.ndim == 2 + and camera_pose.shape[0] == 4 + and camera_pose.shape[1] == 4 + ), "Camera pose must be a 4x4 matrix representing a pose in SE(3)" + assert ( + xyz_frame in VALID_FRAMES + ), f"frame {xyz_frame} was not valid; should one one of {VALID_FRAMES}" + + # Apply a median filter to remove bad depth values when mapping and exploring + # This is not strictly necessary but the idea is to clean up bad pixels + if depth is not None and self.use_median_filter: + median_depth = torch.from_numpy( + scipy.ndimage.median_filter(depth, size=self.median_filter_size) + ) + median_filter_error = (depth - median_depth).abs() + + # Get full_world_xyz + if xyz is not None: + if xyz_frame == "camera": + full_world_xyz = ( + torch.cat([xyz, torch.ones_like(xyz[..., [0]])], dim=-1) + @ camera_pose.T + )[..., :3] + elif xyz_frame == "world": + full_world_xyz = xyz + else: + raise NotImplementedError(f"Unknown xyz_frame {xyz_frame}") + # trimesh.transform_points(xyz, camera_pose) + else: + full_world_xyz = unproject_masked_depth_to_xyz_coordinates( # Batchable! 
+ depth=depth.unsqueeze(0).unsqueeze(1), + pose=camera_pose.unsqueeze(0), + inv_intrinsics=torch.linalg.inv(camera_K[:3, :3]).unsqueeze(0), + ) + # add observations before we start changing things + self.observations.append( + Frame( + camera_pose, + camera_K, + xyz, + rgb, + feats, + depth, + base_pose, + info, + full_world_xyz, + xyz_frame=xyz_frame, + ) + ) + + valid_depth = torch.full_like(rgb[:, 0], fill_value=True, dtype=torch.bool) + if depth is not None: + valid_depth = (depth > self.min_depth) & (depth < self.max_depth) + + if self.use_derivative_filter: + edges = get_edges(depth, threshold=self.derivative_filter_threshold) + valid_depth = valid_depth & ~edges + + if self.use_median_filter: + valid_depth = ( + valid_depth + & (median_filter_error < self.median_filter_max_error).bool() + ) + + # Add to voxel grid + if feats is not None: + feats = feats[valid_depth].reshape(-1, feats.shape[-1]) + rgb = rgb[valid_depth].reshape(-1, 3) + world_xyz = full_world_xyz.view(-1, 3)[valid_depth.flatten()] + + # TODO: weights could also be confidence, inv distance from camera, etc + if world_xyz.nelement() > 0: + selected_indices = torch.randperm(len(world_xyz))[:int((1 - self.point_update_threshold) * len(world_xyz))] + if len(selected_indices) == 0: + return + if world_xyz is not None: + world_xyz = world_xyz[selected_indices] + if feats is not None: + feats = feats[selected_indices] + if rgb is not None: + rgb = rgb[selected_indices] + self.voxel_pcd.add(world_xyz, features=feats, rgb=rgb, weights=None) + + if self._add_local_radius_points: + # TODO: just get this from camera_pose? + self._update_visited(camera_pose[:3, 3].to(self.map_2d_device)) + if base_pose is not None: + self._update_visited(base_pose.to(self.map_2d_device)) + + # Increment sequence counter + self._seq += 1 + + def _update_visited(self, base_pose: Tensor): + """Update 2d map of where robot has visited""" + # Add exploration here + # Base pose can be whatever, going to assume xyt for now + map_xy = ((base_pose[:2] / self.grid_resolution) + self.grid_origin[:2]).int() + x0 = int(map_xy[0] - self._disk_size) + x1 = int(map_xy[0] + self._disk_size + 1) + y0 = int(map_xy[1] - self._disk_size) + y1 = int(map_xy[1] + self._disk_size + 1) + assert x0 >= 0 + assert y0 >= 0 + self._visited[x0:x1, y0:y1] += self._visited_disk + + def write_to_pickle(self, filename: str): + """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" + data = {} + data["camera_poses"] = [] + data["camera_K"] = [] + data["base_poses"] = [] + data["xyz"] = [] + data["world_xyz"] = [] + data["rgb"] = [] + data["depth"] = [] + data["feats"] = [] + for frame in self.observations: + # add it to pickle + # TODO: switch to using just Obs struct? + data["camera_poses"].append(frame.camera_pose) + data["base_poses"].append(frame.base_pose) + data["camera_K"].append(frame.camera_K) + data["xyz"].append(frame.xyz) + data["world_xyz"].append(frame.full_world_xyz) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + data["feats"].append(frame.feats) + for k, v in frame.info.items(): + if k not in data: + data[k] = [] + data[k].append(v) + ( + data["combined_xyz"], + data["combined_feats"], + data["combined_weights"], + data["combined_rgb"], + ) = self.voxel_pcd.get_pointcloud() + with open(filename, "wb") as f: + pickle.dump(data, f) + + def write_to_pickle_add_data(self, filename: str, newdata: dict): + """Write out to a pickle file. 
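# The depth cleanup in add() keeps only pixels inside [min_depth, max_depth] that also
# agree with their local median (speckle / flying-pixel rejection). A standalone sketch
# of that masking on a fake depth image:
import scipy.ndimage
import torch

min_depth, max_depth = 0.25, 2.0
median_filter_size, median_filter_max_error = 5, 0.01

depth = torch.rand(8, 8) * 3.0   # fake depth image in meters
median = torch.from_numpy(scipy.ndimage.median_filter(depth.numpy(), size=median_filter_size))
median_error = (depth - median).abs()

valid_depth = (depth > min_depth) & (depth < max_depth)
valid_depth &= (median_error < median_filter_max_error)
print(valid_depth.sum().item(), "of", valid_depth.numel(), "pixels kept")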
This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" + data = {} + data["camera_poses"] = [] + data["base_poses"] = [] + data["xyz"] = [] + data["rgb"] = [] + data["depth"] = [] + data["feats"] = [] + for key, value in newdata.items(): + data[key] = value + for frame in self.observations: + # add it to pickle + # TODO: switch to using just Obs struct? + data["camera_poses"].append(frame.camera_pose) + data["base_poses"].append(frame.base_pose) + data["xyz"].append(frame.xyz) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + data["feats"].append(frame.feats) + for k, v in frame.info.items(): + if k not in data: + data[k] = [] + data[k].append(v) + ( + data["combined_xyz"], + data["combined_feats"], + data["combined_weights"], + data["combined_rgb"], + ) = self.voxel_pcd.get_pointcloud() + with open(filename, "wb") as f: + pickle.dump(data, f) + + def fix_data_type(self, tensor) -> torch.Tensor: + """make sure tensors are in the right format for this model""" + # If its empty just hope we're handling that somewhere else + if tensor is None: + return None + # Conversions + if isinstance(tensor, np.ndarray): + tensor = torch.from_numpy(tensor) + # Data types + if isinstance(tensor, torch.Tensor): + return tensor.float() + else: + raise NotImplementedError("unsupported data type for tensor:", tensor) + + def read_from_pickle(self, filename: str, num_frames: int = -1): + """Read from a pickle file as above. Will clear all currently stored data first.""" + self.reset_cache() + if isinstance(filename, str): + filename = Path(filename) + assert filename.exists(), f"No file found at {filename}" + with filename.open("rb") as f: + data = pickle.load(f) + for i, ( + camera_pose, + xyz, + rgb, + feats, + depth, + base_pose, + K, + world_xyz, + ) in enumerate( + zip( + data["camera_poses"], + data["xyz"], + data["rgb"], + data["feats"], + data["depth"], + data["base_poses"], + data["camera_K"], + data["world_xyz"], + ) + ): + # Handle the case where we dont actually want to load everything + if num_frames > 0 and i >= num_frames: + break + + camera_pose = self.fix_data_type(camera_pose) + xyz = self.fix_data_type(xyz) + rgb = self.fix_data_type(rgb) + depth = self.fix_data_type(depth) + if feats is not None: + feats = self.fix_data_type(feats) + base_pose = self.fix_data_type(base_pose) + self.add( + camera_pose=camera_pose, + xyz=xyz, + rgb=rgb, + feats=feats, + depth=depth, + base_pose=base_pose, + camera_K=K, + ) + + def recompute_map(self): + """Recompute the entire map from scratch instead of doing incremental updates. + This is a helper function which recomputes everything from the beginning. + + Currently this will be slightly inefficient since it recreates all the objects incrementally. 
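# Usage sketch: dump the map to a pickle for debugging and replay it into a fresh map.
# Assumes `voxel_map` is a SparseVoxelMap that has already received frames via add()
# or add_obs(); the filename is arbitrary.
from stretch.dynav.mapping_utils import SparseVoxelMap

voxel_map.write_to_pickle("debug_map.pkl")

replayed = SparseVoxelMap(resolution=0.1, grid_resolution=0.1)
replayed.read_from_pickle("debug_map.pkl", num_frames=-1)   # -1 replays every stored frame
points, rgb = replayed.get_xyz_rgb()
print(points.shape, rgb.shape)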
+ """ + old_observations = self.observations + self.reset_cache() + for frame in old_observations: + self.add( + frame.camera_pose, + frame.xyz, + frame.rgb, + frame.feats, + frame.depth, + frame.base_pose, + frame.obs, + **frame.info, + ) + + def get_2d_alignment_heuristics( + self, voxel_map_localizer, text, debug: bool = False + ): + if voxel_map_localizer.voxel_pcd._points is None: + return None + # Convert metric measurements to discrete + # Gets the xyz correctly - for now everything is assumed to be within the correct distance of origin + xyz, _, _, _ = voxel_map_localizer.voxel_pcd.get_pointcloud() + xyz = xyz.detach().cpu() + if xyz is None: + xyz = torch.zeros((0, 3)) + + device = xyz.device + xyz = ((xyz / self.grid_resolution) + self.grid_origin).long() + xyz[xyz[:, -1] < 0, -1] = 0 + + # Crop to robot height + min_height = int(self.obs_min_height / self.grid_resolution) + max_height = int(self.obs_max_height / self.grid_resolution) + grid_size = self.grid_size + [max_height] + + # Mask out obstacles only above a certain height + obs_mask = xyz[:, -1] < max_height + xyz = xyz[obs_mask, :] + alignments = voxel_map_localizer.find_alignment_over_model(text)[0].detach().cpu() + alignments = alignments[obs_mask][:, None] + + alignment_heuristics = scatter3d(xyz, alignments, grid_size, "max") + alignment_heuristics = torch.max(alignment_heuristics, dim=-1).values + alignment_heuristics = torch.from_numpy(maximum_filter(alignment_heuristics.numpy(), size = 7)) + return alignment_heuristics + + + def get_2d_map( + self, debug: bool = False, return_history_id: bool = False + ) -> Tuple[np.ndarray, ...]: + """Get 2d map with explored area and frontiers.""" + + # Is this already cached? If so we don't need to go to all this work + if self._map2d is not None and self._history_soft is not None and self._seq == self._2d_last_updated: + return self._map2d if not return_history_id else (*self._map2d, self._history_soft) + + # Convert metric measurements to discrete + # Gets the xyz correctly - for now everything is assumed to be within the correct distance of origin + xyz, _, counts, _ = self.voxel_pcd.get_pointcloud() + # print(counts) + if xyz is not None: + counts = torch.ones(xyz.shape[0]) + obs_ids = self.voxel_pcd._obs_counts + if xyz is None: + xyz = torch.zeros((0, 3)) + counts = torch.zeros((0)) + obs_ids = torch.zeros((0)) + + device = xyz.device + xyz = ((xyz / self.grid_resolution) + self.grid_origin + 0.5).long() + xyz[xyz[:, -1] < 0, -1] = 0 + + # Crop to robot height + min_height = int(self.obs_min_height / self.grid_resolution) + max_height = int(self.obs_max_height / self.grid_resolution) + grid_size = self.grid_size + [max_height] + voxels = torch.zeros(grid_size, device=device) + + # Mask out obstacles only above a certain height + obs_mask = xyz[:, -1] < max_height + xyz = xyz[obs_mask, :] + counts = counts[obs_mask][:, None] + # print(counts) + obs_ids = obs_ids[obs_mask][:, None] + + # voxels[x_coords, y_coords, z_coords] = 1 + voxels = scatter3d(xyz, counts, grid_size) + history_ids = scatter3d(xyz, obs_ids, grid_size, "max") + + # Compute the obstacle voxel grid based on what we've seen + obstacle_voxels = voxels[:, :, min_height:max_height] + obstacles_soft = torch.sum(obstacle_voxels, dim=-1) + obstacles = obstacles_soft > self.obs_min_density + + history_ids = history_ids[:, :, min_height:max_height] + history_soft = torch.max(history_ids, dim=-1).values + history_soft = torch.from_numpy(maximum_filter(history_soft.float().numpy(), size = 5)) + + if 
self._remove_visited_from_obstacles: + # Remove "visited" points containing observations of the robot + obstacles *= (1 - self._visited).bool() + + if self.dilate_obstacles_kernel is not None: + obstacles = binary_dilation( + obstacles.float().unsqueeze(0).unsqueeze(0), + self.dilate_obstacles_kernel, + )[0, 0].bool() + + # Explored area = only floor mass + # floor_voxels = voxels[:, :, :min_height] + explored_soft = torch.sum(voxels, dim=-1) + + # Add explored radius around the robot, up to min depth + # TODO: make sure lidar is supported here as well; if we do not have lidar assume a certain radius is explored + explored = explored_soft > self.exp_min_density + explored = (torch.zeros_like(explored) + self._visited).to(torch.bool) | explored + + # Also shrink the explored area to build more confidence + # That we will not collide with anything while moving around + # if self.dilate_obstacles_kernel is not None: + # explored = binary_erosion( + # explored.float().unsqueeze(0).unsqueeze(0), + # self.dilate_obstacles_kernel, + # )[0, 0].bool() + if self.smooth_kernel_size > 0: + # Opening and closing operations here on explore + explored = binary_erosion( + binary_dilation( + explored.float().unsqueeze(0).unsqueeze(0), self.smooth_kernel + ), + self.smooth_kernel, + ) #[0, 0].bool() + explored = binary_dilation( + binary_erosion(explored, self.smooth_kernel), + self.smooth_kernel, + )[0, 0].bool() + + # Obstacles just get dilated and eroded + # This might influence the obstacle size + # obstacles = binary_erosion( + # binary_dilation( + # obstacles.float().unsqueeze(0).unsqueeze(0), self.smooth_kernel + # ), + # self.smooth_kernel, + # )[0, 0].bool() + + if debug: + import matplotlib.pyplot as plt + + plt.subplot(2, 2, 1) + plt.imshow(obstacles_soft.detach().cpu().numpy()) + plt.title("obstacles soft") + plt.axis("off") + plt.subplot(2, 2, 2) + plt.imshow(explored_soft.detach().cpu().numpy()) + plt.title("explored soft") + plt.axis("off") + plt.subplot(2, 2, 3) + plt.imshow(obstacles.detach().cpu().numpy()) + plt.title("obstacles") + plt.axis("off") + plt.subplot(2, 2, 4) + plt.imshow(explored.detach().cpu().numpy()) + plt.axis("off") + plt.title("explored") + plt.show() + + # Update cache + obstacles[0:30, :] = True + obstacles[-30:, :] = True + obstacles[:, 0:30] = True + obstacles[:, -30:] = True + self._map2d = (obstacles, explored) + self._2d_last_updated = self._seq + self._history_soft = history_soft + if not return_history_id: + return obstacles, explored + else: + return obstacles, explored, history_soft + + def xy_to_grid_coords(self, xy: torch.Tensor) -> Optional[np.ndarray]: + """convert xy point to grid coords""" + if not isinstance(xy, torch.Tensor): + xy = torch.Tensor(xy) + assert xy.shape[-1] == 2, "coords must be Nx2 or 2d array" + # Handle convertion + if isinstance(xy, np.ndarray): + xy = torch.from_numpy(xy).float() + grid_xy = (xy / self.grid_resolution) + self.grid_origin[:2] + if torch.any(grid_xy >= self._grid_size_t) or torch.any( + grid_xy < torch.zeros(2) + ): + return None + else: + return grid_xy.int() + + def plan_to_grid_coords( + self, plan_result: PlanResult + ) -> Optional[List[torch.Tensor]]: + """Convert a plan properly into grid coordinates""" + if not plan_result.success: + return None + else: + traj = [] + for node in plan_result.trajectory: + traj.append(self.xy_to_grid_coords(node.state[:2])) + return traj + + def grid_coords_to_xy(self, grid_coords: torch.Tensor) -> np.ndarray: + """convert grid coordinate point to metric world xy point""" + assert 
grid_coords.shape[-1] == 2, "grid coords must be an Nx2 or 2d array" + return (grid_coords - self.grid_origin[:2]) * self.grid_resolution + + def grid_coords_to_xyt(self, grid_coords: np.ndarray) -> np.ndarray: + """convert grid coordinate point to metric world xyt point""" + res = torch.ones(3) + res[:2] = self.grid_coords_to_xy(grid_coords) + return res + + def show( + self, backend: str = "open3d", **backend_kwargs + ) -> Tuple[np.ndarray, np.ndarray]: + """Display the aggregated point cloud.""" + if backend == "open3d": + return self._show_open3d(**backend_kwargs) + else: + raise NotImplementedError( + f"Uknown backend {backend}, must be 'open3d' or 'pytorch3d" + ) + + def get_xyz_rgb(self) -> Tuple[torch.Tensor, torch.Tensor]: + """Return xyz and rgb of the current map""" + points, _, _, rgb = self.voxel_pcd.get_pointcloud() + return points, rgb + + def sample_from_mask(self, mask: torch.Tensor) -> Optional[np.ndarray]: + """Sample from any mask""" + valid_indices = torch.nonzero(mask, as_tuple=False) + if valid_indices.size(0) > 0: + random_index = torch.randint(valid_indices.size(0), (1,)) + return self.grid_coords_to_xy(valid_indices[random_index]) + else: + return None + + def xyt_is_safe(self, xyt: np.ndarray, robot: Optional[RobotModel] = None) -> bool: + """Check to see if a given xyt position is known to be safe.""" + if robot is not None: + raise NotImplementedError( + "not currently checking against robot base geometry" + ) + obstacles, explored = self.get_2d_map() + # Convert xy to grid coords + grid_xy = self.xy_to_grid_coords(xyt[:2]) + # Check to see if grid coords are explored and obstacle free + if grid_xy is None: + # Conversion failed - probably out of bounds + return False + obstacles, explored = self.get_2d_map() + # Convert xy to grid coords + grid_xy = self.xy_to_grid_coords(xyt[:2]) + # Check to see if grid coords are explored and obstacle free + if grid_xy is None: + # Conversion failed - probably out of bounds + return False + navigable = ~obstacles & explored + return bool(navigable[grid_xy[0], grid_xy[1]]) + # if robot is not None: + # # TODO: check against robot geometry + # raise NotImplementedError( + # "not currently checking against robot base geometry" + # ) + # return True + + def _get_boxes_from_points( + self, + traversible: torch.Tensor, + color: List[float], + is_map: bool = True, + height: float = 0.0, + offset: Optional[np.ndarray] = None, + ): + """Get colored boxes for a mask""" + # Get indices for all traversible locations + traversible_indices = np.argwhere(traversible) + # Traversible indices will be a 2xN array, so we need to transpose it. 
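# Obstacle padding used throughout the 2D map code: dilate the obstacle mask with a
# disk-shaped kernel. A small sketch mirroring the dilate_obstacles_kernel pattern,
# reusing the binary_dilation helper imported at the top of this file:
import skimage.morphology
import torch

from stretch.utils.morphology import binary_dilation

obstacles = torch.zeros(20, 20, dtype=torch.bool)
obstacles[10, 10] = True

kernel = torch.from_numpy(skimage.morphology.disk(2)).unsqueeze(0).unsqueeze(0).float()
padded = binary_dilation(obstacles.float().unsqueeze(0).unsqueeze(0), kernel)[0, 0].bool()
print(obstacles.sum().item(), "->", padded.sum().item())   # expect 1 -> 13 cells with a conv-based dilation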
+ # Set to floor/max obs height and bright red + if is_map: + traversible_pts = self.grid_coords_to_xy(traversible_indices.T) + else: + traversible_pts = ( + traversible_indices.T - np.ceil([d / 2 for d in traversible.shape]) + ) * self.grid_resolution + if offset is not None: + traversible_pts += offset + + geoms = [] + for i in range(traversible_pts.shape[0]): + center = np.array( + [ + traversible_pts[i, 0], + traversible_pts[i, 1], + self.obs_min_height + height, + ] + ) + dimensions = np.array( + [self.grid_resolution, self.grid_resolution, self.grid_resolution] + ) + + # Create a custom geometry with red color + mesh_box = open3d.geometry.TriangleMesh.create_box( + width=dimensions[0], height=dimensions[1], depth=dimensions[2] + ) + mesh_box.paint_uniform_color(color) # Set color to red + mesh_box.translate(center) + + # Visualize the red box + geoms.append(mesh_box) + return geoms + + def _get_open3d_geometries( + self, + orig: Optional[np.ndarray] = None, + norm: float = 255.0, + xyt: Optional[np.ndarray] = None, + footprint: Optional[Footprint] = None, + **backend_kwargs, + ): + """Show and return bounding box information and rgb color information from an explored point cloud. Uses open3d.""" + + # Create a combined point cloud + # pc_xyz, pc_rgb, pc_feats = self.get_data() + points, _, _, rgb = self.voxel_pcd.get_pointcloud() + pcd = numpy_to_pcd( + points.detach().cpu().numpy(), (rgb / norm).detach().cpu().numpy() + ) + if orig is None: + orig = np.zeros(3) + geoms = create_visualization_geometries(pcd=pcd, orig=orig) + + # Get the explored/traversible area + obstacles, explored = self.get_2d_map() + traversible = explored & ~obstacles + + geoms += self._get_boxes_from_points(traversible, [0, 1, 0]) + geoms += self._get_boxes_from_points(obstacles, [1, 0, 0]) + + if xyt is not None and footprint is not None: + geoms += self._get_boxes_from_points( + footprint.get_rotated_mask(self.grid_resolution, float(xyt[2])), + [0, 0, 1], + is_map=False, + height=0.1, + offset=xyt[:2], + ) + + return geoms + + def _get_o3d_robot_footprint_geometry( + self, + xyt: np.ndarray, + dimensions: Optional[np.ndarray] = None, + length_offset: float = 0, + ): + """Get a 3d mesh cube for the footprint of the robot. 
But this does not work very well for whatever reason.""" + # Define the dimensions of the cube + if dimensions is None: + dimensions = np.array( + [0.2, 0.2, 0.2] + ) # Cube dimensions (length, width, height) + + x, y, theta = xyt + # theta = theta - np.pi/2 + + # Create a custom mesh cube with the specified dimensions + mesh_cube = open3d.geometry.TriangleMesh.create_box( + width=dimensions[0], height=dimensions[1], depth=dimensions[2] + ) + + # Define the transformation matrix for position and orientation + rotation_matrix = np.array( + [ + [math.cos(theta), -math.sin(theta), 0], + [math.sin(theta), math.cos(theta), 0], + [0, 0, 1], + ] + ) + # Calculate the translation offset based on theta + length_offset = 0 + x_offset = length_offset - (dimensions[0] / 2) + y_offset = -1 * dimensions[1] / 2 + dx = (math.cos(theta) * x_offset) + (math.cos(theta - np.pi / 2) * y_offset) + dy = (math.sin(theta + np.pi / 2) * y_offset) + (math.sin(theta) * x_offset) + translation_vector = np.array( + [x + dx, y + dy, 0] + ) # Apply offset based on theta + transformation_matrix = np.identity(4) + transformation_matrix[:3, :3] = rotation_matrix + transformation_matrix[:3, 3] = translation_vector + + # Apply the transformation to the cube + mesh_cube.transform(transformation_matrix) + + # Set the color of the cube to blue + mesh_cube.paint_uniform_color([0.0, 0.0, 1.0]) # Set color to blue + + return mesh_cube + + def _show_open3d( + self, + orig: Optional[np.ndarray] = None, + norm: float = 255.0, + xyt: Optional[np.ndarray] = None, + footprint: Optional[Footprint] = None, + **backend_kwargs, + ): + """Show and return bounding box information and rgb color information from an explored point cloud. Uses open3d.""" + + # get geometries so we can use them + geoms = self._get_open3d_geometries( + orig, norm, xyt=xyt, footprint=footprint + ) + + # Show the geometries of where we have explored + open3d.visualization.draw_geometries(geoms) + + # Returns xyz and rgb for further inspection + points, _, _, rgb = self.voxel_pcd.get_pointcloud() + return points, rgb diff --git a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py new file mode 100644 index 000000000..8cd14c743 --- /dev/null +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -0,0 +1,815 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
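# The footprint mesh above is placed with a standard planar (SE(2)) pose embedded in a
# 4x4 homogeneous transform: rotate about z by theta, then translate by (x, y). A
# minimal sketch of that construction:
import math
import numpy as np

def se2_to_matrix(x, y, theta):
    T = np.eye(4)
    T[:3, :3] = np.array([
        [math.cos(theta), -math.sin(theta), 0.0],
        [math.sin(theta), math.cos(theta), 0.0],
        [0.0, 0.0, 1.0],
    ])
    T[:2, 3] = [x, y]
    return T

T = se2_to_matrix(1.0, 2.0, math.pi / 2)
front = T @ np.array([0.33, 0.0, 0.0, 1.0])   # a point ~0.33 m ahead of the base frame
print(np.round(front[:2], 3))                  # [1.   2.33]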
+import math +from typing import Optional, Tuple + +import matplotlib.pyplot as plt +import numpy as np +import skfmm +import skimage +import skimage.morphology +import torch + +from stretch.dynav.mapping_utils import SparseVoxelMap +from stretch.motion import XYT, Footprint +from stretch.utils.geometry import angle_difference, interpolate_angles +from stretch.utils.morphology import ( + binary_dilation, + binary_erosion, + expand_mask, + find_closest_point_on_mask, + get_edges, +) + +class SparseVoxelMapNavigationSpace(XYT): + """subclass for sampling XYT states from explored space""" + + # Used for making sure we do not divide by zero anywhere + tolerance: float = 1e-8 + + def __init__( + self, + voxel_map: SparseVoxelMap, + step_size: float = 0.1, + rotation_step_size: float = 0.5, + use_orientation: bool = False, + orientation_resolution: int = 64, + dilate_frontier_size: int = 12, + dilate_obstacle_size: int = 2, + extend_mode: str = "separate", + ): + print('------------------------YOU ARE NOW RUNNING PEIQI VOXEL NAVIGATION SPACE CODES-----------------') + self.step_size = step_size + self.rotation_step_size = rotation_step_size + self.voxel_map = voxel_map + self.create_collision_masks(orientation_resolution) + self.extend_mode = extend_mode + + # Always use 3d states + self.use_orientation = use_orientation + if self.use_orientation: + self.dof = 3 + else: + self.dof = 2 + + self._kernels = {} + + if dilate_frontier_size > 0: + self.dilate_explored_kernel = torch.nn.Parameter( + torch.from_numpy(skimage.morphology.disk(dilate_frontier_size)) + .unsqueeze(0) + .unsqueeze(0) + .float(), + requires_grad=False, + ) + else: + self.dilate_explored_kernel = None + if dilate_obstacle_size > 0: + self.dilate_obstacles_kernel = torch.nn.Parameter( + torch.from_numpy(skimage.morphology.disk(dilate_obstacle_size)) + .unsqueeze(0) + .unsqueeze(0) + .float(), + requires_grad=False, + ) + else: + self.dilate_obstacles_kernel = None + + def draw_state_on_grid( + self, img: np.ndarray, state: np.ndarray, weight: int = 10 + ) -> np.ndarray: + """Helper function to draw masks on image""" + grid_xy = self.voxel_map.xy_to_grid_coords(state[:2]) + mask = self.get_oriented_mask(state[2]) + x0 = int(np.round(float(grid_xy[0] - mask.shape[0] // 2))) + x1 = x0 + mask.shape[0] + y0 = int(np.round(float(grid_xy[1] - mask.shape[1] // 2))) + y1 = y0 + mask.shape[1] + img[x0:x1, y0:y1] += mask * weight + return img + + def create_collision_masks( + self, orientation_resolution: int, show_all: bool = False + ): + """Create a set of orientation masks + + Args: + orientation_resolution: number of bins to break it into + """ + self._footprint = Footprint(width=0.34, length=0.33, width_offset=0.0, length_offset=-0.1) + self._orientation_resolution = 64 + self._oriented_masks = [] + + # NOTE: this is just debug code - lets you see waht the masks look like + assert not show_all or orientation_resolution == 64 + + for i in range(orientation_resolution): + theta = i * 2 * np.pi / orientation_resolution + mask = self._footprint.get_rotated_mask( + self.voxel_map.grid_resolution, angle_radians=theta + ) + if show_all: + plt.subplot(8, 8, i + 1) + plt.axis("off") + plt.imshow(mask.cpu().numpy()) + self._oriented_masks.append(mask) + if show_all: + plt.show() + + def distance(self, q0: np.ndarray, q1: np.ndarray) -> float: + """Return distance between q0 and q1.""" + assert len(q0) == 3, "must use 3 dimensions for current state" + assert len(q1) == 3 or len(q1) == 2, "2 or 3 dimensions for goal" + if len(q1) == 3: + # 
Measure to the final position exactly + return np.linalg.norm(q0 - q1) + else: + # Measure only to the final goal x/y position + return np.linalg.norm(q0[:2] - q1[:2]) + + def extend(self, q0: np.ndarray, q1: np.ndarray) -> np.ndarray: + """extend towards another configuration in this space. Will be either separate or joint depending on if the robot can "strafe": + separate: move then rotate + joint: move and rotate all at once.""" + assert len(q0) == 3, f"initial configuration must be 3d, was {q0}" + assert ( + len(q1) == 3 or len(q1) == 2 + ), f"final configuration can be 2d or 3d, was {q1}" + if self.extend_mode == "separate": + return self._extend_separate(q0, q1) + elif self.extend_mode == "joint": + # Just default to linear interpolation, does not use rotation_step_size + return super().extend(q0, q1) + else: + raise NotImplementedError(f"not supported: {self.extend_mode=}") + + def _extend_separate( + self, q0: np.ndarray, q1: np.ndarray, xy_tol: float = 1e-8 + ) -> np.ndarray: + """extend towards another configuration in this space. + TODO: we can set the classes here, right now assuming still np.ndarray""" + assert len(q0) == 3, f"initial configuration must be 3d, was {q0}" + assert ( + len(q1) == 3 or len(q1) == 2 + ), f"final configuration can be 2d or 3d, was {q1}" + dxy = q1[:2] - q0[:2] + step = dxy / np.linalg.norm(dxy + self.tolerance) * self.step_size + xy = np.copy(q0[:2]) + goal_dxy = np.linalg.norm(q1[:2] - q0[:2]) + if ( + goal_dxy + > xy_tol + # or goal_dxy > self.step_size + # or angle_difference(q1[-1], q0[-1]) > self.rotation_step_size + ): + # Turn to new goal + # Compute theta looking at new goal point + new_theta = math.atan2(dxy[1], dxy[0]) + if new_theta < 0: + new_theta += 2 * np.pi + + # TODO: orient towards the new theta + cur_theta = q0[-1] + angle_diff = angle_difference(new_theta, cur_theta) + while angle_diff > self.rotation_step_size: + # Interpolate + cur_theta = interpolate_angles( + cur_theta, new_theta, self.rotation_step_size + ) + # print("interp ang =", cur_theta, "from =", cur_theta, "to =", new_theta) + yield np.array([xy[0], xy[1], cur_theta]) + angle_diff = angle_difference(new_theta, cur_theta) + + # First, turn in the right direction + next_pt = np.array([xy[0], xy[1], new_theta]) + # After this we should have finished turning + yield next_pt + + # Now take steps towards the right goal + while np.linalg.norm(xy - q1[:2]) > self.step_size: + xy = xy + step + yield np.array([xy[0], xy[1], new_theta]) + + # Update current angle + cur_theta = new_theta + + # Finish stepping to goal + xy[:2] = q1[:2] + yield np.array([xy[0], xy[1], cur_theta]) + else: + cur_theta = q0[-1] + + # now interpolate to goal angle + angle_diff = angle_difference(q1[-1], cur_theta) + while angle_diff > self.rotation_step_size: + # Interpolate + cur_theta = interpolate_angles(cur_theta, q1[-1], self.rotation_step_size) + yield np.array([xy[0], xy[1], cur_theta]) + angle_diff = angle_difference(q1[-1], cur_theta) + + # Get to final angle + yield np.array([xy[0], xy[1], q1[-1]]) + + # At the end, rotate into the correct orientation + yield q1 + + def _get_theta_index(self, theta: float) -> int: + """gets the index associated with theta here""" + if theta < 0: + theta += 2 * np.pi + if theta >= 2 * np.pi: + theta -= 2 * np.pi + assert ( + theta >= 0 and theta <= 2 * np.pi + ), "only angles between 0 and 2*PI allowed" + theta_idx = np.round((theta / (2 * np.pi) * self._orientation_resolution) - 0.5) + if theta_idx == self._orientation_resolution: + theta_idx = 0 + return 
int(theta_idx) + + def get_oriented_mask(self, theta: float) -> torch.Tensor: + theta_idx = self._get_theta_index(theta) + return self._oriented_masks[theta_idx] + + def is_valid( + self, + state: torch.Tensor, + is_safe_threshold=1.0, + debug: bool = False, + verbose: bool = False, + ) -> bool: + """Check to see if state is valid; i.e. if there's any collisions if mask is at right place""" + assert len(state) == 3 + if isinstance(state, np.ndarray): + state = torch.from_numpy(state).float() + ok = bool(self.voxel_map.xyt_is_safe(state[:2])) + # if verbose: + # print('is navigable:', ok) + if not ok: + # This was + return False + + # Now sample mask at this location + mask = self.get_oriented_mask(state[-1]) + assert mask.shape[0] == mask.shape[1], "square masks only for now" + dim = mask.shape[0] + half_dim = dim // 2 + grid_xy = self.voxel_map.xy_to_grid_coords(state[:2]) + x0 = int(grid_xy[0]) - half_dim + x1 = x0 + dim + y0 = int(grid_xy[1]) - half_dim + y1 = y0 + dim + + obstacles, explored = self.voxel_map.get_2d_map() + + crop_obs = obstacles[x0:x1, y0:y1] + crop_exp = explored[x0:x1, y0:y1] + assert mask.shape == crop_obs.shape + assert mask.shape == crop_exp.shape + + collision = torch.any(crop_obs & mask) + + p_is_safe = ( + torch.sum((crop_exp & mask) | ~mask) / (mask.shape[0] * mask.shape[1]) + ).item() + is_safe = p_is_safe >= is_safe_threshold + if verbose: + print(f"{collision=}, {is_safe=}, {p_is_safe=}, {is_safe_threshold=}") + + valid = bool((not collision) and is_safe) + if debug: + if collision: + print("- state in collision") + if not is_safe: + print("- not safe") + + print(f"{valid=}") + obs = obstacles.cpu().numpy().copy() + exp = explored.cpu().numpy().copy() + obs[x0:x1, y0:y1] = 1 + plt.subplot(321) + plt.imshow(obs) + plt.subplot(322) + plt.imshow(exp) + plt.subplot(323) + plt.imshow(crop_obs.cpu().numpy()) + plt.title("obstacles") + plt.subplot(324) + plt.imshow(crop_exp.cpu().numpy()) + plt.title("explored") + plt.subplot(325) + plt.imshow(mask.cpu().numpy()) + plt.show() + + return valid + + def compute_theta(self, cur_x, cur_y, end_x, end_y): + theta = 0 + if end_x == cur_x and end_y >= cur_y: + theta = np.pi / 2 + elif end_x == cur_x and end_y < cur_y: + theta = -np.pi / 2 + else: + theta = np.arctan((end_y - cur_y) / (end_x - cur_x)) + if end_x < cur_x: + theta = theta + np.pi + if theta > np.pi: + theta = theta - 2 * np.pi + if theta < -np.pi: + theta = theta + 2 * np.pi + return theta + + def sample_target_point( + self, + start: torch.Tensor, + point: torch.Tensor, + planner, + ) -> Optional[np.array]: + """Sample a position near the mask and return. 
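# Orientation lookup used by the collision masks: an angle is wrapped to [0, 2*pi) and
# snapped to one of orientation_resolution precomputed footprint masks. A sketch
# mirroring _get_theta_index():
import numpy as np

orientation_resolution = 64

def theta_index(theta):
    theta = theta % (2 * np.pi)
    idx = int(np.round(theta / (2 * np.pi) * orientation_resolution - 0.5))
    return 0 if idx == orientation_resolution else idx

print(theta_index(0.0))          # 0
print(theta_index(np.pi))        # 32
print(theta_index(-np.pi / 2))   # 48 (wraps to 3*pi/2)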
+ + Args: + look_at_any_point(bool): robot should look at the closest point on target mask instead of average pt + """ + + obstacles, explored = self.voxel_map.get_2d_map() + + # Extract edges from our explored mask + start_pt = planner.to_pt(start) + reachable_points = planner.get_reachable_points(start_pt) + if len(reachable_points) == 0: + print('No target point find, maybe no point is reachable') + return None + reachable_xs, reachable_ys = zip(*reachable_points) + reachable_xs = torch.tensor(reachable_xs) + reachable_ys = torch.tensor(reachable_ys) + reachable = torch.empty(obstacles.shape, dtype = torch.bool).fill_(False) + reachable[reachable_xs, reachable_ys] = True + + obstacles, explored = self.voxel_map.get_2d_map() + if self.dilate_obstacles_kernel is not None: + obstacles = binary_dilation( + obstacles.float().unsqueeze(0).unsqueeze(0), self.dilate_obstacles_kernel + )[0, 0].bool() + reachable = reachable & ~obstacles + + target_x, target_y = planner.to_pt(point) + + xs, ys = torch.where(reachable) + if len(xs) < 1: + print('No target point find, maybe no point is reachable') + return None + selected_targets = torch.stack([xs, ys], dim = -1) \ + [torch.linalg.norm( \ + (torch.stack([xs, ys], dim = -1) - torch.tensor([target_x, target_y])).float(), dim = -1 \ + ).topk(k = len(xs), largest = False).indices] + + # TODO: was this: + # expanded_mask = expanded_mask & less_explored & ~obstacles + + for selected_target in selected_targets: + selected_x, selected_y = planner.to_xy([selected_target[0], selected_target[1]]) + theta = self.compute_theta(selected_x, selected_y, point[0], point[1]) + + # if debug and self.is_valid([selected_x, selected_y, theta]): + # import matplotlib.pyplot as plt + + # obstacles, explored = self.voxel_map.get_2d_map() + # plt.scatter(ys, xs, s = 1) + # plt.scatter(selected_target[1], selected_target[0], s = 10) + # plt.scatter(target_y, target_x, s = 10) + # plt.imshow(obstacles) + target_is_valid = self.is_valid([selected_x, selected_y, theta]) + # print('Target:', [selected_x, selected_y, theta]) + # print('Target is valid:', target_is_valid) + if not target_is_valid: + continue + if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) < 0.35: + continue + elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: + print('OBSTACLE AVOIDANCE') + print(selected_target[0].int(), selected_target[1].int()) + i = (point[0] - selected_target[0]) // abs(point[0] - selected_target[0]) + j = (point[1] - selected_target[1]) // abs(point[1] - selected_target[1]) + index_i = int(selected_target[0].int() + i) + index_j = int(selected_target[1].int() + j) + if obstacles[index_i][index_j]: + target_is_valid = False + # elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: + # for i in [-1, 0, 1]: + # for j in [-1, 0, 1]: + # if obstacles[selected_target[0] + i][selected_target[1] + j]: + # target_is_valid = False + if not target_is_valid: + continue + + return np.array([selected_x, selected_y, theta]) + return None + + def sample_near_mask( + self, + mask: torch.Tensor, + radius_m: float = 0.6, + max_tries: int = 1000, + verbose: bool = False, + debug: bool = False, + look_at_any_point: bool = False, + ) -> Optional[np.ndarray]: + """Sample a position near the mask and return. 
+ + Args: + look_at_any_point(bool): robot should look at the closest point on target mask instead of average pt + """ + + obstacles, explored = self.voxel_map.get_2d_map() + + # Extract edges from our explored mask + + # Radius computed from voxel map measurements + radius = np.ceil(radius_m / self.voxel_map.grid_resolution) + expanded_mask = expand_mask(mask, radius) + + # TODO: was this: + # expanded_mask = expanded_mask & less_explored & ~obstacles + expanded_mask = expanded_mask & explored & ~obstacles + # print(torch.where(explored & ~obstacles)) + # print(torch.where(expanded_mask)) + + if debug: + import matplotlib.pyplot as plt + + plt.imshow( + mask.int() * 20 + + expanded_mask.int() * 10 + + explored.int() + + obstacles.int() * 5 + ) + # import datetime + # current_datetime = datetime.datetime.now() + # formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + # plt.savefig('debug_' + formatted_datetime + '.png') + + + # Where can the robot go? + valid_indices = torch.nonzero(expanded_mask, as_tuple=False) + if valid_indices.size(0) == 0: + print("[VOXEL MAP: sampling] No valid goals near mask!") + return None + if not look_at_any_point: + mask_indices = torch.nonzero(mask, as_tuple=False) + outside_point = mask_indices.float().mean(dim=0) + + # maximum number of tries + for i in range(max_tries): + random_index = torch.randint(valid_indices.size(0), (1,)) + point_grid_coords = valid_indices[random_index] + + if look_at_any_point: + outside_point = find_closest_point_on_mask( + mask, point_grid_coords.float() + ) + + # convert back + point = self.voxel_map.grid_coords_to_xy(point_grid_coords) + if point is None: + print("[VOXEL MAP: sampling] ERR:", point, point_grid_coords) + continue + if outside_point is None: + print( + "[VOXEL MAP: sampling] ERR finding closest pt:", + point, + point_grid_coords, + "closest =", + outside_point, + ) + continue + theta = math.atan2( + outside_point[1] - point_grid_coords[0, 1], + outside_point[0] - point_grid_coords[0, 0], + ) + + # Ensure angle is in 0 to 2 * PI + if theta < 0: + theta += 2 * np.pi + + xyt = torch.zeros(3) + xyt[:2] = point + xyt[2] = theta + + # Check to see if this point is valid + if verbose: + print("[VOXEL MAP: sampling]", radius, i, "sampled", xyt) + if self.is_valid(xyt, verbose=verbose): + yield xyt + + # We failed to find anything useful + return None + + def has_zero_contour(self, phi): + """ + Check if a zero contour exists in the given phi array. + + Parameters: + - phi: 2D NumPy array with boolean values. + + Returns: + - True if a zero contour exists, False otherwise. 
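# The goal sampler above looks for standing cells that are near the target mask, already
# explored, and obstacle-free. A toy sketch of that candidate selection, assuming the
# expand_mask helper imported at the top of this file grows a boolean mask by a radius
# given in grid cells:
import math
import torch

from stretch.utils.morphology import expand_mask

H = W = 30
mask = torch.zeros(H, W, dtype=torch.bool)
mask[15, 15] = True                    # the target object
explored = torch.ones(H, W, dtype=torch.bool)
obstacles = torch.zeros(H, W, dtype=torch.bool)
obstacles[14:17, 14:17] = True         # the object itself is not standable

radius = math.ceil(0.6 / 0.1)          # 0.6 m at 0.1 m grid resolution
candidates = expand_mask(mask, radius) & explored & ~obstacles

valid = torch.nonzero(candidates, as_tuple=False)
pick = valid[torch.randint(valid.size(0), (1,))]
print(valid.size(0), "candidate cells; sampled grid cell", pick)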
+ """ + # Check if there are True and False values in the array + has_true_values = np.any(phi) + has_false_values = np.any(~phi) + + # Return True if both True and False values are present + return has_true_values and has_false_values + + def _get_kernel(self, size: int): + """Return a kernel for expanding/shrinking areas.""" + if size <= 0: + return None + if size not in self._kernels: + kernel = torch.nn.Parameter( + torch.from_numpy(skimage.morphology.disk(size)) + .unsqueeze(0) + .unsqueeze(0) + .float(), + requires_grad=False, + ) + self._kernels[size] = kernel + return self._kernels[size] + + def get_frontier( + self, expand_size: int = 5, debug: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute frontier regions of the map""" + + obstacles, explored = self.voxel_map.get_2d_map() + # Extract edges from our explored mask + # if self.dilate_obstacles_kernel is not None: + # obstacles = binary_dilation( + # obstacles.float().unsqueeze(0).unsqueeze(0), self.dilate_obstacles_kernel + # )[0, 0].bool() + if self.dilate_explored_kernel is not None: + less_explored = binary_erosion( + explored.float().unsqueeze(0).unsqueeze(0), self.dilate_explored_kernel + )[0, 0] + else: + less_explored = explored + + # Get the masks from our 3d map + edges = get_edges(less_explored) + + # Do not explore obstacles any more + traversible = explored & ~obstacles + frontier_edges = edges & ~obstacles + + kernel = self._get_kernel(expand_size) + if kernel is not None: + expanded_frontier = binary_dilation( + frontier_edges.float().unsqueeze(0).unsqueeze(0), + kernel, + )[0, 0].bool() + else: + # This is a bad idea, planning will probably fail + expanded_frontier = frontier_edges + + outside_frontier = expanded_frontier & ~explored + frontier = expanded_frontier & ~obstacles & explored + + if debug: + import matplotlib.pyplot as plt + + plt.subplot(321) + plt.imshow(obstacles.cpu().numpy()) + plt.title("obstacles") + plt.subplot(322) + plt.imshow(explored.bool().cpu().numpy()) + plt.title("explored") + plt.subplot(323) + plt.imshow((traversible & frontier).cpu().numpy()) + plt.title("traversible & frontier") + plt.subplot(324) + plt.imshow((expanded_frontier).cpu().numpy()) + plt.title("just frontiers") + plt.subplot(325) + plt.imshow((edges).cpu().numpy()) + plt.title("edges") + plt.subplot(326) + plt.imshow((frontier_edges).cpu().numpy()) + plt.title("frontier_edges") + # plt.show() + + return frontier, outside_frontier, traversible + + def sample_exploration( + self, + xyt, + planner, + voxel_map_localizer = None, + text = None, + debug = False, + ): + obstacles, explored, history_soft = self.voxel_map.get_2d_map(return_history_id = True) + if len(xyt) == 3: + xyt = xyt[:2] + reachable_points = planner.get_reachable_points(planner.to_pt(xyt)) + reachable_xs, reachable_ys = zip(*reachable_points) + reachable_xs = torch.tensor(reachable_xs) + reachable_ys = torch.tensor(reachable_ys) + + reachable_map = torch.zeros_like(obstacles) + reachable_map[reachable_xs, reachable_ys] = 1 + reachable_map = reachable_map.to(torch.bool) + edges = get_edges(reachable_map) + # kernel = self._get_kernel(expand_size) + kernel = None + if kernel is not None: + expanded_frontier = binary_dilation( + edges.float().unsqueeze(0).unsqueeze(0), + kernel, + )[0, 0].bool() + else: + expanded_frontier = edges + outside_frontier = expanded_frontier & ~reachable_map + time_heuristics = self._time_heuristic(history_soft, outside_frontier, debug = debug) + if voxel_map_localizer is not None: + 
alignments_heuristics = self.voxel_map.get_2d_alignment_heuristics(voxel_map_localizer, text) + alignments_heuristics = self._alignment_heuristic(alignments_heuristics, outside_frontier, debug = debug) + total_heuristics = time_heuristics + alignments_heuristics + else: + alignments_heuristics = None + total_heuristics = time_heuristics + + rounded_heuristics = np.ceil(total_heuristics * 100) / 100 + max_heuristic = rounded_heuristics.max() + indices = np.column_stack(np.where(rounded_heuristics == max_heuristic)) + closest_index = np.argmin(np.linalg.norm(indices - np.asarray(planner.to_pt([0, 0, 0])), axis = -1)) + index = indices[closest_index] + # index = np.unravel_index(np.argmax(total_heuristics), total_heuristics.shape) + # debug = True + if debug: + from matplotlib import pyplot as plt + plt.subplot(221) + plt.imshow(obstacles.int() * 5 + outside_frontier.int() * 10) + plt.subplot(222) + plt.imshow(explored.int() * 5) + plt.subplot(223) + plt.imshow(total_heuristics) + plt.scatter(index[1], index[0], s = 15, c = 'g') + plt.show() + return index, time_heuristics, alignments_heuristics, total_heuristics + + def _alignment_heuristic(self, alignments, outside_frontier, alignment_smooth = 100, alignment_threshold = 0.13, debug = False): + alignments = np.ma.masked_array(alignments, ~outside_frontier) + alignment_heuristics = 1 / (1 + np.exp(-alignment_smooth * (alignments - alignment_threshold))) + index = np.unravel_index(np.argmax(alignment_heuristics), alignments.shape) + if debug: + plt.clf() + plt.title('alignment') + plt.imshow(alignment_heuristics) + plt.scatter(index[1], index[0], s = 15, c = 'g') + plt.show() + return alignment_heuristics + + def _time_heuristic(self, history_soft, outside_frontier, time_smooth = 0.1, time_threshold = 24, debug = False): + history_soft = np.ma.masked_array(history_soft, ~outside_frontier) + time_heuristics = history_soft.max() - history_soft + time_heuristics[history_soft < 1] = float('inf') + time_heuristics = 1 / (1 + np.exp(-time_smooth * (time_heuristics - time_threshold))) + index = np.unravel_index(np.argmax(time_heuristics), history_soft.shape) + # return index + if debug: + plt.clf() + plt.title('time') + plt.imshow(time_heuristics) + plt.scatter(index[1], index[0], s = 15, c = 'r') + plt.show() + return time_heuristics + + def sample_closest_frontier( + self, + xyt: np.ndarray, + max_tries: int = 1000, + expand_size: int = 5, + debug: bool = False, + verbose: bool = False, + step_dist: float = 0.1, + min_dist: float = 0.1, + ) -> Optional[torch.Tensor]: + """Sample a valid location on the current frontier using FMM planner to compute geodesic distance. Returns points in order until it finds one that's valid. + + Args: + xyt(np.ndrray): [x, y, theta] of the agent; must be of size 2 or 3. 
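# Both exploration heuristics above are logistic squashes over masked frontier cells:
# staleness (how long since a cell was last observed) and, optionally, feature alignment
# with the query text. A standalone sketch of the time term:
import numpy as np

def time_heuristic(history_soft, frontier_mask, smooth=0.1, threshold=24):
    h = np.ma.masked_array(history_soft, ~frontier_mask)
    age = h.max() - h                      # older observations -> larger age
    age[h < 1] = np.inf                    # never-observed cells are maximally stale
    return 1.0 / (1.0 + np.exp(-smooth * (age - threshold)))

history = np.array([[30.0, 5.0], [1.0, 0.0]])
frontier = np.array([[True, True], [True, False]])
print(time_heuristic(history, frontier).round(3))
# stale or never-observed cells score higher; the most recently seen cell scores lowest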
+ max_tries(int): number of attempts to make for rejection sampling + debug(bool): show visualizations of frontiers + step_dist(float): how far apart in geo dist these points should be + """ + assert ( + len(xyt) == 2 or len(xyt) == 3 + ), f"xyt must be of size 2 or 3 instead of {len(xyt)}" + + frontier, outside_frontier, traversible = self.get_frontier( + expand_size=expand_size, debug=debug + ) + + # from scipy.ndimage.morphology import distance_transform_edt + m = np.ones_like(traversible) + start_x, start_y = self.voxel_map.xy_to_grid_coords(xyt[:2]).int().cpu().numpy() + if verbose or debug: + print("--- Coordinates ---") + print(f"{xyt=}") + print(f"{start_x=}, {start_y=}") + + m[start_x, start_y] = 0 + m = np.ma.masked_array(m, ~traversible) + + if not self.has_zero_contour(m): + if verbose: + print("traversible frontier had zero contour! no where to go.") + return None + + distance_map = skfmm.distance(m, dx=1) + frontier_map = distance_map.copy() + # Masks are the areas we are ignoring - ignore everything but the frontiers + frontier_map.mask = np.bitwise_or(frontier_map.mask, ~frontier.cpu().numpy()) + + # Get distances of frontier points + distances = frontier_map.compressed() + xs, ys = np.where(~frontier_map.mask) + + if verbose or debug: + print(f"-> found {len(distances)} items") + + assert len(xs) == len(ys) and len(xs) == len(distances) + tries = 1 + prev_dist = -1 * float("Inf") + for x, y, dist in sorted(zip(xs, ys, distances), key=lambda x: x[2]): + if dist < min_dist: + continue + + # Don't explore too close to where we are + if dist < prev_dist + step_dist: + continue + prev_dist = dist + + point_grid_coords = torch.FloatTensor([[x, y]]) + outside_point = find_closest_point_on_mask( + outside_frontier, point_grid_coords + ) + + if outside_point is None: + print( + "[VOXEL MAP: sampling] ERR finding closest pt:", + point_grid_coords, + "closest =", + outside_point, + ) + continue + + yield self.voxel_map.grid_coords_to_xy(outside_point) + # # convert back to real-world coordinates + # point = self.voxel_map.grid_coords_to_xy(point_grid_coords) + # if point is None: + # print("[VOXEL MAP: sampling] ERR:", point, point_grid_coords) + # continue + + # theta = math.atan2( + # outside_point[1] - point_grid_coords[0, 1], + # outside_point[0] - point_grid_coords[0, 0], + # ) + # if debug: + # print(f"{dist=}, {x=}, {y=}, {theta=}") + + # # Ensure angle is in 0 to 2 * PI + # if theta < 0: + # theta += 2 * np.pi + + # xyt = torch.zeros(3) + # xyt[:2] = point + # xyt[2] = theta + + # # Check to see if this point is valid + # if verbose: + # print("[VOXEL MAP: sampling] sampled", xyt) + # if self.is_valid(xyt, debug=debug): + # yield xyt + + # tries += 1 + # if tries > max_tries: + # break + yield None + + def show( + self, + orig: Optional[np.ndarray] = None, + norm: float = 255.0, + backend: str = "open3d", + ): + """Tool for debugging map representations that we have created""" + geoms = self.voxel_map._get_open3d_geometries(orig, norm) + + # lazily import open3d - it's a tough dependency + import open3d + + # Show the geometries of where we have explored + open3d.visualization.draw_geometries(geoms) diff --git a/src/stretch/dynav/mapping_utils/voxelized_pcd.py b/src/stretch/dynav/mapping_utils/voxelized_pcd.py new file mode 100644 index 000000000..09d3a4196 --- /dev/null +++ b/src/stretch/dynav/mapping_utils/voxelized_pcd.py @@ -0,0 +1,605 @@ +""" + This file implements VoxelizedPointcloud module in home-robot project (https://github.com/facebookresearch/home-robot). 
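# sample_closest_frontier ranks frontier cells by geodesic (not straight-line) distance
# from the robot, computed with the fast marching method over traversible space. A
# standalone sketch of that skfmm pattern:
import numpy as np
import skfmm

traversible = np.ones((20, 20), dtype=bool)
traversible[0:15, 10] = False            # a wall across most of column 10

phi = np.ones((20, 20))
phi[2, 2] = 0                            # the robot cell is the zero contour to march from
phi = np.ma.masked_array(phi, ~traversible)

dist = skfmm.distance(phi, dx=1)
print(dist[2, 2], dist[2, 18])           # 0 at the robot; (2, 18) is much farther than the
                                         # straight-line 16 cells because the path detours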
+ Adapted to be used in ok-robot's navigation voxel map: + https://github.com/facebookresearch/home-robot/blob/main/src/home_robot/home_robot/utils/voxel.py + + License: + + MIT License + + Copyright (c) Meta Platforms, Inc. and affiliates. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + +""" + +from typing import List, Optional, Tuple, Union + +import cv2 +import numpy as np +import torch +from torch import Tensor +USE_TORCH_GEOMETRIC = True +if USE_TORCH_GEOMETRIC: + from torch_geometric.nn.pool.consecutive import consecutive_cluster + from torch_geometric.nn.pool.voxel_grid import voxel_grid + from torch_geometric.utils import scatter +else: + from stretch.utils.torch_geometric import consecutive_cluster, voxel_grid + from stretch.utils.torch_scatter import scatter + + +def project_points(points_3d, K, pose): + if not isinstance(K, torch.Tensor): + K = torch.Tensor(K) + K = K.to(points_3d) + if not isinstance(pose, torch.Tensor): + pose = torch.Tensor(pose) + pose = pose.to(points_3d) + # Convert points to homogeneous coordinates + points_3d_homogeneous = torch.hstack((points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d))) + + # Transform points into camera coordinate system + points_camera_homogeneous = torch.matmul(torch.linalg.inv(pose), points_3d_homogeneous.T).T + points_camera_homogeneous = points_camera_homogeneous[:, :3] + + # Project points into image plane + points_2d_homogeneous = torch.matmul(K, points_camera_homogeneous.T).T + points_2d = points_2d_homogeneous[:, :2] / points_2d_homogeneous[:, 2:] + + return points_2d + + +def get_depth_values(points_3d, pose): + # Convert points to homogeneous coordinates + if not isinstance(pose, torch.Tensor): + pose = torch.Tensor(pose) + pose = pose.to(points_3d) + points_3d_homogeneous = torch.hstack((points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d))) + + # Transform points into camera coordinate system + points_camera_homogeneous = torch.matmul(torch.linalg.inv(pose), points_3d_homogeneous.T).T + + # Extract depth values (z-coordinates) + depth_values = points_camera_homogeneous[:, 2] + + return depth_values + + +class VoxelizedPointcloud: + _INTERNAL_TENSORS = [ + "_points", + "_features", + "_weights", + "_rgb", + "dim_mins", + "dim_maxs", + "_mins", + "_maxs", + ] + + _INIT_ARGS = ["voxel_size", "dim_mins", "dim_maxs", "feature_pool_method"] + + def __init__( + self, + voxel_size: float = 0.05, + dim_mins: Optional[Tensor] = None, + dim_maxs: Optional[Tensor] = None, + feature_pool_method: str = "mean", + ): + """ + + 
Args: + voxel_size (Tensor): float, voxel size in each dim + dim_mins (Tensor): 3, tensor of minimum coords possible in voxel grid + dim_maxs (Tensor): 3, tensor of maximum coords possible in voxel grid + feature_pool_method (str, optional): How to pool features within a voxel. One of 'mean', 'max', 'sum'. Defaults to 'mean'. + """ + + assert (dim_mins is None) == (dim_maxs is None) + self.dim_mins = dim_mins + self.dim_maxs = dim_maxs + self.voxel_size = voxel_size + self.feature_pool_method = feature_pool_method + assert self.feature_pool_method in [ + "mean", + "max", + "sum", + ], f"Unknown feature pool method {feature_pool_method}" + + self.reset() + + def reset(self): + """Resets internal tensors""" + self._points, self._features, self._weights, self._rgb = None, None, None, None + self._obs_counts = None + self._entity_ids = None + self._mins = self.dim_mins + self._maxs = self.dim_maxs + self.obs_count = 1 + + def clear_points(self, depth, intrinsics, pose, depth_is_valid = None): + if self._points is not None: + xys = project_points(self._points.detach().cpu(), intrinsics, pose).int() + xys = xys[:, [1, 0]] + proj_depth = get_depth_values(self._points.detach().cpu(), pose) + H, W = depth.shape + + # Some points are projected to (i, j) on image plane and i, j might be smaller than 0 or greater than image size + # which will lead to Index Error. + valid_xys = xys.clone() + valid_xys[torch.any(torch.stack([ + xys[:, 0] < 0, + xys[:, 0] >= H, + xys[:, 1] < 0, + xys[:, 1] >= W, + ], dim = 0), dim = 0)] = 0 + indices = torch.any( + torch.stack( + [ + # the points are projected outside image frame + xys[:, 0] < 0, xys[:, 0] >= H, + xys[:, 1] < 0, xys[:, 1] >= W, + # the points are projected to the image frame but is blocked by some obstacles + depth[valid_xys[:, 0], valid_xys[:, 1]] < (proj_depth - 0.1), + # the points are projected to the image frame but they are behind camera + depth[valid_xys[:, 0], valid_xys[:, 1]] < 0.01, + proj_depth < 0.01, + # depth is too large + # (~depth_is_valid)[valid_xys[:, 0], valid_xys[:, 1]], + proj_depth > 2.0 + ], + dim = 0 + ), + dim = 0) + + if self._entity_ids is not None: + removed_entities, removed_counts = torch.unique(self._entity_ids.detach().cpu()[~indices], return_counts = True) + total_counts = torch.bincount(self._entity_ids.detach().cpu()) + entities_to_be_removed = removed_entities[(removed_counts > total_counts[removed_entities] * 0.75) | (total_counts[removed_entities] - removed_counts < 5)] + indices = indices & ~torch.isin(self._entity_ids.detach().cpu(), entities_to_be_removed) + + # print('Clearing non valid points...') + # print('Removing ' + str((~indices).sum().item()) + ' points.') + indices = indices.to(self._points.device) + self._points = self._points[indices] + if self._features is not None: + self._features = self._features[indices] + if self._weights is not None: + self._weights= self._weights[indices] + if self._rgb is not None: + self._rgb = self._rgb[indices] + if self._obs_counts is not None: + self._obs_counts = self._obs_counts[indices] + if self._entity_ids is not None: + self._entity_ids = self._entity_ids[indices] + + def add( + self, + points: Tensor, + features: Optional[Tensor], + rgb: Optional[Tensor], + weights: Optional[Tensor] = None, + obs_count: Optional[int] = None, + entity_id: Optional[int] = None + ): + """Add a feature pointcloud to the voxel grid. + + Args: + points (Tensor): N x 3 points to add to the voxel grid + features (Tensor): N x D features associated with each point. 
+ Reduction method can be set with feature_reduciton_method in init + rgb (Tensor): N x 3 colors s associated with each point. + weights (Optional[Tensor], optional): Weights for each point. + Can be detection confidence, distance to camera, etc. + Defaults to None. + """ + + if weights is None: + weights = torch.ones_like(points[..., 0]) + + if obs_count is None: + obs_count = torch.ones_like(weights) * self.obs_count + else: + obs_count = torch.ones_like(weights) * obs_count + if entity_id is None: + entity_id = torch.ones_like(weights) * self.obs_count + else: + obs_count = torch.ones_like(weights) * entity_id + self.obs_count += 1 + + # Update voxel grid bounds + # This isn't strictly necessary since the functions below can infer the bounds + # But we might want to do this anyway to enforce that bounds are a multiple of self.voxel_size + # And to enforce that the added points are within user-defined boundaries, if those were specified. + pos_mins, _ = points.min(dim=0) + pos_maxs, _ = points.max(dim=0) + if self.dim_mins is not None: + assert torch.all( + self.dim_mins <= pos_mins + ), "Got points outside of user-defined 3D bounds" + if self.dim_maxs is not None: + assert torch.all( + pos_maxs <= self.dim_maxs + ), "Got points outside of user-defined 3D bounds" + + if self._mins is None: + self._mins, self._maxs = pos_mins, pos_maxs + # recompute_voxels = True + else: + assert ( + self._maxs is not None + ), "How did self._mins get set without self._maxs?" + # recompute_voxels = torch.any(pos_mins < self._mins) or torch.any(self._maxs < pos_maxs) + self._mins = torch.min(self._mins, pos_mins) + self._maxs = torch.max(self._maxs, pos_maxs) + + if self._points is None: + assert ( + self._features is None + ), "How did self._points get unset while _features is set?" + # assert self._rgbs is None, "How did self._points get unset while _rgbs is set?" + assert ( + self._weights is None + ), "How did self._points get unset while _weights is set?" + all_points, all_features, all_weights, all_rgb = ( + points, + features, + weights, + rgb, + ) + all_obs_counts = obs_count + all_entity_ids = entity_id + else: + assert (self._features is None) == (features is None) + all_points = torch.cat([self._points, points], dim=0) + all_weights = torch.cat([self._weights, weights], dim=0) + all_features = ( + torch.cat([self._features, features], dim=0) + if (features is not None) + else None + ) + all_rgb = torch.cat([self._rgb, rgb], dim=0) if (rgb is not None) else None + all_obs_counts = torch.cat([self._obs_counts, obs_count], dim=0) + all_entity_ids = torch.cat([self._entity_ids, entity_id], dim=0) + # Future optimization: + # If there are no new voxels, then we could save a bit of compute time + # by only recomputing the voxel/cluster for the new points + # e.g. 
if recompute_voxels: + # raise NotImplementedError + cluster_voxel_idx, cluster_consecutive_idx, _ = voxelize( + all_points, voxel_size=self.voxel_size, start=self._mins, end=self._maxs + ) + self._points, self._features, self._weights, self._rgb, self._obs_counts, self._entity_ids = reduce_pointcloud( + cluster_consecutive_idx, + pos=all_points, + features=all_features, + weights=all_weights, + rgbs=all_rgb, + obs_counts = all_obs_counts, + entity_ids = all_entity_ids, + feature_reduce=self.feature_pool_method, + ) + self._obs_counts, self._entity_ids = self._obs_counts.int(), self._entity_ids.int() + return + + def get_idxs(self, points: Tensor) -> Tensor: + """Returns voxel index (long tensor) for each point in points + + Args: + points (Tensor): N x 3 + + Returns: + cluster_voxel_idx (Tensor): The voxel grid index (long tensor) for each point in points + cluster_consecutive_idx (Tensor): Voxel grid reindexed to be consecutive (packed) + """ + ( + cluster_voxel_idx, + cluster_consecutive_idx, + _, + ) = voxelize(points, self.voxel_size, start=self._mins, end=self._maxs) + return cluster_voxel_idx, cluster_consecutive_idx + + def get_voxel_idx(self, points: Tensor) -> Tensor: + """Returns voxel index (long tensor) for each point in points + + Args: + points (Tensor): N x 3 + + Returns: + Tensor: voxel index (long tensor) for each point in points + """ + ( + cluster_voxel_idx, + _, + ) = self.get_idxs(points) + return cluster_voxel_idx + + def get_consecutive_cluster_idx(self, points: Tensor) -> Tensor: + """Returns voxel index (long tensor) for each point in points + + Args: + points (Tensor): N x 3 + + Returns: + Tensor: voxel index (long tensor) for each point in points + """ + ( + _, + cluster_consecutive_idx, + ) = self.get_idxs(points) + return cluster_consecutive_idx + + def get_pointcloud(self) -> Tuple[Tensor]: + """Returns pointcloud (1 point per occupied voxel) + + Returns: + points (Tensor): N x 3 + features (Tensor): N x D + weights (Tensor): N + """ + return self._points, self._features, self._weights, self._rgb + + def clone(self): + """ + Deep copy of object. All internal tensors are cloned individually. + + Returns: + new VoxelizedPointcloud object. + """ + other = self.__class__(**{k: getattr(self, k) for k in self._INIT_ARGS}) + for k in self._INTERNAL_TENSORS: + v = getattr(self, k) + if torch.is_tensor(v): + setattr(other, k, v.clone()) + return other + + def to(self, device: Union[str, torch.device]): + """ + + Args: + device: Device (as str or torch.device) for the new tensor. + + Returns: + self + """ + other = self.clone() + for k in self._INTERNAL_TENSORS: + v = getattr(self, k) + if torch.is_tensor(v): + setattr(other, k, v.to(device)) + return other + + def cpu(self): + return self.to("cpu") + + def cuda(self): + return self.to("cuda") + + def detach(self): + """ + Detach object. All internal tensors are detached individually. + + Returns: + new VoxelizedPointcloud object. 
+ """ + other = self.__class__({k: getattr(self, k) for k in self._INIT_ARGS}) + for k in self._INTERNAL_TENSORS: + v = getattr(self, k) + if torch.is_tensor(v): + setattr(other, k, v.detach()) + return other + + +def voxelize( + pos: Tensor, + voxel_size: float, + batch: Optional[Tensor] = None, + start: Optional[Union[float, Tensor]] = None, + end: Optional[Union[float, Tensor]] = None, +) -> Tuple[Tensor]: + """Returns voxel indices and packed (consecutive) indices for points + + Args: + pos (Tensor): [N, 3] locations + voxel_size (float): Size (resolution) of each voxel in the grid + batch (Optional[Tensor], optional): Batch index of each point in pos. Defaults to None. + start (Optional[Union[float, Tensor]], optional): Mins along each coordinate for the voxel grid. + Defaults to None, in which case the starts are inferred from min values in pos. + end (Optional[Union[float, Tensor]], optional): Maxes along each coordinate for the voxel grid. + Defaults to None, in which case the starts are inferred from max values in pos. + Returns: + voxel_idx (LongTensor): Idx of each point's voxel coordinate. E.g. [0, 0, 4, 3, 3, 4] + cluster_consecutive_idx (LongTensor): Packed idx -- contiguous in cluster ID. E.g. [0, 0, 2, 1, 1, 2] + batch_sample: See https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/pool/max_pool.html + """ + voxel_cluster = voxel_grid( + pos=pos, batch=batch, size=voxel_size, start=start, end=end + ) + cluster_consecutive_idx, perm = consecutive_cluster(voxel_cluster) + batch_sample = batch[perm] if batch is not None else None + cluster_idx = voxel_cluster + return cluster_idx, cluster_consecutive_idx, batch_sample + + +def scatter_weighted_mean( + features: Tensor, + weights: Tensor, + cluster: Tensor, + weights_cluster: Tensor, + dim: int, +) -> Tensor: + """_summary_ + + Args: + features (Tensor): [N, D] features at each point + weights (Optional[Tensor], optional): [N,] weights of each point. Defaults to None. + cluster (LongTensor): [N] IDs of each point (clusters.max() should be <= N, or you'll OOM) + weights_cluster (Tensor): [N,] aggregated weights of each cluster, used to normalize + dim (int): Dimension along which to do the reduction -- should be 0 + + Returns: + Tensor: Agggregated features, weighted by weights and normalized by weights_cluster + """ + assert dim == 0, "Dim != 0 not yet implemented" + feature_cluster = scatter( + features * weights[:, None], cluster, dim=dim, reduce="sum" + ) + feature_cluster = feature_cluster / weights_cluster[:, None] + return feature_cluster + + +def reduce_pointcloud( + voxel_cluster: Tensor, + pos: Tensor, + features: Tensor, + weights: Optional[Tensor] = None, + rgbs: Optional[Tensor] = None, + obs_counts: Optional[Tensor] = None, + entity_ids: Optional[Tensor] = None, + feature_reduce: str = "mean", +) -> Tuple[Tensor]: + """Pools values within each voxel + + Args: + voxel_cluster (LongTensor): [N] IDs of each point + pos (Tensor): [N, 3] position of each point + features (Tensor): [N, D] features at each point + weights (Optional[Tensor], optional): [N,] weights of each point. Defaults to None. + rgbs (Optional[Tensor], optional): [N, 3] colors of each point. Defaults to None. + feature_reduce (str, optional): Feature reduction method. Defaults to 'mean'. 
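+        obs_counts (Optional[Tensor], optional): [N,] observation count of each point; pooled with "max" within each voxel. Defaults to None.
+        entity_ids (Optional[Tensor], optional): [N,] entity/instance id of each point; pooled with "max" within each voxel. Defaults to None.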
+ + Raises: + NotImplementedError: if unknown reduction method + + Returns: + pos_cluster (Tensor): weighted average position within each voxel + feature_cluster (Tensor): aggregated feature of each voxel + weights_cluster (Tensor): aggregated weights of each voxel + rgb_cluster (Tensor): colors of each voxel + """ + if weights is None: + weights = torch.ones_like(pos[..., 0]) + weights_cluster = scatter(weights, voxel_cluster, dim=0, reduce="sum") + + pos_cluster = scatter_weighted_mean( + pos, weights, voxel_cluster, weights_cluster, dim=0 + ) + + if rgbs is not None: + rgb_cluster = scatter_weighted_mean( + rgbs, weights, voxel_cluster, weights_cluster, dim=0 + ) + else: + rgb_cluster = None + + if obs_counts is not None: + obs_count_cluster = scatter(obs_counts, voxel_cluster, dim = 0, reduce = "max") + else: + obs_count_cluster = None + + if entity_ids is not None: + entity_ids_cluster = scatter(entity_ids, voxel_cluster, dim = 0, reduce = "max") + else: + entity_ids_cluster = None + + if features is None: + return pos_cluster, None, weights_cluster, rgb_cluster, obs_count_cluster, entity_ids_cluster + + if feature_reduce == "mean": + feature_cluster = scatter_weighted_mean( + features, weights, voxel_cluster, weights_cluster, dim=0 + ) + elif feature_reduce == "max": + feature_cluster = scatter(features, voxel_cluster, dim=0, reduce="max") + elif feature_reduce == "sum": + feature_cluster = scatter( + features * weights[:, None], voxel_cluster, dim=0, reduce="sum" + ) + else: + raise NotImplementedError(f"Unknown feature reduction method {feature_reduce}") + + return pos_cluster, feature_cluster, weights_cluster, rgb_cluster, obs_count_cluster, entity_ids_cluster + + +def scatter3d( + voxel_indices: Tensor, weights: Tensor, grid_dimensions: List[int], method: str = "mean", +) -> Tensor: + """Scatter weights into a 3d voxel grid of the appropriate size. + + Args: + voxel_indices (LongTensor): [N, 3] indices to scatter values to. + weights (FloatTensor): [N] values of equal size to scatter through voxel map. + grid_dimenstions (List[int]): sizes of the resulting voxel map, should be 3d. 
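+        method (str, optional): reduction used when several weights fall into the same voxel ("mean", "sum", "max", ...). Defaults to "mean".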
+ + Returns: + voxels (FloatTensor): [grid_dimensions] voxel map containing combined weights.""" + + assert voxel_indices.shape[0] == weights.shape[0], "weights and indices must match" + assert len(grid_dimensions) == 3, "this is designed to work only in 3d" + assert voxel_indices.shape[-1] == 3, "3d points expected for indices" + + # Calculate a unique index for each voxel in the 3D grid + unique_voxel_indices = ( + voxel_indices[:, 0] * (grid_dimensions[1] * grid_dimensions[2]) + + voxel_indices[:, 1] * grid_dimensions[2] + + voxel_indices[:, 2] + ) + + # Use scatter to accumulate weights into voxels + voxel_weights = scatter( + weights, + unique_voxel_indices, + dim=0, + reduce=method, + dim_size=grid_dimensions[0] * grid_dimensions[1] * grid_dimensions[2], + ) + return voxel_weights.reshape(*grid_dimensions) + + +def drop_smallest_weight_points( + points: Tensor, + voxel_size: float = 0.01, + drop_prop: float = 0.1, + min_points_after_drop: int = 3, +): + voxel_pcd = VoxelizedPointcloud( + voxel_size=voxel_size, + dim_mins=None, + dim_maxs=None, + feature_pool_method="mean", + ) + voxel_pcd.add( + points=points, + features=None, # instance.point_cloud_features, + rgb=None, # instance.point_cloud_rgb, + ) + orig_points = points + points = voxel_pcd._points + weights = voxel_pcd._weights + assert len(points) > 0, points.shape + weights_sorted, sort_idxs = torch.sort(weights, dim=0) + points_sorted = points[sort_idxs] + weights_cumsum = torch.cumsum(weights_sorted, dim=0) + above_cutoff = weights_cumsum >= (drop_prop * weights_cumsum[-1]) + cutoff_idx = int(above_cutoff.max(dim=0).indices) + if len(points_sorted[cutoff_idx:]) < min_points_after_drop: + return orig_points + # print(f"Reduced {len(orig_points)} -> {len(points)} -> {above_cutoff.sum()}") + return points_sorted[cutoff_idx:] \ No newline at end of file diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index fe5bd395a..669c8a338 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -1,5 +1,4 @@ import numpy as np -# import PyKDL import sys import os @@ -225,7 +224,6 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode """ Function to move the gripper to a desired translation and rotation """ - import PyKDL translation = [translation_tensor[0], translation_tensor[1], translation_tensor[2]] rotation = rotational_tensor diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index 193b2fd30..7ab3093d3 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -122,7 +122,6 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # print(f"pin rotation{pin_rotation}") # Relative rotation and translation of grasping point relative to camera - # dest_frame = PyKDL.Frame(rotation, point) pin_dest_frame = pin.SE3(np.array(pin_rotation), np.array(pin_point)) # print(f"pin dest frame {pin_dest_frame}") diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 1bc8bf2c9..562a49663 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -10,10 +10,8 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple -import clip import numpy as np import torch -from loguru import logger from stretch.core.parameters import Parameters, get_parameters from 
stretch.agent import RobotClient @@ -27,6 +25,7 @@ from stretch.dynav.ok_robot_hw.camera import RealSenseCamera from stretch.dynav.ok_robot_hw.utils.grasper_utils import pickup, move_to_point, capture_and_process_image from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array +from stretch.dynav.voxel_map_server import ImageProcessor import cv2 @@ -77,6 +76,7 @@ def __init__( ) self.image_sender = ImageSender(ip = ip, image_port = image_port, text_port = text_port, manip_port = manip_port) + self.image_processor = ImageProcessor(rerun = True, static = False) self.look_around_times = [] self.execute_times = [] @@ -86,15 +86,20 @@ def __init__( # self.head_lock = threading.Lock() def look_around(self): - logger.info("Look around to check") + print("*" * 10, "Look around to check", "*" * 10) for pan in [0.5, -0.5, -1.5]: - for tilt in [-0.55]: + for tilt in [-0.6]: + start_time = time.time() self.robot.head_to(pan, tilt, blocking = True) + end_time = time.time() + print('moving head takes ', end_time - start_time, 'seconds.') self.update() + self.robot.look_front() def rotate_in_place(self): - logger.info("Rotate in place") + print("*" * 10, "Rotate in place", "*" * 10) xyt = self.robot.get_base_pose() + self.robot.head_to(head_pan = 0, head_tilt = -0.6, blocking = True) for i in range(8): xyt[2] += 2 * np.pi / 8 self.robot.navigate_to(xyt, blocking = True) @@ -106,8 +111,11 @@ def update(self): # self.image_sender.send_images(obs) self.obs_history.append(obs) self.obs_count += 1 - self.image_sender.send_images(obs) - + rgb, depth, K, camera_pose = obs.rgb, obs.depth, obs.camera_K, obs.camera_pose + start_time = time.time() + self.image_processor.process_rgbd_images(rgb, depth, K, camera_pose) + end_time = time.time() + print('Image processing takes', end_time - start_time, 'seconds.') def execute_action( self, @@ -122,7 +130,8 @@ def execute_action( start = self.robot.get_base_pose() print(" Start:", start) - res = self.image_sender.query_text(text, start) + # res = self.image_sender.query_text(text, start) + res = self.image_processor.process_text(text, start) look_around_finish = time.time() look_around_take = look_around_finish - start_time @@ -155,7 +164,7 @@ def execute_action( res, pos_err_threshold=self.pos_err_threshold, rot_err_threshold=self.rot_err_threshold, - blocking = True + blocking = False ) execution_finish = time.time() @@ -183,7 +192,7 @@ def run_exploration( return False return True - def navigate(self, text, max_step = 15): + def navigate(self, text, max_step = 10): finished = False step = 0 end_point = None diff --git a/src/stretch/dynav/run_manip.py b/src/stretch/dynav/run_manip.py index afb353cce..8d4f18e79 100644 --- a/src/stretch/dynav/run_manip.py +++ b/src/stretch/dynav/run_manip.py @@ -2,32 +2,15 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
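Aside on the robot_agent_manip_mdp.py hunks above: the agent now folds each head view into the local map (ImageProcessor.process_rgbd_images) instead of streaming frames to the remote ImageSender, and it queries ImageProcessor.process_text for navigation goals. The snippet below is a minimal sketch of that capture-and-update sweep; it reuses the names head_to, look_front, camera_K and process_rgbd_images that appear in this patch, while the observation accessor and the loop itself are illustrative, not the exact implementation.

import time

def look_around_and_update(robot, processor, pans=(0.5, -0.5, -1.5), tilt=-0.6):
    """Sweep the head over a few pan angles and fold every view into the local map."""
    for pan in pans:
        robot.head_to(pan, tilt, blocking=True)  # point the camera (blocking, as in the patch)
        obs = robot.get_observation()            # assumed accessor returning rgb/depth/intrinsics/pose
        t0 = time.time()
        processor.process_rgbd_images(obs.rgb, obs.depth, obs.camera_K, obs.camera_pose)
        print(f"view at pan={pan:+.1f} processed in {time.time() - t0:.2f}s")
    robot.look_front()                           # restore the forward-facing view before driving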
-import datetime -import sys -import time -import timeit -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple import click -import matplotlib.pyplot as plt import numpy as np -import open3d -import torch -from PIL import Image # Mapping and perception from stretch.core.parameters import Parameters, get_parameters from stretch.dynav import RobotAgentMDP - -# Chat and UI tools -from stretch.utils.point_cloud import numpy_to_pcd, show_point_cloud from stretch.agent import RobotClient -import cv2 -import threading - -import os def compute_tilt(camera_xyz, target_xyz): ''' @@ -66,14 +49,13 @@ def main( random_goals(bool): randomly sample frontier goals instead of looking for closest """ click.echo("Will connect to a Stretch robot and collect a short trajectory.") - robot = RobotClient(robot_ip = "127.0.0.1") + robot = RobotClient(robot_ip = "100.79.44.11") print("- Load parameters") parameters = get_parameters("dynav_config.yaml") # print(parameters) if explore_iter >= 0: parameters["exploration_steps"] = explore_iter - object_to_find, location_to_place = None, None print("- Start robot agent with data collection") demo = RobotAgentMDP( diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 18d706b68..3bee44f06 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -2,18 +2,9 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import datetime -import sys -import time -import timeit -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - import click import matplotlib.pyplot as plt import numpy as np -import open3d -import torch from PIL import Image # Mapping and perception @@ -25,9 +16,6 @@ from stretch.agent import RobotClient import cv2 -import threading - -import os def compute_tilt(camera_xyz, target_xyz): ''' @@ -66,7 +54,7 @@ def main( random_goals(bool): randomly sample frontier goals instead of looking for closest """ click.echo("Will connect to a Stretch robot and collect a short trajectory.") - robot = RobotClient(robot_ip = "127.0.0.1") + robot = RobotClient(robot_ip = "100.79.44.11") print("- Load parameters") parameters = get_parameters("dynav_config.yaml") @@ -82,7 +70,10 @@ def main( robot, parameters, ip = ip, re = re ) - demo.rotate_in_place() + if input_path is None: + demo.rotate_in_place() + else: + demo.image_processor.read_from_pickle(input_path) # def keep_looking_around(): # while True: diff --git a/src/stretch/dynav/scannet.py b/src/stretch/dynav/scannet.py new file mode 100644 index 000000000..2fe92aaae --- /dev/null +++ b/src/stretch/dynav/scannet.py @@ -0,0 +1,209 @@ +CLASS_LABELS_200 =( + 'wall', + 'background', + 'object', + 'range hood', + 'door', + 'stove', + 'bathroom stall', + 'divider', + 'clothes dryer', + 'file cabinet', + 'cabinet', + 'bathroom cabinet', + 'blackboard', + 'curtain', + 'blinds', + 'tv', + 'mattress', + 'bathtub', + 'glass', + 'shower', + 'power outlet', + 'water dispenser', + 'bulletin board', + 'printer', + 'windowsill', + 'bookshelf', + 'dresser', + 'storage organizer', + 'fireplace', + 'fire alarm', + 'piano', + 'shelf', + 'kitchen counter', + 'washing machine', + 'stairs', + 'paper cutter', + 'sink', + 'window', + 'refrigerator', + 'counter', + 'closet', + 'bathroom vanity', + 'radiator', + 'vent', + 'kitchen cabinet', + 'water cooler', + 'doorframe', + 'closet door', + 'table', + + 'shower head', + 'spray', + 'inhaler', + 'guitar case', + 'plunger', 
+ 'toilet paper dispenser', + 'adapter', + 'soy sauce', + 'pipe', + 'bottle', + 'scale', + 'paper towel', + 'paper towel roll', + 'scissors', + 'tape', + 'chopsticks', + 'case of water bottles', + 'hand sanitizer', + 'laptop', + 'alcohol disinfection', + 'keyboard', + 'coffee maker', + 'light', + 'toaster', + 'stuffed animal', + 'toilet seat cover dispenser', + 'ironing board', + 'fire extinguisher', + 'fruit', + 'container', + 'bag', + 'oven', + 'body wash', + 'bucket', + 'cd case', + 'tray', + 'bowl', + 'speaker', + 'crate', + 'projector', + 'book', + 'school bag', + 'laundry detergent', + 'clothes', + 'candle', + 'basket', + 'face wash', + 'notebook', + 'purse', + 'trash bin', + 'paper bag', + 'package', + 'disinfecting wipes', + 'recycling bin', + 'headphones', + 'mouse', + 'shower gel', + 'dustpan', + 'cup', + 'vacuum cleaner', + 'dish rack', + 'coffee kettle', + 'plants', + 'rag', + 'can', + 'cushion', + 'monitor', + 'fan', + 'tube', + 'box', + 'ball', + 'bicycle', + 'guitar', + 'trash can', + 'hand sanitizers', + 'paper towel dispenser', + 'whiteboard', + 'bin', + 'potted plant', + 'tennis', + 'soap dish', + 'structure', + 'calendar', + 'dumbbell', + 'fish oil', + 'ottoman', + 'stool', + 'hand wash', + 'lamp', + 'toaster oven', + 'music stand', + 'water bottle', + 'clock', + 'charger', + 'picture', + 'bascketball', + 'microwave', + 'screwdriver', + 'rack', + 'apple', + 'suitcase', + 'ladder', + 'ping pong ball', + 'dishwasher', + 'storage container', + 'toilet paper holder', + 'coat rack', + 'soap dispenser', + 'banana', + 'toilet paper', + 'mug', + 'marker pen', + 'hat', + 'aerosol', + 'luggage', + 'poster', + 'bed', + 'cart', + 'light switch', + 'backpack', + 'power strip', + 'baseball', + 'mustard', + 'water pitcher', + 'couch', + 'beverage', + 'toy', + 'salt', + 'plant', + 'pillow', + 'broom', + 'pepper', + 'muffins', + 'multivitamin', + 'towel', + 'storage bin', + 'nightstand', + 'telephone', + 'tissue box', + 'hair dryer', + 'mirror', + 'sign', + 'plate', + 'tripod', + 'chair', + 'plastic bag', + 'umbrella', + 'paper', + 'laundry hamper', + 'food', + 'jacket', + 'computer tower', + 'keyboard piano', + 'person', + 'machine', + 'projector screen', + 'shoe', +) \ No newline at end of file diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py new file mode 100644 index 000000000..7408e54ed --- /dev/null +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -0,0 +1,233 @@ +import numpy as np + +import cv2 + +import torch +import torch.nn.functional as F +import torchvision.transforms as T +from torchvision.transforms.functional import InterpolationMode + +import clip + +from stretch.dynav.mapping_utils import VoxelizedPointcloud + +from typing import List, Optional, Tuple, Union +from torch import Tensor + +from transformers import AutoProcessor, AutoModel + +from sklearn.cluster import DBSCAN + +# from ultralytics import YOLOWorld +from transformers import Owlv2Processor, Owlv2ForObjectDetection + +import math +from PIL import Image + +def get_inv_intrinsics(intrinsics): + # return intrinsics.double().inverse().to(intrinsics) + fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + inv_intrinsics = torch.zeros_like(intrinsics) + inv_intrinsics[..., 0, 0] = 1.0 / fx + inv_intrinsics[..., 1, 1] = 1.0 / fy + inv_intrinsics[..., 0, 2] = -ppx / fx + inv_intrinsics[..., 1, 2] = -ppy / fy + inv_intrinsics[..., 2, 2] = 1.0 + return inv_intrinsics + +def get_xyz(depth, pose, intrinsics): + 
"""Returns the XYZ coordinates for a set of points. + + Args: + depth: The depth array, with shape (B, 1, H, W) + pose: The pose array, with shape (B, 4, 4) + intrinsics: The intrinsics array, with shape (B, 3, 3) + + Returns: + The XYZ coordinates of the projected points, with shape (B, H, W, 3) + """ + if not isinstance(depth, torch.Tensor): + depth = torch.from_numpy(depth) + if not isinstance(pose, torch.Tensor): + pose = torch.from_numpy(pose) + if not isinstance(intrinsics, torch.Tensor): + intrinsics = torch.from_numpy(intrinsics) + while depth.ndim < 4: + depth = depth.unsqueeze(0) + while pose.ndim < 3: + pose = pose.unsqueeze(0) + while intrinsics.ndim < 3: + intrinsics = intrinsics.unsqueeze(0) + (bsz, _, height, width), device, dtype = depth.shape, depth.device, intrinsics.dtype + + # Gets the pixel grid. + xs, ys = torch.meshgrid( + torch.arange(0, width, device=device, dtype=dtype), + torch.arange(0, height, device=device, dtype=dtype), + indexing="xy", + ) + xy = torch.stack([xs, ys], dim=-1).flatten(0, 1).unsqueeze(0).repeat_interleave(bsz, 0) + xyz = torch.cat((xy, torch.ones_like(xy[..., :1])), dim=-1) + + # Applies intrinsics and extrinsics. + # xyz = xyz @ intrinsics.inverse().transpose(-1, -2) + xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) + xyz = xyz * depth.flatten(1).unsqueeze(-1) + xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] + + xyz = xyz.unflatten(1, (height, width)) + + return xyz + +class VoxelMapLocalizer(): + def __init__(self, voxel_map_wrapper = None, exist_model = True, clip_model = None, processor = None, device = 'cuda', siglip = True): + print('Localizer V3') + self.voxel_map_wrapper = voxel_map_wrapper + self.device = device + # self.clip_model, self.preprocessor = clip.load(model_config, device=device) + self.siglip = siglip + if clip_model is not None and processor is not None: + self.clip_model, self.preprocessor = clip_model, processor + elif not self.siglip: + self.clip_model, self.preprocessor = clip.load("ViT-B/16", device=self.device) + self.clip_model.eval() + else: + self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to(self.device) + self.preprocessor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384") + self.clip_model.eval() + self.voxel_pcd = VoxelizedPointcloud(voxel_size = 0.05).to(self.device) + # self.exist_model = YOLOWorld("yolov8l-worldv2.pt") + self.existence_checking_model = exist_model + if exist_model: + print('WE ARE USING OWLV2!') + self.exist_processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") + self.exist_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(self.device) + else: + print('YOU ARE USING NOTHING!') + + + def add(self, + points: Tensor, + features: Optional[Tensor], + rgb: Optional[Tensor], + weights: Optional[Tensor] = None, + obs_count: Optional[Tensor] = None, + ): + points = points.to(self.device) + if features is not None: + features = features.to(self.device) + if rgb is not None: + rgb = rgb.to(self.device) + if weights is not None: + weights = weights.to(self.device) + self.voxel_pcd.add(points = points, + features = features, + rgb = rgb, + weights = weights, + obs_count = obs_count) + + def calculate_clip_and_st_embeddings_for_queries(self, queries): + if isinstance(queries, str): + queries = [queries] + if self.siglip: + inputs = self.preprocessor(text=queries, padding="max_length", return_tensors="pt") + for input in inputs: + 
inputs[input] = inputs[input].to(self.clip_model.device) + all_clip_tokens = self.clip_model.get_text_features(**inputs) + else: + text = clip.tokenize(queries).to(self.clip_model.device) + all_clip_tokens = self.clip_model.encode_text(text) + # text = clip.tokenize(queries).to(self.device) + # all_clip_tokens = self.clip_model.encode_text(text) + all_clip_tokens = F.normalize(all_clip_tokens, p=2, dim=-1) + return all_clip_tokens + + def find_alignment_over_model(self, queries): + clip_text_tokens = self.calculate_clip_and_st_embeddings_for_queries(queries) + points, features, weights, _ = self.voxel_pcd.get_pointcloud() + if points is None: + return None + features = F.normalize(features, p=2, dim=-1) + point_alignments = clip_text_tokens.float() @ features.float().T + + # print(point_alignments.shape) + return point_alignments + + def find_alignment_for_A(self, A): + points, features, _, _ = self.voxel_pcd.get_pointcloud() + alignments = self.find_alignment_over_model(A).cpu() + return points[alignments.argmax(dim = -1)].detach().cpu() + + def find_obs_id_for_A(self, A): + obs_counts = self.voxel_pcd._obs_counts + alignments = self.find_alignment_over_model(A).cpu() + return obs_counts[alignments.argmax(dim = -1)].detach().cpu() + + def compute_coord(self, text, obs_id, threshold = 0.2): + if obs_id <= 0: + return None + rgb = self.voxel_map_wrapper.observations[obs_id - 1].rgb + rgb = rgb.permute(2, 0, 1).to(torch.uint8) + inputs = self.exist_processor(text=[['a photo of a ' + text]], images=rgb, return_tensors="pt") + for input in inputs: + inputs[input] = inputs[input].to('cuda') + + with torch.no_grad(): + outputs = self.exist_model(**inputs) + + target_sizes = torch.Tensor([rgb.size()[-2:]]).to(self.device) + results = self.exist_processor.image_processor.post_process_object_detection( + outputs, threshold=threshold, target_sizes=target_sizes + )[0] + depth = self.voxel_map_wrapper.observations[obs_id - 1].depth + for idx, (score, bbox) in enumerate(sorted(zip(results['scores'], results['boxes']), key=lambda x: x[0], reverse=True)): + + tl_x, tl_y, br_x, br_y = bbox + w, h = depth.shape + # if w > h: + # tl_x, br_x = tl_x * w / h, br_x * w / h + # else: + # tl_y, br_y = tl_y * h / w, br_y * h / w + tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) + pose = self.voxel_map_wrapper.observations[obs_id - 1].camera_pose + K = self.voxel_map_wrapper.observations[obs_id - 1].camera_K + xyzs = get_xyz(depth, pose, K)[0] + if torch.median(depth[tl_y: br_y, tl_x: br_x].reshape(-1)) < 3: + return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values + # if depth[(tl_y + br_y) // 2, (tl_x + br_x) // 2] < 3.: + # return xyzs[(tl_y + br_y) // 2, (tl_x + br_x) // 2] + return None + + def localize_A(self, A, debug = True, return_debug = False): + points, _, _, _ = self.voxel_pcd.get_pointcloud() + alignments = self.find_alignment_over_model(A).cpu() + point = points[alignments.argmax(dim = -1)].detach().cpu().squeeze() + obs_counts = self.voxel_pcd._obs_counts + image_id = obs_counts[alignments.argmax(dim = -1)].detach().cpu() + debug_text = '' + target_point = None + + res = self.compute_coord(A, image_id) + # res = None + if res is not None: + target_point = res + debug_text += '#### - Obejct is detected in observations . **😃** Directly navigate to it.\n' + else: + # debug_text += '#### - Directly ignore this instance is the target. 
**😞** \n' + if self.siglip: + cosine_similarity_check = alignments.max().item() > 0.14 + else: + cosine_similarity_check = alignments.max().item() > 0.3 + if cosine_similarity_check: + target_point = point + + debug_text += '#### - The point has high cosine similarity. **😃** Directly navigate to it.\n' + else: + debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' + # print('target_point', target_point) + if not debug: + return target_point + elif not return_debug: + return target_point, debug_text + else: + return target_point, debug_text, image_id, point \ No newline at end of file diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py new file mode 100644 index 000000000..2c9e57c20 --- /dev/null +++ b/src/stretch/dynav/voxel_map_server.py @@ -0,0 +1,849 @@ +import zmq + +from stretch.dynav.scannet import CLASS_LABELS_200 + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +import torchvision.transforms.functional as V +# from segment_anything import sam_model_registry, SamPredictor +# from transformers import AutoProcessor, OwlViTForObjectDetection +from ultralytics import YOLO, SAM, YOLOWorld +from sam2.build_sam import build_sam2 +from sam2.sam2_image_predictor import SAM2ImagePredictor + +import clip +from torchvision import transforms + +import os +# import wget +import time + +import open3d as o3d + +from matplotlib import pyplot as plt +import pickle +from pathlib import Path +import wget +from stretch.dynav.voxel_map_localizer import VoxelMapLocalizer +from stretch.core import get_parameters +from stretch.dynav.mapping_utils import ( + SparseVoxelMap, + SparseVoxelMapNavigationSpace, + AStar, + VoxelizedPointcloud +) + +import datetime + +import threading +import scipy +import hydra + +from transformers import AutoProcessor, AutoModel +import rerun as rr + +from io import BytesIO +from PIL import Image + +from stretch.dynav.communication_util import load_socket, send_array, recv_array, send_rgb_img, recv_rgb_img, send_depth_img, recv_depth_img, send_everything, recv_everything + +def get_inv_intrinsics(intrinsics): + # return intrinsics.double().inverse().to(intrinsics) + fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + inv_intrinsics = torch.zeros_like(intrinsics) + inv_intrinsics[..., 0, 0] = 1.0 / fx + inv_intrinsics[..., 1, 1] = 1.0 / fy + inv_intrinsics[..., 0, 2] = -ppx / fx + inv_intrinsics[..., 1, 2] = -ppy / fy + inv_intrinsics[..., 2, 2] = 1.0 + return inv_intrinsics + +def get_xyz(depth, pose, intrinsics): + """Returns the XYZ coordinates for a set of points. + + Args: + depth: The depth array, with shape (B, 1, H, W) + pose: The pose array, with shape (B, 4, 4) + intrinsics: The intrinsics array, with shape (B, 3, 3) + + Returns: + The XYZ coordinates of the projected points, with shape (B, H, W, 3) + """ + if not isinstance(depth, torch.Tensor): + depth = torch.from_numpy(depth) + if not isinstance(pose, torch.Tensor): + pose = torch.from_numpy(pose) + if not isinstance(intrinsics, torch.Tensor): + intrinsics = torch.from_numpy(intrinsics) + while depth.ndim < 4: + depth = depth.unsqueeze(0) + while pose.ndim < 3: + pose = pose.unsqueeze(0) + while intrinsics.ndim < 3: + intrinsics = intrinsics.unsqueeze(0) + (bsz, _, height, width), device, dtype = depth.shape, depth.device, intrinsics.dtype + + # Gets the pixel grid. 
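+    # The lines below build homogeneous pixel coordinates [u, v, 1] for every pixel,
+    # multiply by K^-1 to get camera-frame rays, scale each ray by its depth to get
+    # camera-frame points, then apply the (camera-to-world) pose to obtain world XYZ.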
+ xs, ys = torch.meshgrid( + torch.arange(0, width, device=device, dtype=dtype), + torch.arange(0, height, device=device, dtype=dtype), + indexing="xy", + ) + xy = torch.stack([xs, ys], dim=-1).flatten(0, 1).unsqueeze(0).repeat_interleave(bsz, 0) + xyz = torch.cat((xy, torch.ones_like(xy[..., :1])), dim=-1) + + # Applies intrinsics and extrinsics. + # xyz = xyz @ intrinsics.inverse().transpose(-1, -2) + xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) + xyz = xyz * depth.flatten(1).unsqueeze(-1) + xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] + + xyz = xyz.unflatten(1, (height, width)) + + return xyz + +class ImageProcessor: + def __init__(self, + vision_method = 'mask*lip', + siglip = True, + device = 'cuda', + min_depth = 0.25, + max_depth = 2.5, + img_port = 5555, + text_port = 5556, + open_communication = True, + rerun = True, + static = True + ): + self.static = static + self.siglip = siglip + current_datetime = datetime.datetime.now() + self.log = 'debug_' + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + self.rerun = rerun + if self.rerun: + if self.static: + rr.init(self.log, spawn = False) + rr.connect('100.108.67.79:9876') + else: + rr.init(self.log, spawn = True) + self.min_depth = min_depth + self.max_depth = max_depth + self.obs_count = 0 + # There are three vision methods: + # 1. 'mask*lip' Following the idea of https://arxiv.org/abs/2112.01071, remove the last layer of any VLM and obtain the dense features + # 2. 'mask&*lip' Following the idea of https://mahis.life/clip-fields/, extract segmentation mask and assign a vision-language feature to it + self.vision_method = vision_method + # If cuda is not available, then device will be forced to be cpu + if not torch.cuda.is_available(): + device = 'cpu' + self.device = device + + self.create_obstacle_map() + self.create_vision_model() + + self.voxel_map_lock = threading.Lock() # Create a lock for synchronizing access to `self.voxel_map_localizer` + + if open_communication: + self.img_socket = load_socket(img_port) + self.text_socket = load_socket(text_port) + + self.img_thread = threading.Thread(target=self._recv_image) + self.img_thread.daemon = True + self.img_thread.start() + + def create_obstacle_map(self): + print("- Load parameters") + parameters = get_parameters("dynav_config.yaml") + self.default_expand_frontier_size = parameters["default_expand_frontier_size"] + self.voxel_map = SparseVoxelMap( + resolution=parameters["voxel_size"], + local_radius=parameters["local_radius"], + obs_min_height=parameters["obs_min_height"], + obs_max_height=parameters["obs_max_height"], + obs_min_density = parameters["obs_min_density"], + exp_min_density = parameters["exp_min_density"], + min_depth=self.min_depth, + max_depth=self.max_depth, + pad_obstacles=parameters["pad_obstacles"], + add_local_radius_points=parameters.get( + "add_local_radius_points", default=True + ), + remove_visited_from_obstacles=parameters.get( + "remove_visited_from_obstacles", default=False + ), + smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), + use_median_filter=parameters.get("filters/use_median_filter", False), + median_filter_size=parameters.get("filters/median_filter_size", 5), + median_filter_max_error=parameters.get( + "filters/median_filter_max_error", 0.01 + ), + use_derivative_filter=parameters.get( + "filters/use_derivative_filter", False + ), + derivative_filter_threshold=parameters.get( + "filters/derivative_filter_threshold", 0.5 + ) + ) + self.space = 
SparseVoxelMapNavigationSpace( + self.voxel_map, + # step_size=parameters["step_size"], + rotation_step_size=parameters["rotation_step_size"], + dilate_frontier_size=parameters[ + "dilate_frontier_size" + ], # 0.6 meters back from every edge = 12 * 0.02 = 0.24 + dilate_obstacle_size=parameters["dilate_obstacle_size"], + ) + + # Create a simple motion planner + self.planner = AStar(self.space) + self.value_map = torch.zeros(self.voxel_map.grid_size) + + def create_vision_model(self): + if not self.siglip: + self.clip_model, self.clip_preprocess = clip.load("ViT-B/16", device=self.device) + self.clip_model.eval() + else: + self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to(self.device) + self.clip_preprocess = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384") + self.clip_model.eval() + if self.vision_method != 'mask*lip': + sam_checkpoint = f"./sam2_hiera_small.pt" + sam_config = "sam2_hiera_s.yaml" + if not os.path.exists(sam_checkpoint): + wget.download('https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt', out = sam_checkpoint) + sam2_model = build_sam2(sam_config, sam_checkpoint, device=self.device, apply_postprocessing=False) + self.mask_predictor = SAM2ImagePredictor(sam2_model) + self.yolo_model = YOLOWorld('yolov8s-worldv2.pt') + self.texts = CLASS_LABELS_200 + self.yolo_model.set_classes(self.texts) + self.voxel_map_localizer = VoxelMapLocalizer(self.voxel_map, clip_model = self.clip_model, processor = self.clip_preprocess, device = self.device, siglip = self.siglip) + + def process_text(self, text, start_pose): + if self.rerun: + if not self.static: + rr.set_time_sequence("frame", self.obs_count) + rr.log('/object', rr.Clear(recursive = True), static = self.static) + rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) + rr.log('/direction', rr.Clear(recursive = True), static = self.static) + rr.log('robot_monologue', rr.Clear(recursive = True), static = self.static) + rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) + if not self.static: + rr.connect('100.108.67.79:9876') + + debug_text = '' + mode = 'navigation' + obs = None + # Do visual grounding + if text is not None and text != '': + with self.voxel_map_lock: + localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) + if localized_point is not None: + rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) + # Do Frontier based exploration + if text is None or text == '' or localized_point is None: + debug_text += '## Navigation fails, so robot starts exploring environments.\n' + localized_point = self.sample_frontier(start_pose, text) + mode = 'exploration' + rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) + print('\n', localized_point, '\n') + + if localized_point is None: + return [] + + if len(localized_point) == 2: + localized_point = np.array([localized_point[0], localized_point[1], 0]) + + point = self.sample_navigation(start_pose, localized_point) + # if mode == 'navigation' and torch.linalg.norm(torch.Tensor(point)[:2] - torch.linalg.norm(localized_point[:2])) > 2.0: + # localized_point = self.sample_frontier(start_pose, None) + # mode = 'exploration' + # point = self.sample_navigation(start_pose, localized_point) + # debug_text += '## All reachable points of 
robot are too far from the target object, explore to find new paths. \n' + + if self.rerun: + buf = BytesIO() + plt.savefig(buf, format='png') + buf.seek(0) + img = Image.open(buf) + img = np.array(img) + buf.close() + rr.log('2d_map', rr.Image(img), static = self.static) + else: + if text != '': + plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') + else: + plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') + plt.clf() + + if self.rerun: + if text is not None and text != '': + debug_text = '### The goal is to navigate to ' + text + '.\n' + debug_text + else: + debug_text = '### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n' + debug_text = '# Robot\'s monologue: \n' + debug_text + rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) + + if obs is not None and mode == 'navigation': + rgb = self.voxel_map.observations[obs - 1].rgb + if not self.rerun: + cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) + else: + rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) + traj = [] + waypoints = None + if point is None: + print('Unable to find any target point, some exception might happen') + else: + print('Target point is', point) + res = self.planner.plan(start_pose, point) + if res.success: + waypoints = [pt.state for pt in res.trajectory] + # If we are navigating to some object of interst, send (x, y, z) of + # the object so that we can make sure the robot looks at the object after navigation + print(waypoints) + # finished = (len(waypoints) <= 4 or torch.linalg.norm(torch.Tensor(point)[:2] - torch.Tensor(start_pose[:2])) > 0.8) and mode == 'navigation' + # finished = mode == 'navigation' + finished = len(waypoints) <= 4 and mode == 'navigation' + if not finished: + waypoints = waypoints[:8] + traj = self.planner.clean_path_for_xy(waypoints) + # traj = traj[1:] + if finished: + traj.append([np.nan, np.nan, np.nan]) + if isinstance(localized_point, torch.Tensor): + localized_point = localized_point.tolist() + traj.append(localized_point) + print('Planned trajectory:', traj) + else: + print('[FAILURE]', res.reason) + + if traj is not None: + origins = [] + vectors = [] + for idx in range(len(traj)): + if idx != len(traj) - 1: + origins.append([traj[idx][0], traj[idx][1], 1.5]) + vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) + rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) + rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) + + self.write_to_pickle() + return traj + + + # def recv_text(self): + # text = self.text_socket.recv_string() + # self.text_socket.send_string('Text recevied, waiting for robot pose') + # start_pose = recv_array(self.text_socket) + # if self.rerun: + # if not self.static: + # rr.set_time_sequence("frame", self.obs_count) + # rr.log('/object', rr.Clear(recursive = True), static = self.static) + # rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) + # rr.log('/direction', rr.Clear(recursive = True), static = self.static) + # rr.log('robot_monologue', rr.Clear(recursive = True), static = self.static) + # rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = 
self.static) + # if not self.static: + # rr.connect('100.108.67.79:9876') + + # debug_text = '' + # mode = 'navigation' + # obs = None + # # Do visual grounding + # if text != '': + # with self.voxel_map_lock: + # localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) + # if localized_point is not None: + # rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) + # # Do Frontier based exploration + # if text is None or text == '' or localized_point is None: + # debug_text += '## Navigation fails, so robot starts exploring environments.\n' + # localized_point = self.sample_frontier(start_pose, text) + # mode = 'exploration' + # rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) + # print('\n', localized_point, '\n') + + # if localized_point is None: + # print('Unable to find any target point, some exception might happen') + # send_array(self.text_socket, []) + # return + + # if len(localized_point) == 2: + # localized_point = np.array([localized_point[0], localized_point[1], 0]) + + # point = self.sample_navigation(start_pose, localized_point) + # if mode == 'navigation' and np.min(np.linalg.norm(np.asarray(point)[:2] - np.asarray(pointcloud)[:, :2], axis = -1)) > 1.2: + # localized_point = self.sample_frontier(start_pose, None) + # mode = 'exploration' + # point = self.sample_navigation(start_pose, localized_point) + # debug_text += '## All reachable points of robot are too far from the target object, explore to find new paths. \n' + + # if self.rerun: + # buf = BytesIO() + # plt.savefig(buf, format='png') + # buf.seek(0) + # img = Image.open(buf) + # img = np.array(img) + # buf.close() + # rr.log('2d_map', rr.Image(img), static = self.static) + # else: + # if text != '': + # plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') + # else: + # plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') + # plt.clf() + + # if self.rerun: + # if text is not None and text != '': + # debug_text = '### The goal is to navigate to ' + text + '.\n' + debug_text + # else: + # debug_text = '### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n' + # debug_text = '# Robot\'s monologue: \n' + debug_text + # rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) + + # if obs is not None and mode == 'navigation': + # rgb = self.voxel_map.observations[obs - 1].rgb + # if not self.rerun: + # cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) + # else: + # rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) + # traj = None + # waypoints = None + # if point is None: + # print('Unable to find any target point, some exception might happen') + # send_array(self.text_socket, []) + # else: + # print('Target point is', point) + # res = self.planner.plan(start_pose, point) + # if res.success: + # waypoints = [pt.state for pt in res.trajectory] + # # If we are navigating to some object of interst, send (x, y, z) of + # # the object so that we can make sure the robot looks at the object after navigation + # # print(waypoints[-1][:2], start_pose[:2]) + # finished = len(waypoints) <= 10 and mode == 'navigation' + # # finished = mode == 'navigation' + # if not finished: + # waypoints = 
waypoints[:7] + # print(waypoints) + # traj = self.planner.clean_path_for_xy(waypoints) + # # traj = traj[1:] + # if finished: + # traj.append([np.nan, np.nan, np.nan]) + # if isinstance(localized_point, torch.Tensor): + # localized_point = localized_point.tolist() + # traj.append(localized_point) + # print('Planned trajectory:', traj) + # send_array(self.text_socket, traj) + # else: + # print('[FAILURE]', res.reason) + # send_array(self.text_socket, []) + + # if traj is not None: + # origins = [] + # vectors = [] + # for idx in range(len(traj)): + # if idx != len(traj) - 1: + # origins.append([traj[idx][0], traj[idx][1], 1.5]) + # vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) + # rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) + # rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) + + # self.write_to_pickle() + + def sample_navigation(self, start, point): + plt.clf() + obstacles, _ = self.voxel_map.get_2d_map() + plt.imshow(obstacles) + if point is None: + start_pt = self.planner.to_pt(start) + plt.scatter(start_pt[1], start_pt[0], s = 10) + return None + goal = self.space.sample_target_point(start, point, self.planner) + print("point:", point, "goal:", goal) + obstacles, explored = self.voxel_map.get_2d_map() + start_pt = self.planner.to_pt(start) + plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') + point_pt = self.planner.to_pt(point) + plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'g') + if goal is not None: + goal_pt = self.planner.to_pt(goal) + plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'r') + return goal + + def sample_frontier(self, start_pose = [0, 0, 0], text = None): + with self.voxel_map_lock: + if text is not None and text != '': + index, time_heuristics, alignments_heuristics, total_heuristics = self.space.sample_exploration(start_pose, self.planner, self.voxel_map_localizer, text, debug = False) + else: + index, time_heuristics, _, total_heuristics = self.space.sample_exploration(start_pose, self.planner, None, None, debug = False) + alignments_heuristics = time_heuristics + + obstacles, explored = self.voxel_map.get_2d_map() + plt.clf() + plt.imshow(obstacles * 0.5 + explored * 0.5) + plt.scatter(index[1], index[0], s = 20, c = 'r') + return self.voxel_map.grid_coords_to_xyt(torch.tensor([index[0], index[1]])) + + + def _recv_image(self): + while True: + # data = recv_array(self.img_socket) + rgb, depth, intrinsics, pose = recv_everything(self.img_socket) + print('Image received') + start_time = time.time() + self.process_rgbd_images(rgb, depth, intrinsics, pose) + process_time = time.time() - start_time + print('Image processing takes', process_time, 'seconds') + print('processing took ' + str(process_time) + ' seconds') + + def forward_one_block(self, resblocks, x): + q, k, v = None, None, None + y = resblocks.ln_1(x) + y = F.linear(y, resblocks.attn.in_proj_weight, resblocks.attn.in_proj_bias) + N, L, C = y.shape + y = y.view(N, L, 3, C//3).permute(2, 0, 1, 3).reshape(3*N, L, C//3) + y = F.linear(y, resblocks.attn.out_proj.weight, resblocks.attn.out_proj.bias) + q, k, v = y.tensor_split(3, dim=0) + v += x + v = v + resblocks.mlp(resblocks.ln_2(v)) + + return v + + def forward_one_block_siglip(self, resblocks, x): + q, k, v = None, None, None + x = F.linear(x, resblocks.in_proj_weight, resblocks.in_proj_bias) + N, L, C = x.shape + x = 
x.view(N, L, 3, C//3).permute(2, 0, 1, 3).reshape(3*N, L, C//3) + x = F.linear(x, resblocks.out_proj.weight, resblocks.out_proj.bias) + q, k, v = x.tensor_split(3, dim=0) + + return v + + def extract_mask_clip_features(self, x, image_shape): + if self.siglip: + with torch.no_grad(): + output = self.clip_model.vision_model(x['pixel_values'], output_hidden_states = True) + feat = output.last_hidden_state + feat = self.forward_one_block_siglip(self.clip_model.vision_model.head.attention, feat) + feat = self.clip_model.vision_model.head.layernorm(feat) + feat = feat + self.clip_model.vision_model.head.mlp(feat) + feat = feat.detach().cpu() + with torch.no_grad(): + N, L, H, W = self.clip_model.vision_model.embeddings.patch_embedding(x['pixel_values']).shape + feat = feat.reshape(N, H, W, L).permute(0, 3, 1, 2) + else: + with torch.no_grad(): + x = self.clip_model.visual.conv1(x) + N, L, H, W = x.shape + x = x.reshape(x.shape[0], x.shape[1], -1) + x = x.permute(0, 2, 1) + x = torch.cat([self.clip_model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) + x = x + self.clip_model.visual.positional_embedding.to(x.dtype) + x = self.clip_model.visual.ln_pre(x) + x = x.permute(1, 0, 2) + for idx in range(self.clip_model.visual.transformer.layers): + if idx == self.clip_model.visual.transformer.layers - 1: + break + x = self.clip_model.visual.transformer.resblocks[idx](x) + x = self.forward_one_block(self.clip_model.visual.transformer.resblocks[-1], x) + x = x[1:] + x = x.permute(1, 0, 2) + x = self.clip_model.visual.ln_post(x) + x = x @ self.clip_model.visual.proj + feat = x.reshape(N, H, W, -1).permute(0, 3, 1, 2) + feat = F.interpolate(feat, image_shape, mode = 'bilinear', align_corners = True) + feat = F.normalize(feat, dim = 1) + return feat.permute(0, 2, 3, 1) + + def run_mask_clip(self, rgb, mask, world_xyz): + + with torch.no_grad(): + results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.15, verbose=False) + xyxy_tensor = results[0].boxes.xyxy + if len(xyxy_tensor) == 0: + return + bounding_boxes = torch.stack(sorted(xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse = True), dim = 0) + bbox_mask = torch.zeros_like(mask) + for box in bounding_boxes: + tl_x, tl_y, br_x, br_y = box + bbox_mask[max(int(tl_y), 0): min(int(br_y), rgb.shape[1]), max(int(tl_x), 0): min(int(br_x), rgb.shape[2])] = 1 + bbox_mask = bbox_mask.bool() + # print(mask, bbox_mask) + mask = torch.logical_or(mask, ~bbox_mask) + + if not self.siglip: + if self.device == 'cpu': + input = self.clip_preprocess(transforms.ToPILImage()(rgb)).unsqueeze(0).to(self.device) + else: + input = self.clip_preprocess(transforms.ToPILImage()(rgb)).unsqueeze(0).to(self.device).half() + else: + input = self.clip_preprocess(images=rgb, padding="max_length", return_tensors="pt") + for i in input: + input[i] = input[i].to(self.device) + features = self.extract_mask_clip_features(input, rgb.shape[-2:])[0].cpu() + + valid_xyz = world_xyz[~mask] + features = features[~mask] + valid_rgb = rgb.permute(1, 2, 0)[~mask] + if len(valid_xyz) != 0: + self.add_to_voxel_pcd(valid_xyz, features, valid_rgb) + + def run_owl_sam_clip(self, rgb, mask, world_xyz): + with torch.no_grad(): + results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.15, verbose=False) + xyxy_tensor = results[0].boxes.xyxy + if len(xyxy_tensor) == 0: + return + + self.mask_predictor.set_image(rgb.permute(1,2,0).numpy()) + # 
bounding_boxes = torch.stack(sorted(results[0]['boxes'], key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse = True), dim = 0) + bounding_boxes = torch.stack(sorted(xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse = True), dim = 0) + # transformed_boxes = self.mask_predictor.transform.apply_boxes_torch(bounding_boxes.detach().to(self.device), rgb.shape[-2:]) + # masks, _, _= self.mask_predictor.predict_torch( + masks, _, _= self.mask_predictor.predict( + point_coords=None, + point_labels=None, + # boxes=transformed_boxes, + box = bounding_boxes, + multimask_output=False + ) + if masks.ndim == 3: + masks = torch.Tensor(masks).bool() + else: + masks = torch.Tensor(masks[:, 0, :, :]).bool() + + # Debug code, visualize all bounding boxes and segmentation masks + + image_vis = np.array(rgb.permute(1, 2, 0)) + segmentation_color_map = np.zeros(image_vis.shape, dtype=np.uint8) + # cv2.imwrite('clean_' + str(self.obs_count) + '.jpg', cv2.cvtColor(image_vis, cv2.COLOR_RGB2BGR)) + for idx, box in enumerate(bounding_boxes): + tl_x, tl_y, br_x, br_y = box + tl_x, tl_y, br_x, br_y = tl_x.item(), tl_y.item(), br_x.item(), br_y.item() + cv2.rectangle(image_vis, (int(tl_x), int(tl_y)), (int(br_x), int(br_y)), (255, 0, 0), 2) + image_vis = cv2.cvtColor(image_vis, cv2.COLOR_RGB2BGR) + for vis_mask in masks: + segmentation_color_map[vis_mask.detach().cpu().numpy()] = [0, 255, 0] + image_vis = cv2.addWeighted(image_vis, 0.7, segmentation_color_map, 0.3, 0) + if not self.rerun: + cv2.imwrite(self.log + "/seg" + str(self.obs_count) + ".jpg", image_vis) + # else: + # rr.log('Segmentation mask', rr.Image(image_vis[:, :, [2, 1, 0]])) + + crops = [] + if not self.siglip: + for (box, mask) in zip(bounding_boxes, masks): + tl_x, tl_y, br_x, br_y = box + crops.append(self.clip_preprocess(transforms.ToPILImage()(rgb[:, max(int(tl_y), 0): min(int(br_y), rgb.shape[1]), max(int(tl_x), 0): min(int(br_x), rgb.shape[2])]))) + features = self.clip_model.encode_image(torch.stack(crops, dim = 0).to(self.device)) + else: + for box in bounding_boxes: + tl_x, tl_y, br_x, br_y = box + crops.append(rgb[:, max(int(tl_y), 0): min(int(br_y), rgb.shape[1]), max(int(tl_x), 0): min(int(br_x), rgb.shape[2])]) + inputs = self.clip_preprocess(images = crops, padding="max_length", return_tensors="pt").to(self.device) + features = self.clip_model.get_image_features(**inputs) + features = F.normalize(features, dim = -1).cpu() + + + for idx, (sam_mask, feature) in enumerate(zip(masks.cpu(), features.cpu())): + valid_mask = torch.logical_and(~mask, sam_mask) + valid_xyz = world_xyz[valid_mask] + if valid_xyz.shape[0] == 0: + continue + feature = feature.repeat(valid_xyz.shape[0], 1) + valid_rgb = rgb.permute(1, 2, 0)[valid_mask] + self.add_to_voxel_pcd(valid_xyz, feature, valid_rgb) + + def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights = None, threshold = 0.95): + # Adding all points to voxelizedPointCloud is useless and expensive, we should exclude threshold of all points + selected_indices = torch.randperm(len(valid_xyz))[:int((1 - threshold) * len(valid_xyz))] + if len(selected_indices) == 0: + return + if valid_xyz is not None: + valid_xyz = valid_xyz[selected_indices] + if feature is not None: + feature = feature[selected_indices] + if valid_rgb is not None: + valid_rgb = valid_rgb[selected_indices] + if weights is not None: + weights = weights[selected_indices] + with self.voxel_map_lock: + self.voxel_map_localizer.add(points = valid_xyz, + features = feature, + rgb = valid_rgb, + 
weights = weights, + obs_count = self.obs_count) + + def process_rgbd_images(self, rgb, depth, intrinsics, pose): + # import time + # start_time = time.time() + if not os.path.exists(self.log): + os.mkdir(self.log) + self.obs_count += 1 + world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) + + # cv2.imwrite('debug/rgb' + str(self.obs_count) + '.jpg', rgb[:, :, [2, 1, 0]]) + np.save(self.log + '/rgb' + str(self.obs_count) + '.npy', rgb) + np.save(self.log + '/depth' + str(self.obs_count) + '.npy', depth) + np.save(self.log + '/intrinsics' + str(self.obs_count) + '.npy', intrinsics) + np.save(self.log + '/pose' + str(self.obs_count) + '.npy', pose) + + rgb, depth = torch.from_numpy(rgb), torch.from_numpy(depth) + rgb = rgb.permute(2, 0, 1).to(torch.uint8) + + median_depth = torch.from_numpy( + scipy.ndimage.median_filter(depth, size=5) + ) + median_filter_error = (depth - median_depth).abs() + valid_depth = torch.logical_and(depth < self.max_depth, depth > self.min_depth) + valid_depth = ( + valid_depth + & (median_filter_error < 0.01).bool() + ) + + with self.voxel_map_lock: + self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) + self.voxel_map.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) + + if self.vision_method == 'mask&*lip': + self.run_owl_sam_clip(rgb, ~valid_depth, world_xyz) + else: + self.run_mask_clip(rgb, ~valid_depth, world_xyz) + + self.voxel_map.add( + camera_pose = torch.Tensor(pose), + rgb = torch.Tensor(rgb).permute(1, 2, 0), + depth = torch.Tensor(depth), + camera_K = torch.Tensor(intrinsics) + ) + obs, exp = self.voxel_map.get_2d_map() + if self.rerun: + if not self.static: + rr.set_time_sequence("frame", self.obs_count) + # rr.log('robot_pov', rr.Image(rgb.permute(1, 2, 0)), static = self.static) + if self.voxel_map.voxel_pcd._points is not None: + rr.log("Obstalce_map/pointcloud", rr.Points3D(self.voxel_map.voxel_pcd._points.detach().cpu(), \ + colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + if self.voxel_map_localizer.voxel_pcd._points is not None: + rr.log("Semantic_memory/pointcloud", rr.Points3D(self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), \ + colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + # rr.log("Obstalce_map/2D_obs_map", rr.Image(obs.int() * 127 + exp.int() * 127)) + else: + cv2.imwrite(self.log + '/debug_' + str(self.obs_count) + '.jpg', np.asarray(obs.int() * 127 + exp.int() * 127)) + # end_time = time.time() + # print('image processing takes ' + str(end_time - start_time) + ' seconds.') + + def read_from_pickle(self, pickle_file_name, num_frames: int = -1): + if isinstance(pickle_file_name, str): + pickle_file_name = Path(pickle_file_name) + assert pickle_file_name.exists(), f"No file found at {pickle_file_name}" + with pickle_file_name.open("rb") as f: + data = pickle.load(f) + for i, ( + camera_pose, + xyz, + rgb, + feats, + depth, + base_pose, + K, + world_xyz, + ) in enumerate( + zip( + data["camera_poses"], + data["xyz"], + data["rgb"], + data["feats"], + data["depth"], + data["base_poses"], + data["camera_K"], + data["world_xyz"], + ) + ): + # Handle the case where we dont actually want to load everything + if num_frames > 0 and i >= num_frames: + break + + camera_pose = self.voxel_map.fix_data_type(camera_pose) + xyz = self.voxel_map.fix_data_type(xyz) + rgb = self.voxel_map.fix_data_type(rgb) + depth = 
self.voxel_map.fix_data_type(depth) + if feats is not None: + feats = self.voxel_map.fix_data_type(feats) + base_pose = self.voxel_map.fix_data_type(base_pose) + self.voxel_map.add( + camera_pose=camera_pose, + xyz=xyz, + rgb=rgb, + feats=feats, + depth=depth, + base_pose=base_pose, + camera_K=K, + ) + self.obs_count += 1 + self.voxel_map_localizer.voxel_pcd._points = data["combined_xyz"] + self.voxel_map_localizer.voxel_pcd._features = data["combined_feats"] + self.voxel_map_localizer.voxel_pcd._weights = data["combined_weights"] + self.voxel_map_localizer.voxel_pcd._rgb = data["combined_rgb"] + self.voxel_map_localizer.voxel_pcd._obs_counts = data["obs_id"] + self.voxel_map_localizer.voxel_pcd._entity_ids = data["entity_id"] + + def write_to_pickle(self): + """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" + if not os.path.exists('debug'): + os.mkdir('debug') + filename = 'debug/' + self.log + '.pkl' + data = {} + data["camera_poses"] = [] + data["camera_K"] = [] + data["base_poses"] = [] + data["xyz"] = [] + data["world_xyz"] = [] + data["rgb"] = [] + data["depth"] = [] + data["feats"] = [] + for frame in self.voxel_map.observations: + # add it to pickle + # TODO: switch to using just Obs struct? + data["camera_poses"].append(frame.camera_pose) + data["base_poses"].append(frame.base_pose) + data["camera_K"].append(frame.camera_K) + data["xyz"].append(frame.xyz) + data["world_xyz"].append(frame.full_world_xyz) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + data["feats"].append(frame.feats) + for k, v in frame.info.items(): + if k not in data: + data[k] = [] + data[k].append(v) + ( + data["combined_xyz"], + data["combined_feats"], + data["combined_weights"], + data["combined_rgb"], + ) = self.voxel_map_localizer.voxel_pcd.get_pointcloud() + data["obs_id"] = self.voxel_map_localizer.voxel_pcd._obs_counts + data["entity_id"] = self.voxel_map_localizer.voxel_pcd._entity_ids + with open(filename, "wb") as f: + pickle.dump(data, f) + +# @hydra.main(version_base="1.2", config_path=".", config_name="config.yaml") +# def main(cfg): +# torch.manual_seed(1) +# imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) +# imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) +# if not cfg.pickle_file_name is None: +# imageProcessor.read_from_pickle(cfg.pickle_file_name) +# print(imageProcessor.voxel_map_localizer.voxel_pcd._points) +# if cfg.open_communication: +# while True: +# imageProcessor.recv_text() +# for i in range(8, 16): +# imageProcessor.read_from_pickle('debug/debug_2024-09-02_16-22-55.pkl', i) +# imageProcessor.space.sample_exploration(xyt = [0, 0, 0], planner = imageProcessor.planner, text = None) + +# if __name__ == "__main__": +# main(None) \ No newline at end of file diff --git a/src/stretch/requirements.txt b/src/stretch/requirements.txt new file mode 100644 index 000000000..a2cb95b17 --- /dev/null +++ b/src/stretch/requirements.txt @@ -0,0 +1,66 @@ +torch_cluster == 1.6.2 +torch_geometric +ultralytics +wget +# SAM2 +# conda install pyg -c pyg + + +# openai +# zmq +# torch +# ultralytics +# rerun-sdk +# numpy <1.24 # certain deprecated operations were used in other deps +# scipy +# opencv-python +# pybullet +# trimesh +# pytest +# open3d +# scikit-image +# scikit-fmm +# scikit-learn +# numpy-quaternion +# natsort +# # Neural networks 
+# openai-clip +# timm +# pandas >= 2.1.3 +# mss +# # visualizations and debugging +# matplotlib +# # Command line tools +# click +# transformers +# data tools +# yacs +# # h5py +# imageio +# pygifsicle +# pynput +# torch_geometric +# torch_cluster +# atomicwrites +# lxml +# transforms3d + +# git+https://github.com/facebookresearch/segment-anything-2.git + +# accelerate +# wget +# setuptools==69.5.1 +# loguru +# sophuspy==1.2.0 +# sentencepiece +# protobuf +# packaging +# pyparsing +# cycler +# kiwisolver +# six +# importlib_metadata +# pytz +# psutil +# transforms3d +# lxml From 4ead1b0d27f2b74a29adc19fde39e34b7ab34d4d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 4 Sep 2024 11:12:48 -0400 Subject: [PATCH 010/410] updates to voxel code --- src/stretch/mapping/voxel/voxel.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 4a3340d12..56195dfda 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -129,6 +129,7 @@ def __init__( voxel_kwargs: Dict[str, Any] = {}, encoder: Optional[BaseImageTextEncoder] = None, map_2d_device: str = "cpu", + device: Optional[str] = None, use_instance_memory: bool = False, use_median_filter: bool = False, median_filter_size: int = 5, @@ -209,6 +210,12 @@ def __init__( self.encoder = encoder self.map_2d_device = map_2d_device + # Set the device we use for things here + if device is not None: + self.device = device + else: + self.device = self.map_2d_device + # Create kernel(s) for obstacle dilation over 2d/3d maps if self.pad_obstacles > 0: self.dilate_obstacles_kernel = torch.nn.Parameter( From e17246a61cc260181b54a9d9ae888c8c8bfbdaba Mon Sep 17 00:00:00 2001 From: stretch-se3-3055 Date: Wed, 4 Sep 2024 09:57:38 -0700 Subject: [PATCH 011/410] modification --- src/stretch/dynav/ok_robot_hw/robot.py | 6 ++++-- src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py | 4 ++-- src/stretch_ros2_bridge/launch/cameras.launch.py | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index fe5bd395a..f535183b8 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -113,9 +113,10 @@ def move_to_position( target_state[5] = wrist_roll # Actual Movement - print('Target Position', target_state) self.robot.arm_to(target_state, blocking = blocking) - print('Actual location', self.robot.get_six_joints()) + print('Expected', target_state) + print('Actual', self.robot.get_six_joints()) + print('Error', target_state - self.robot.get_six_joints()) # Head state update and Movement target_head_pan, target_head_tilt = self.robot.get_pan_tilt() @@ -202,6 +203,7 @@ def move_to_joints(self, joints, gripper, mode=0): self.robot.arm_to(target_state, blocking = True) print(f"current state {self.robot.get_six_joints()}") print(f"target state {target_state}") + print(f"error {target_state - self.robot.get_six_joints()}") # time.sleep(1) #NOTE: below code is to fix the pitch drift issue in current hello-robot. 
Remove it if there is no pitch drift issue diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index 93f818058..de23b0320 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -163,13 +163,13 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height pin_base_point = apply_se3_transform(pin_cam2base_transform, pin_point) # print(f"pin base point {pin_base_point}") - diff_value = (0.227 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip + diff_value = (0.228 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip pin_transformed_point1[2] -= (diff_value) ref_diff = (diff_value) # Moving gripper to a point that is 0.2m away from the pose center in the line of gripper robot.move_to_pose( - [pin_transformed_point1[0], pin_transformed_point1[1], pin_transformed_point1[2] - 0.2], + [pin_transformed_point1[0], pin_transformed_point1[1], pin_transformed_point1[2] - 0.22], [0, 0, 0], [1], move_mode = 1 diff --git a/src/stretch_ros2_bridge/launch/cameras.launch.py b/src/stretch_ros2_bridge/launch/cameras.launch.py index e7d308e49..ced4403be 100644 --- a/src/stretch_ros2_bridge/launch/cameras.launch.py +++ b/src/stretch_ros2_bridge/launch/cameras.launch.py @@ -50,9 +50,9 @@ def generate_launch_description(): # "temporal_filter.enable": "True", # "disparity_filter.enable": "True", "device_type": "d405", - "rgb_camera.color_profile": "480x270x15", - "depth_module.depth_profile": "480x270x15", - "depth_module.color_profile": "480X270X15", + "rgb_camera.color_profile": "640x480x15", + "depth_module.depth_profile": "640x480x15", + "depth_module.color_profile": "640X480X15", # "rgb_camera.profile": "480x270x30", # "depth_module.profile": "480x270x30", "rgb_camera.enable_auto_exposure": "true", From 2ca0348ee71291e4ac1dfadcf117bb733fa65356 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 4 Sep 2024 13:22:17 -0400 Subject: [PATCH 012/410] update head sweeping behavior --- src/stretch/agent/zmq_client.py | 17 +++++++++++++---- src/stretch/app/head_sweep.py | 17 +++++++++++++---- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 7f7d8e222..345639dcd 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -557,7 +557,12 @@ def _wait_for_head( ) -> None: """Wait for the head to move to a particular configuration.""" t0 = timeit.default_timer() - while True: + at_goal = False + + # Wait for the head to move + # If the head is not moving, we are done + # Head must be stationary for at least min_wait_time + while not self._finish: joint_positions, joint_velocities, _ = self.get_joint_state() if joint_positions is None: continue @@ -573,9 +578,15 @@ def _wait_for_head( if verbose: print("Waiting for head to move", pan_err, tilt_err, "head speed =", head_speed) if pan_err < self._head_pan_tolerance and tilt_err < self._head_tilt_tolerance: - break + at_goal = True + at_goal_t = timeit.default_timer() elif resend_action is not None: self.send_socket.send_pyobj(resend_action) + else: + at_goal = False + + if at_goal and timeit.default_timer() - at_goal_t > min_wait_time: + break t1 = timeit.default_timer() if t1 - t0 > min_wait_time and head_speed < self._head_not_moving_tolerance: @@ -587,8 +598,6 @@ def 
_wait_for_head( print("Timeout waiting for head to move") break time.sleep(0.01) - # Tiny pause after head rotation - time.sleep(0.2) def _wait_for_mode(self, mode, verbose: bool = False, timeout: float = 20.0): t0 = timeit.default_timer() diff --git a/src/stretch/app/head_sweep.py b/src/stretch/app/head_sweep.py index 04076491b..c424b7f5b 100644 --- a/src/stretch/app/head_sweep.py +++ b/src/stretch/app/head_sweep.py @@ -47,6 +47,7 @@ def main( parameters=robot.parameters, device_id=device_id, verbose=verbose, + enable_rerun_server=(not show_open3d), ) else: semantic_sensor = None @@ -58,14 +59,22 @@ def main( # Wait and then... robot.head_to(head_pan=0, head_tilt=0, blocking=True) agent.update() - robot.head_to(head_pan=-np.pi / 2, head_tilt=0, blocking=True) + + robot.head_to(head_pan=0, head_tilt=-np.pi / 4, blocking=True) agent.update() - robot.head_to(head_pan=-np.pi, head_tilt=0, blocking=True) + + robot.head_to(head_pan=-np.pi / 4, head_tilt=-np.pi / 4, blocking=True) agent.update() - robot.head_to(head_pan=0, head_tilt=0, blocking=True) + + robot.head_to(head_pan=-np.pi / 2, head_tilt=-np.pi / 4, blocking=True) agent.update() - robot.head_to(head_pan=0, head_tilt=-np.pi / 2, blocking=True) + + robot.head_to(head_pan=-3 * np.pi / 4, head_tilt=-np.pi / 4, blocking=True) + agent.update() + + robot.head_to(head_pan=-np.pi, head_tilt=-np.pi / 4, blocking=True) agent.update() + robot.head_to(head_pan=0, head_tilt=0, blocking=True) agent.update() From 8a0651ed4a587072480d3ac1b418fc7dac752170 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 4 Sep 2024 14:11:53 -0400 Subject: [PATCH 013/410] Rotation of head when updating --- .../agent/operations/rotate_in_place.py | 6 +-- src/stretch/agent/robot_agent.py | 45 ++++++++++++++----- src/stretch/app/mapping.py | 4 +- src/stretch/config/default_planner.yaml | 5 ++- 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/src/stretch/agent/operations/rotate_in_place.py b/src/stretch/agent/operations/rotate_in_place.py index 565c2683c..cfb26c612 100644 --- a/src/stretch/agent/operations/rotate_in_place.py +++ b/src/stretch/agent/operations/rotate_in_place.py @@ -14,14 +14,14 @@ class RotateInPlaceOperation(ManagedOperation): """Rotate the robot in place. 
Number of steps is determined by parameters file.""" def can_start(self) -> bool: - self.attempt(f"Rotating for {self.parameters['in_place_rotation_steps']} steps.") + self.attempt(f"Rotating for {self.parameters['agent']['in_place_rotation_steps']} steps.") return True def run(self) -> None: - self.intro("rotating for {self.parameters['in_place_rotation_steps']} steps.") + self.intro("rotating for {self.parameters['agent']['in_place_rotation_steps']} steps.") self._successful = False self.robot.rotate_in_place( - steps=self.parameters["in_place_rotation_steps"], + steps=self.parameters["agent"]["in_place_rotation_steps"], visualize=False, ) self._successful = True diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 7bde5e47d..1554be0a7 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -82,6 +82,8 @@ def __init__( self.tts = get_text_to_speech(parameters["tts_engine"]) self._use_instance_memory = use_instance_memory + self._sweep_head_on_update = parameters["agent"]["sweep_head_on_update"] + # Parameters for feature matching and exploration self._is_match_threshold = parameters["instance_memory"]["matching"][ "feature_match_threshold" @@ -338,10 +340,10 @@ def rotate_in_place( x, y, theta = self.robot.get_base_pose() if verbose: print(f"==== ROTATE IN PLACE at {x}, {y} ====") - while i < steps - 1: + while i < steps: t0 = timeit.default_timer() self.robot.navigate_to( - [x, y, theta + ((i + 1) * step_size)], + [x, y, theta + (i * step_size)], relative=False, blocking=True, verbose=verbose, @@ -396,6 +398,12 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): obs = None t0 = timeit.default_timer() + steps = 0 + if self._sweep_head_on_update: + num_steps = 5 + else: + num_steps = 1 + while obs is None: obs = self.robot.get_observation() t1 = timeit.default_timer() @@ -403,18 +411,31 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): logger.error("Failed to get observation") return - t1 = timeit.default_timer() - self.obs_history.append(obs) - self.obs_count += 1 - # Optionally do this - if self.semantic_sensor is not None: - # Semantic prediction - obs = self.semantic_sensor.predict(obs) + for i in range(num_steps): + if obs is None: + obs = self.robot.get_observation() - t2 = timeit.default_timer() - self.voxel_map.add_obs(obs) + if i > 0: + pan = -1 * i * np.pi / 4 + tilt = -1 * np.pi / 4 + print(f"Head sweep {i} at {pan}, {tilt}") + self.robot.head_to(pan, tilt, blocking=True) + input("Press enter to continue...") - t3 = timeit.default_timer() + t1 = timeit.default_timer() + self.obs_history.append(obs) + self.obs_count += 1 + # Optionally do this + if self.semantic_sensor is not None: + # Semantic prediction + obs = self.semantic_sensor.predict(obs) + + t2 = timeit.default_timer() + self.voxel_map.add_obs(obs) + t3 = timeit.default_timer() + + # Clear it out + obs = None if self.use_scene_graph: self._update_scene_graph() diff --git a/src/stretch/app/mapping.py b/src/stretch/app/mapping.py index 376579ccb..54df9bb93 100644 --- a/src/stretch/app/mapping.py +++ b/src/stretch/app/mapping.py @@ -177,9 +177,9 @@ def demo_main( matches = [] # Rotate in place - if parameters["in_place_rotation_steps"] > 0: + if parameters["agent"]["in_place_rotation_steps"] > 0: demo.rotate_in_place( - steps=parameters["in_place_rotation_steps"], + steps=parameters["agent"]["in_place_rotation_steps"], visualize=show_intermediate_maps, ) diff --git 
a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 35ca6ae5f..068195e20 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -62,7 +62,10 @@ motion: gripper_open_threshold: 0.3 # Exploration -in_place_rotation_steps: 8 +agent: + # in_place_rotation_steps: 8 # If you are not moving the head, rotate more often + in_place_rotation_steps: 2 + sweep_head_on_update: True # Instance memory parameters # These are mostly around making sure that we reject views of objects that are too small, too spotty, too unreliable, etc. From 1f4d29e52c9f9b7770bbce4502dd884fd0547b2c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 4 Sep 2024 14:13:32 -0400 Subject: [PATCH 014/410] updates to robot agent --- src/stretch/agent/robot_agent.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 1554be0a7..14c1b8d5c 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -412,15 +412,13 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): return for i in range(num_steps): - if obs is None: - obs = self.robot.get_observation() - - if i > 0: + if self._sweep_head_on_update: pan = -1 * i * np.pi / 4 tilt = -1 * np.pi / 4 print(f"Head sweep {i} at {pan}, {tilt}") self.robot.head_to(pan, tilt, blocking=True) input("Press enter to continue...") + obs = self.robot.get_observation() t1 = timeit.default_timer() self.obs_history.append(obs) @@ -434,8 +432,8 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): self.voxel_map.add_obs(obs) t3 = timeit.default_timer() - # Clear it out - obs = None + if not self._sweep_head_on_update: + break if self.use_scene_graph: self._update_scene_graph() From 781f1796a170b0105f1a077bd507774769342756 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 4 Sep 2024 14:14:48 -0400 Subject: [PATCH 015/410] update robot agent code for rotation support --- src/stretch/agent/robot_agent.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 14c1b8d5c..7dadfaa18 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -415,9 +415,8 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): if self._sweep_head_on_update: pan = -1 * i * np.pi / 4 tilt = -1 * np.pi / 4 - print(f"Head sweep {i} at {pan}, {tilt}") + print(f"[UPDATE] Head sweep {i} at {pan}, {tilt}") self.robot.head_to(pan, tilt, blocking=True) - input("Press enter to continue...") obs = self.robot.get_observation() t1 = timeit.default_timer() From 2b23a405c7ff6d99e7943ddc5885ba5ffa6b7d5e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 4 Sep 2024 14:16:41 -0400 Subject: [PATCH 016/410] reduce terminal spam by getting rid of a message --- src/stretch/agent/zmq_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 345639dcd..83df6174b 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -920,10 +920,10 @@ def send_action( self._iter += 1 # TODO: fix all of this - why do we need to do this? 
- print("SENDING THIS ACTION:", next_action) + # print("SENDING THIS ACTION:", next_action) self.send_socket.send_pyobj(next_action) time.sleep(0.01) - print("SENDING THIS ACTION:", next_action) + # print("SENDING THIS ACTION:", next_action) self.send_socket.send_pyobj(next_action) # For tracking goal From 2326ce21b414ca1f2a1581a55ec2c93af7ec2ba3 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 4 Sep 2024 14:23:36 -0400 Subject: [PATCH 017/410] improve mapping --- src/stretch/agent/robot_agent.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 7dadfaa18..d12c48ad5 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -411,10 +411,10 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): logger.error("Failed to get observation") return + tilt = -1 * np.pi / 4 for i in range(num_steps): if self._sweep_head_on_update: pan = -1 * i * np.pi / 4 - tilt = -1 * np.pi / 4 print(f"[UPDATE] Head sweep {i} at {pan}, {tilt}") self.robot.head_to(pan, tilt, blocking=True) obs = self.robot.get_observation() @@ -434,6 +434,9 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): if not self._sweep_head_on_update: break + if self._sweep_head_on_update: + self.robot.head_to(0, tilt, blocking=True) + if self.use_scene_graph: self._update_scene_graph() self.scene_graph.get_relationships() From 90c81a30f26606bfcce8d609c58a68c2d5c56b36 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 10:38:41 -0400 Subject: [PATCH 018/410] modify head sweep --- src/stretch/app/head_sweep.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/stretch/app/head_sweep.py b/src/stretch/app/head_sweep.py index c424b7f5b..b50683ca5 100644 --- a/src/stretch/app/head_sweep.py +++ b/src/stretch/app/head_sweep.py @@ -56,27 +56,30 @@ def main( observation = robot.get_observation() robot.move_to_nav_posture() - # Wait and then... - robot.head_to(head_pan=0, head_tilt=0, blocking=True) - agent.update() + if robot.parameters["agent"]["sweep_head_on_update"]: + agent.update() + else: + # Wait and then... 
+ robot.head_to(head_pan=0, head_tilt=0, blocking=True) + agent.update() - robot.head_to(head_pan=0, head_tilt=-np.pi / 4, blocking=True) - agent.update() + robot.head_to(head_pan=0, head_tilt=-np.pi / 4, blocking=True) + agent.update() - robot.head_to(head_pan=-np.pi / 4, head_tilt=-np.pi / 4, blocking=True) - agent.update() + robot.head_to(head_pan=-np.pi / 4, head_tilt=-np.pi / 4, blocking=True) + agent.update() - robot.head_to(head_pan=-np.pi / 2, head_tilt=-np.pi / 4, blocking=True) - agent.update() + robot.head_to(head_pan=-np.pi / 2, head_tilt=-np.pi / 4, blocking=True) + agent.update() - robot.head_to(head_pan=-3 * np.pi / 4, head_tilt=-np.pi / 4, blocking=True) - agent.update() + robot.head_to(head_pan=-3 * np.pi / 4, head_tilt=-np.pi / 4, blocking=True) + agent.update() - robot.head_to(head_pan=-np.pi, head_tilt=-np.pi / 4, blocking=True) - agent.update() + robot.head_to(head_pan=-np.pi, head_tilt=-np.pi / 4, blocking=True) + agent.update() - robot.head_to(head_pan=0, head_tilt=0, blocking=True) - agent.update() + robot.head_to(head_pan=0, head_tilt=0, blocking=True) + agent.update() if show_open3d: agent.show_map() From 9636058004c2eefaa70009ea4f42d2de377e1029 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 12:17:59 -0400 Subject: [PATCH 019/410] temporarily disable the rerun server --- src/stretch/agent/zmq_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 83df6174b..7fb3545b7 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -86,7 +86,7 @@ def __init__( ee_link_name: Optional[str] = None, manip_mode_controlled_joints: Optional[List[str]] = None, start_immediately: bool = True, - enable_rerun_server: bool = True, + enable_rerun_server: bool = False, ): """ Create a client to communicate with the robot over ZMQ. 
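[Editor's note] The head-sweep commits above converge on one pattern: rather than calling agent.update() once per manually chosen head pose, the agent pans the head itself and folds one observation into the map at each stop. Below is a minimal sketch of that loop, assuming a robot client with head_to()/get_observation() and an agent exposing semantic_sensor and voxel_map as in the diffs; the function name and defaults are illustrative, not part of the library API.

import numpy as np

def sweep_and_update(robot, agent, num_steps: int = 5, tilt: float = -np.pi / 4):
    """Pan the head across the scene, adding one observation per pose."""
    for i in range(num_steps):
        pan = -1 * i * np.pi / 4  # 0, -45, -90, -135, -180 degrees
        robot.head_to(pan, tilt, blocking=True)
        obs = robot.get_observation()
        if agent.semantic_sensor is not None:
            obs = agent.semantic_sensor.predict(obs)  # optional semantic labels
        agent.voxel_map.add_obs(obs)  # fuse this frame into the sparse voxel map
    robot.head_to(0, tilt, blocking=True)  # return the head to a forward pan

With sweep_head_on_update enabled, each map update already covers a wide field of view, which is why the planner config above can drop in_place_rotation_steps from 8 to 2.
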
From ccc5986721e53874b786ba88c9d9496e2a0743d5 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 13:20:25 -0400 Subject: [PATCH 020/410] update planner --- src/stretch/config/default_planner.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 068195e20..ed1d86545 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -12,6 +12,7 @@ voxel_size: 0.05 # 0.05 seems too low obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) +neg_obs_height: -0.05 # Things less than this height ARE obstacles obs_min_density: 10 # This many points makes it an obstacle # Padding From 7a4c9e3322eb65c8e672378c1f4f96a1d162f5c0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 13:25:24 -0400 Subject: [PATCH 021/410] some light refactoring to make it easier to create SVM from parameters --- src/stretch/agent/robot_agent.py | 45 +++-------------------- src/stretch/mapping/voxel/voxel.py | 59 ++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 40 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index d12c48ad5..e50b328c3 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -139,46 +139,11 @@ def _create_voxel_map(self, parameters: Parameters) -> SparseVoxelMap: Returns: SparseVoxelMap: the voxel map """ - return SparseVoxelMap( - resolution=self._voxel_size, - local_radius=parameters["local_radius"], - obs_min_height=parameters["obs_min_height"], - obs_max_height=parameters["obs_max_height"], - min_depth=parameters["min_depth"], - max_depth=parameters["max_depth"], - pad_obstacles=parameters["pad_obstacles"], - add_local_radius_points=parameters.get("add_local_radius_points", default=True), - remove_visited_from_obstacles=parameters.get( - "remove_visited_from_obstacles", default=False - ), - obs_min_density=parameters["obs_min_density"], - encoder=self.encoder, - smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), - use_median_filter=parameters.get("filters/use_median_filter", False), - median_filter_size=parameters.get("filters/median_filter_size", 5), - median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), - use_derivative_filter=parameters.get("filters/use_derivative_filter", False), - derivative_filter_threshold=parameters.get("filters/derivative_filter_threshold", 0.5), - use_instance_memory=(self.semantic_sensor is not None or self._use_instance_memory), - instance_memory_kwargs={ - "min_pixels_for_instance_view": parameters.get("min_pixels_for_instance_view", 100), - "min_instance_thickness": parameters.get( - "instance_memory/min_instance_thickness", 0.01 - ), - "min_instance_vol": parameters.get("instance_memory/min_instance_vol", 1e-6), - "max_instance_vol": parameters.get("instance_memory/max_instance_vol", 10.0), - "min_instance_height": parameters.get("instance_memory/min_instance_height", 0.1), - "max_instance_height": parameters.get("instance_memory/max_instance_height", 1.8), - "min_pixels_for_instance_view": parameters.get( - "instance_memory/min_pixels_for_instance_view", 100 - ), - "min_percent_for_instance_view": parameters.get( - "instance_memory/min_percent_for_instance_view", 0.2 - ), - "open_vocab_cat_map_file": parameters.get("open_vocab_category_map_file", None), - "use_visual_feat": 
parameters.get("use_visual_feat", False), - }, - prune_detected_objects=parameters.get("prune_detected_objects", False), + return SparseVoxelMap.from_parameters( + parameters, + self.encoder, + voxel_size=self._voxel_size, + use_instance_memory=self._use_instance_memory or self.semantic_sensor is not None, ) @property diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 56195dfda..0a78cffa7 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -29,6 +29,7 @@ import stretch.utils.compression as compression import stretch.utils.logger as logger from stretch.core.interfaces import Observations +from stretch.core.parameters import Parameters from stretch.mapping.grid import GridParams from stretch.mapping.instance import Instance, InstanceMemory from stretch.motion import Footprint, PlanResult, RobotModel @@ -118,6 +119,7 @@ def __init__( obs_max_height: float = 1.8, obs_min_density: float = 10, smooth_kernel_size: int = 2, + neg_obs_height: float = 0.0, add_local_radius_points: bool = True, remove_visited_from_obstacles: bool = False, local_radius: float = 0.15, @@ -171,6 +173,7 @@ def __init__( self.feature_dim = feature_dim self.obs_min_height = obs_min_height self.obs_max_height = obs_max_height + self.neg_obs_height = neg_obs_height self.obs_min_density = obs_min_density self.prune_detected_objects = prune_detected_objects @@ -1127,3 +1130,59 @@ def _show_open3d( # Returns xyz and rgb for further inspection points, _, _, rgb = self.voxel_pcd.get_pointcloud() return points, rgb + + @staticmethod + def from_parameters( + cls, + parameters: Parameters, + encoder: BaseImageTextEncoder, + voxel_size: float = 0.05, + use_instance_memory: bool = True, + **kwargs, + ) -> "SparseVoxelMap": + return SparseVoxelMap( + resolution=voxel_size, + local_radius=parameters["local_radius"], + obs_min_height=parameters["obs_min_height"], + obs_max_height=parameters["obs_max_height"], + neg_obs_height=parameters["neg_obs_height"], + min_depth=parameters["min_depth"], + max_depth=parameters["max_depth"], + pad_obstacles=parameters["pad_obstacles"], + add_local_radius_points=parameters.get("add_local_radius_points", default=True), + remove_visited_from_obstacles=parameters.get( + "remove_visited_from_obstacles", default=False + ), + obs_min_density=parameters["obs_min_density"], + encoder=encoder, + smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), + use_median_filter=parameters.get("filters/use_median_filter", False), + median_filter_size=parameters.get("filters/median_filter_size", 5), + median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), + use_derivative_filter=parameters.get("filters/use_derivative_filter", False), + derivative_filter_threshold=parameters.get("filters/derivative_filter_threshold", 0.5), + use_instance_memory=use_instance_memory, + instance_memory_kwargs={ + "min_pixels_for_instance_view": parameters.get("min_pixels_for_instance_view", 100), + "min_instance_thickness": parameters.get( + "instance_memory/min_instance_thickness", 0.01 + ), + "min_instance_vol": parameters.get("instance_memory/min_instance_vol", 1e-6), + "max_instance_vol": parameters.get("instance_memory/max_instance_vol", 10.0), + "min_instance_height": parameters.get("instance_memory/min_instance_height", 0.1), + "max_instance_height": parameters.get("instance_memory/max_instance_height", 1.8), + "min_pixels_for_instance_view": parameters.get( + "instance_memory/min_pixels_for_instance_view", 100 + ), + 
"min_percent_for_instance_view": parameters.get( + "instance_memory/min_percent_for_instance_view", 0.2 + ), + "open_vocab_cat_map_file": parameters.get("open_vocab_category_map_file", None), + "use_visual_feat": parameters.get("use_visual_feat", False), + }, + prune_detected_objects=parameters.get("prune_detected_objects", False), + ) + + def _get_instance_color(instance_id: int) -> List[float]: + """Get a color for an instance""" + return [np.random.random() for _ in range(3)] From effb635d54ba97f557dbaf450b62e444ad71375a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 13:32:41 -0400 Subject: [PATCH 022/410] Update voxels - add negative --- src/stretch/mapping/voxel/voxel.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index db8e53966..1907b7588 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -790,12 +790,8 @@ def get_2d_map(self, debug: bool = False) -> Tuple[np.ndarray, np.ndarray]: device = xyz.device xyz = ((xyz / self.grid_resolution) + self.grid_origin).long() - xyz[xyz[:, -1] < 0, -1] = 0 - - # from stretch.utils.point_cloud import show_point_cloud - # show_point_cloud(xyz, rgb, orig=np.zeros(3)) - xyz[xyz[:, -1] < 0, -1] = 0 - # show_point_cloud(xyz, rgb, orig=np.zeros(3)) + # xyz[xyz[:, -1] < 0, -1] = 0 + negative_obstacles = xyz[:, -1] < self.neg_obs_height # Crop to robot height min_height = int(self.obs_min_height / self.grid_resolution) @@ -805,6 +801,7 @@ def get_2d_map(self, debug: bool = False) -> Tuple[np.ndarray, np.ndarray]: # Mask out obstacles only above a certain height obs_mask = xyz[:, -1] < max_height + obs_mask = obs_mask & negative_obstacles xyz = xyz[obs_mask, :] counts = counts[obs_mask][:, None] From 52ddd080cb4f7d097e3e04f54c2f96752caab120 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 13:38:53 -0400 Subject: [PATCH 023/410] Update server --- src/stretch/simulation/mujoco_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index 7fc943bc3..c445609db 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -600,7 +600,7 @@ def main( scene_model = None if use_robocasa: - scene_model, scene_xml = model_generation_wizard( + scene_model, scene_xml, objects_info = model_generation_wizard( task=robocasa_task, style=robocasa_style, layout=robocasa_layout, From ddce6b8d2f23506be9ff747e4b593b2dfbfb13b8 Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Thu, 5 Sep 2024 14:46:50 -0400 Subject: [PATCH 024/410] test manipulation --- src/stretch/agent/zmq_client.py | 2 ++ src/stretch/dynav/ok_robot_hw/robot.py | 28 +++++++++++++------ .../dynav/ok_robot_hw/utils/grasper_utils.py | 8 +++++- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 141ec12b6..66c6c0f61 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -464,6 +464,8 @@ def arm_to( # If head is not specified, we need to set it to the right head position # In this case, we assume if moving arm you should look at ee _next_action["head_to"] = constants.look_at_ee + # cur_pan, cur_tilt = self.get_pan_tilt() + # _next_action["head_to"] = np.array([cur_pan, cur_tilt]) _next_action["manip_blocking"] = blocking self.send_action(_next_action) diff --git 
a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index 669c8a338..6f588194a 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -49,6 +49,7 @@ def __init__( # Constraining the robots movement self.clamp = lambda n, minn, maxn: max(min(maxn, n), minn) + self.pan, self.tilt = self.robot.get_pan_tilt() def set_end_link(self, link): @@ -112,17 +113,24 @@ def move_to_position( target_state[5] = wrist_roll # Actual Movement - print('Target Position', target_state) - self.robot.arm_to(target_state, blocking = blocking) - print('Actual location', self.robot.get_six_joints()) + # print('Target Position', target_state) + # print('pan tilt before', self.robot.get_pan_tilt()) + self.robot.arm_to(target_state, blocking = blocking, head = np.array([self.pan, self.tilt])) + # print('pan tilt after', self.robot.get_pan_tilt()) + # print('Actual location', self.robot.get_six_joints()) # Head state update and Movement - target_head_pan, target_head_tilt = self.robot.get_pan_tilt() + # target_head_pan, target_head_tilt = self.robot.get_pan_tilt() + target_head_pan = self.pan + target_head_tilt = self.tilt if not head_tilt is None: target_head_tilt = head_tilt + self.tilt = head_tilt if not head_pan is None: target_head_pan = head_pan + self.pan = head_pan self.robot.head_to(head_tilt = target_head_tilt, head_pan = target_head_pan, blocking = blocking) + # self.pan, self.tilt = self.robot.get_pan_tilt() #time.sleep(0.7) def pickup(self, width): @@ -140,7 +148,7 @@ def pickup(self, width): break if next_gripper_pos > 0: - next_gripper_pos -= 0.4 + next_gripper_pos -= 0.35 else: next_gripper_pos = -1 @@ -196,11 +204,13 @@ def move_to_joints(self, joints, gripper, mode=0): if mode == 1: target1 = state target1[1] = target_state[1] - self.robot.arm_to(target1, blocking = True) + self.robot.arm_to(target1, blocking = True, head = np.array([self.pan, self.tilt])) - self.robot.arm_to(target_state, blocking = True) - print(f"current state {self.robot.get_six_joints()}") - print(f"target state {target_state}") + # print('pan tilt before', self.robot.get_pan_tilt()) + self.robot.arm_to(target_state, blocking = True, head = np.array([self.pan, self.tilt])) + # print('pan tilt after', self.robot.get_pan_tilt()) + # print(f"current state {self.robot.get_six_joints()}") + # print(f"target state {target_state}") # time.sleep(1) #NOTE: below code is to fix the pitch drift issue in current hello-robot. 
Remove it if there is no pitch drift issue diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index 7ab3093d3..bd6f9354c 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -31,7 +31,7 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): hello_robot.move_to_position(base_trans=base_trans, head_pan=head_pan, head_tilt=head_tilt) - time.sleep(4) + # time.sleep(4) elif (retry_flag !=0 and side_retries == 3): print("Tried in all angles but couldn't succed") @@ -133,7 +133,9 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # print(f"pin_transformed frame {pin_transformed_frame}") # Lifting the arm to high position as part of pregrasping position + print('pan, tilt before', robot.robot.get_pan_tilt()) robot.move_to_position(lift_pos = 1.05, gripper_pos = gripper_width, head_pan = None, head_tilt = None) + print('pan, tilt after', robot.robot.get_pan_tilt()) # Rotation for aligning Robot gripper frame to Model gripper frame rotation2_top_mat = np.array([[0, 0, 1], @@ -146,11 +148,13 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height print(f"pin final rotation {pin_final_rotation}") rpy_angles = pin.rpy.matrixToRpy(pin_final_rotation) + print('pan, tilt before', robot.robot.get_pan_tilt()) robot.move_to_pose( [0, 0, 0], [rpy_angles[0], rpy_angles[1], rpy_angles[2]], [1], ) + print('pan, tilt after', robot.robot.get_pan_tilt()) # Final grasping point relative to camera pin_cam2gripper_transform = robot.get_joint_transform(base_node, gripper_node) @@ -167,12 +171,14 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height ref_diff = (diff_value) # Moving gripper to a point that is 0.2m away from the pose center in the line of gripper + print('pan, tilt before', robot.robot.get_pan_tilt()) robot.move_to_pose( [pin_transformed_point1[0], pin_transformed_point1[1], pin_transformed_point1[2] - 0.2], [0, 0, 0], [1], move_mode = 1 ) + print('pan, tilt after', robot.robot.get_pan_tilt()) # Z-Axis of link_straight_gripper points in line of gripper # So, the z co-ordiante of point w.r.t gripper gives the distance of point from gripper From 389401e982686104ecfa1bb2c6fdb7438431a77f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 14:57:46 -0400 Subject: [PATCH 025/410] add obstacle code to planner config for tests --- src/test/mapping/planner.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/test/mapping/planner.yaml b/src/test/mapping/planner.yaml index 167f8977a..8b3e83109 100644 --- a/src/test/mapping/planner.yaml +++ b/src/test/mapping/planner.yaml @@ -12,6 +12,7 @@ voxel_size: 0.05 # 0.05 seems too low obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) +neg_obs_height: -0.05 # Things less than this height ARE obstacles obs_min_density: 10 # This many points makes it an obstacle # Padding @@ -61,7 +62,9 @@ motion: head_not_moving_tolerance: 1.0e-4 # Exploration -in_place_rotation_steps: 8 +agent: + in_place_rotation_steps: 2 + sweep_head_on_update: True # Instance memory parameters # These are mostly around making sure that we reject views of objects that are too small, too spotty, too unreliable, etc. 
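[Editor's note] The manipulation changes above work around a default in the client: when arm_to() is called without a head target, the head is re-pointed at the end-effector, which disturbs the gaze that was set up for grasp detection. The OK-Robot wrapper therefore caches the last commanded pan/tilt and re-sends it with every arm command. Below is a condensed sketch of that pattern, assuming the client interface used in the diff (arm_to accepting a head array, head_to, get_pan_tilt); the class name is illustrative.

import numpy as np

class HeadLockedArm:
    """Hold the last commanded head pose while the arm moves."""

    def __init__(self, robot):
        self.robot = robot
        # Seed the cache once from the encoders instead of re-reading them
        # before every motion, which can drift slightly between queries.
        self.pan, self.tilt = robot.get_pan_tilt()

    def look(self, head_pan=None, head_tilt=None, blocking=True):
        # Update the cache only for the axes the caller actually set.
        if head_pan is not None:
            self.pan = head_pan
        if head_tilt is not None:
            self.tilt = head_tilt
        self.robot.head_to(head_pan=self.pan, head_tilt=self.tilt, blocking=blocking)

    def arm_to(self, target_state, blocking=True):
        # Re-send the cached head target with the arm command so the head
        # keeps looking at the grasp target instead of snapping to the gripper.
        self.robot.arm_to(target_state, blocking=blocking, head=np.array([self.pan, self.tilt]))

This mirrors what move_to_position() and move_to_joints() do in the diff above, and is why the pickup routine can print matching pan/tilt values before and after each arm motion.
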
From 835b026f6ff82a4f387a881690df4a5baea4a65a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 15:22:15 -0400 Subject: [PATCH 026/410] updates to planner and voxel --- src/stretch/config/default_planner.yaml | 9 +++++---- src/stretch/mapping/voxel/voxel.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index ed1d86545..677acd79a 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -13,11 +13,11 @@ voxel_size: 0.05 obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles -obs_min_density: 10 # This many points makes it an obstacle +obs_min_density: 5 # This many points makes it an obstacle # Padding -# pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles -pad_obstacles: 1 # Add this many units (voxel_size) to the area around obstacles +pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles +# pad_obstacles: 1 # Add this many units (voxel_size) to the area around obstacles min_pad_obstacles: 0 # Do not pad LESS than this amount, for safety. local_radius: 0.8 # Area around the robot to mark as explored (kind of a hack) @@ -64,7 +64,8 @@ motion: # Exploration agent: - # in_place_rotation_steps: 8 # If you are not moving the head, rotate more often + #in_place_rotation_steps: 8 # If you are not moving the head, rotate more often + #sweep_head_on_update: False in_place_rotation_steps: 2 sweep_head_on_update: True diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 1907b7588..c9de234ca 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -801,7 +801,7 @@ def get_2d_map(self, debug: bool = False) -> Tuple[np.ndarray, np.ndarray]: # Mask out obstacles only above a certain height obs_mask = xyz[:, -1] < max_height - obs_mask = obs_mask & negative_obstacles + obs_mask = obs_mask | negative_obstacles xyz = xyz[obs_mask, :] counts = counts[obs_mask][:, None] From cc20ec026af1cdfe48d1b96ff53886d0690799f1 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 15:37:23 -0400 Subject: [PATCH 027/410] fix status --- src/stretch/simulation/mujoco_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index c445609db..ab5ab1368 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -380,7 +380,7 @@ def start(self, robocasa: bool = False): while self.is_running(): self._camera_data = self.robot_sim.pull_camera_data() - self._status = self.robot_sim.pull_status() + self._status = self.robot_sim.status time.sleep(1 / self.simulation_rate) @override From 77a99226e7c78d442d9c47bb871e3e43bbff17bd Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 15:38:36 -0400 Subject: [PATCH 028/410] minor simulation fixes --- src/stretch/simulation/mujoco_server.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index ab5ab1368..4836404d4 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -113,7 +113,6 @@ def __init__( self.servo_report_steps = 
1000 self._camera_data = None - self._status = None # Controller stuff # Is the velocity controller active? @@ -190,10 +189,10 @@ def _control_loop_thread(self): def control_loop_callback(self): """Actual controller timer callback""" - if self._status is None: + if self.robot_status is None: vel_odom = [0, 0] else: - vel_odom = self._status["base"]["x_vel"], self._status["base"]["theta_vel"] + vel_odom = self.robot_status["base"]["x_vel"], self.robot_status["base"]["theta_vel"] if self.debug_control_loop: print("Control loop callback: ", self.active, self.xyt_goal, vel_odom) @@ -244,7 +243,7 @@ def base_controller_at_goal(self): def get_joint_state(self): """Get the joint state of the robot.""" - status = self._status + status = self.robot_status positions = np.zeros(constants.stretch_degrees_of_freedom) velocities = np.zeros(constants.stretch_degrees_of_freedom) @@ -380,9 +379,12 @@ def start(self, robocasa: bool = False): while self.is_running(): self._camera_data = self.robot_sim.pull_camera_data() - self._status = self.robot_sim.status time.sleep(1 / self.simulation_rate) + @property + def robot_status(self) -> Dict[str, Any]: + return self.robot_sim.status + @override def handle_action(self, action: Dict[str, Any]): """Handle the action received from the client.""" From b64e8019c36adabf4a6fd8dfc8f2795cda788088 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 5 Sep 2024 15:52:50 -0400 Subject: [PATCH 029/410] update query --- src/stretch/app/query.py | 1 + src/stretch/config/default_planner.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/stretch/app/query.py b/src/stretch/app/query.py index aea1103f9..1b587e656 100644 --- a/src/stretch/app/query.py +++ b/src/stretch/app/query.py @@ -48,6 +48,7 @@ @click.option("--frame", default=-1, help="Final frame to read from input file") @click.option("--text", default="", help="Text to encode") @click.option("-y", "--yes", is_flag=True, help="Skip confirmation") +@click.option("-l", "--load", type=str, default="", help="Load a saved voxel map") @click.option( "--all-matches", is_flag=True, diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 677acd79a..7557062d4 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -13,7 +13,7 @@ voxel_size: 0.05 obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles -obs_min_density: 5 # This many points makes it an obstacle +obs_min_density: 10 # This many points makes it an obstacle # Padding pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles From f27332ccb2b678c70d90b8ad901ab3ce41ff2a92 Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Thu, 5 Sep 2024 18:12:32 -0400 Subject: [PATCH 030/410] attempt --- src/stretch/config/default_planner.yaml | 12 +- src/stretch/dynav/mapping_utils/voxel_map.py | 79 +++++++--- src/stretch/dynav/robot_agent_manip_mdp.py | 23 ++- src/stretch/dynav/run_ok_nav.py | 18 ++- src/stretch/dynav/voxel_map_server.py | 150 +++++++++---------- 5 files changed, 158 insertions(+), 124 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index bb611c02e..2586e368c 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -49,12 +49,12 @@ motion: angle_threshold: 0.01 # How much the 
robot has to rotate to be considered "rotating" min_steps_not_moving: 2 # How many steps the robot has to not move before we consider it "stopped" joint_tolerance: - arm: 0.005 - base_x: 0.005 - lift: 0.005 - wrist_roll: 0.01 - wrist_pitch: 0.01 - wrist_yaw: 0.01 + arm: 0.02 + base_x: 0.02 + lift: 0.02 + wrist_roll: 0.1 + wrist_pitch: 0.1 + wrist_yaw: 0.1 # arm: 0.05 # base_x: 0.05 # lift: 0.05 diff --git a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py index 8cd14c743..23fe99e2c 100644 --- a/src/stretch/dynav/mapping_utils/voxel_map.py +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -326,6 +326,7 @@ def sample_target_point( start: torch.Tensor, point: torch.Tensor, planner, + exploration: bool = False ) -> Optional[np.array]: """Sample a position near the mask and return. @@ -368,44 +369,76 @@ def sample_target_point( # TODO: was this: # expanded_mask = expanded_mask & less_explored & ~obstacles + scores = [] + points = [] + for selected_target in selected_targets: selected_x, selected_y = planner.to_xy([selected_target[0], selected_target[1]]) theta = self.compute_theta(selected_x, selected_y, point[0], point[1]) - # if debug and self.is_valid([selected_x, selected_y, theta]): - # import matplotlib.pyplot as plt - - # obstacles, explored = self.voxel_map.get_2d_map() - # plt.scatter(ys, xs, s = 1) - # plt.scatter(selected_target[1], selected_target[0], s = 10) - # plt.scatter(target_y, target_x, s = 10) - # plt.imshow(obstacles) target_is_valid = self.is_valid([selected_x, selected_y, theta]) - # print('Target:', [selected_x, selected_y, theta]) - # print('Target is valid:', target_is_valid) if not target_is_valid: continue if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) < 0.35: continue - elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: - print('OBSTACLE AVOIDANCE') - print(selected_target[0].int(), selected_target[1].int()) + + points.append([selected_x, selected_y, theta]) + score = np.linalg.norm(point[:2] - np.array([selected_x, selected_y])) + + if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5 and not exploration: + # print('OBSTACLE AVOIDANCE') i = (point[0] - selected_target[0]) // abs(point[0] - selected_target[0]) j = (point[1] - selected_target[1]) // abs(point[1] - selected_target[1]) index_i = int(selected_target[0].int() + i) index_j = int(selected_target[1].int() + j) if obstacles[index_i][index_j]: - target_is_valid = False - # elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: - # for i in [-1, 0, 1]: - # for j in [-1, 0, 1]: - # if obstacles[selected_target[0] + i][selected_target[1] + j]: - # target_is_valid = False - if not target_is_valid: - continue + score += 0.5 + + scores.append(score) + + if len(points) == 0: + return None + else: + return points[np.argmax(scores)] + + # for selected_target in selected_targets: + # selected_x, selected_y = planner.to_xy([selected_target[0], selected_target[1]]) + # theta = self.compute_theta(selected_x, selected_y, point[0], point[1]) + + # # if debug and self.is_valid([selected_x, selected_y, theta]): + # # import matplotlib.pyplot as plt + + # # obstacles, explored = self.voxel_map.get_2d_map() + # # plt.scatter(ys, xs, s = 1) + # # plt.scatter(selected_target[1], selected_target[0], s = 10) + # # plt.scatter(target_y, target_x, s = 10) + # # plt.imshow(obstacles) + # target_is_valid = self.is_valid([selected_x, selected_y, theta]) + # # print('Target:', [selected_x, selected_y, 
theta]) + # # print('Target is valid:', target_is_valid) + # if not target_is_valid: + # continue + # if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) < 0.35: + # continue + # elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: + # print('OBSTACLE AVOIDANCE') + # print(selected_target[0].int(), selected_target[1].int()) + # i = (point[0] - selected_target[0]) // abs(point[0] - selected_target[0]) + # j = (point[1] - selected_target[1]) // abs(point[1] - selected_target[1]) + # index_i = int(selected_target[0].int() + i) + # index_j = int(selected_target[1].int() + j) + # if obstacles[index_i][index_j]: + # target_is_valid = False + # # elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: + # # for i in [-1, 0, 1]: + # # for j in [-1, 0, 1]: + # # if obstacles[selected_target[0] + i][selected_target[1] + j]: + # # target_is_valid = False + # if not target_is_valid: + # continue - return np.array([selected_x, selected_y, theta]) - return None + # return np.array([selected_x, selected_y, theta]) + # return None def sample_near_mask( self, diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 562a49663..c1051894a 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -94,7 +94,6 @@ def look_around(self): end_time = time.time() print('moving head takes ', end_time - start_time, 'seconds.') self.update() - self.robot.look_front() def rotate_in_place(self): print("*" * 10, "Rotate in place", "*" * 10) @@ -112,19 +111,19 @@ def update(self): self.obs_history.append(obs) self.obs_count += 1 rgb, depth, K, camera_pose = obs.rgb, obs.depth, obs.camera_K, obs.camera_pose - start_time = time.time() + # start_time = time.time() self.image_processor.process_rgbd_images(rgb, depth, K, camera_pose) - end_time = time.time() - print('Image processing takes', end_time - start_time, 'seconds.') + # end_time = time.time() + # print('Image processing takes', end_time - start_time, 'seconds.') def execute_action( self, text: str, ): - start_time = time.time() + # start_time = time.time() - # self.robot.look_front() - self.look_around() + self.robot.look_front() + # self.look_around() # self.robot.look_front() self.robot.switch_to_navigation_mode() @@ -134,11 +133,11 @@ def execute_action( res = self.image_processor.process_text(text, start) look_around_finish = time.time() - look_around_take = look_around_finish - start_time - print('Looking around takes ', look_around_take, ' seconds.') - self.look_around_times.append(look_around_take) - print(self.look_around_times) - print(sum(self.look_around_times) / len(self.look_around_times)) + # look_around_take = look_around_finish - start_time + # print('Looking around takes ', look_around_take, ' seconds.') + # self.look_around_times.append(look_around_take) + # print(self.look_around_times) + # print(sum(self.look_around_times) / len(self.look_around_times)) if len(res) > 0: print("Plan successful!") diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 3bee44f06..c43e4acde 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -16,6 +16,8 @@ from stretch.agent import RobotClient import cv2 +import threading +import time def compute_tilt(camera_xyz, target_xyz): ''' @@ -75,13 +77,17 @@ def main( else: demo.image_processor.read_from_pickle(input_path) - # def keep_looking_around(): - # while True: - # demo.look_around() + def 
keep_looking_around(): + while True: + time.sleep(0.3) + if robot.get_six_joints()[2] > 0.7 or not robot.in_navigation_mode(): + continue + demo.update() + demo.image_processor.compute_path(robot.get_base_pose()) - # img_thread = threading.Thread(target=keep_looking_around) - # img_thread.daemon = True - # img_thread.start() + img_thread = threading.Thread(target=keep_looking_around) + img_thread.daemon = True + img_thread.start() while True: mode = input('select mode? E/N/S') diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index 2c9e57c20..7eb126db9 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -152,6 +152,9 @@ def __init__(self, self.img_thread = threading.Thread(target=self._recv_image) self.img_thread.daemon = True self.img_thread.start() + + self.path = None + self.text = None def create_obstacle_map(self): print("- Load parameters") @@ -220,28 +223,25 @@ def create_vision_model(self): self.yolo_model.set_classes(self.texts) self.voxel_map_localizer = VoxelMapLocalizer(self.voxel_map, clip_model = self.clip_model, processor = self.clip_preprocess, device = self.device, siglip = self.siglip) - def process_text(self, text, start_pose): + def compute_path(self, start_pose): if self.rerun: - if not self.static: - rr.set_time_sequence("frame", self.obs_count) rr.log('/object', rr.Clear(recursive = True), static = self.static) rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) rr.log('/direction', rr.Clear(recursive = True), static = self.static) rr.log('robot_monologue', rr.Clear(recursive = True), static = self.static) rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) - if not self.static: - rr.connect('100.108.67.79:9876') debug_text = '' mode = 'navigation' obs = None + with self.voxel_map_lock: + text = self.text # Do visual grounding if text is not None and text != '': - with self.voxel_map_lock: - localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) + localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) if localized_point is not None: rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) - # Do Frontier based exploration + # Do exploration if text is None or text == '' or localized_point is None: debug_text += '## Navigation fails, so robot starts exploring environments.\n' localized_point = self.sample_frontier(start_pose, text) @@ -250,32 +250,29 @@ def process_text(self, text, start_pose): print('\n', localized_point, '\n') if localized_point is None: - return [] + with self.voxel_map_lock: + self.traj = (text, None, start_pose) + return if len(localized_point) == 2: localized_point = np.array([localized_point[0], localized_point[1], 0]) - point = self.sample_navigation(start_pose, localized_point) - # if mode == 'navigation' and torch.linalg.norm(torch.Tensor(point)[:2] - torch.linalg.norm(localized_point[:2])) > 2.0: - # localized_point = self.sample_frontier(start_pose, None) - # mode = 'exploration' - # point = self.sample_navigation(start_pose, localized_point) - # debug_text += '## All reachable points of robot are too far from the target object, explore to find new paths. 
\n' - - if self.rerun: - buf = BytesIO() - plt.savefig(buf, format='png') - buf.seek(0) - img = Image.open(buf) - img = np.array(img) - buf.close() - rr.log('2d_map', rr.Image(img), static = self.static) - else: - if text != '': - plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') - else: - plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') - plt.clf() + point = self.sample_navigation(start_pose, localized_point, mode = mode) + + # if self.rerun: + # buf = BytesIO() + # # plt.savefig(buf, format='png') + # buf.seek(0) + # img = Image.open(buf) + # img = np.array(img) + # buf.close() + # rr.log('2d_map', rr.Image(img), static = self.static) + # else: + # if text != '': + # plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') + # else: + # plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') + # plt.clf() if self.rerun: if text is not None and text != '': @@ -303,11 +300,9 @@ def process_text(self, text, start_pose): # If we are navigating to some object of interst, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation print(waypoints) - # finished = (len(waypoints) <= 4 or torch.linalg.norm(torch.Tensor(point)[:2] - torch.Tensor(start_pose[:2])) > 0.8) and mode == 'navigation' - # finished = mode == 'navigation' finished = len(waypoints) <= 4 and mode == 'navigation' if not finished: - waypoints = waypoints[:8] + waypoints = waypoints[:4] traj = self.planner.clean_path_for_xy(waypoints) # traj = traj[1:] if finished: @@ -315,9 +310,6 @@ def process_text(self, text, start_pose): if isinstance(localized_point, torch.Tensor): localized_point = localized_point.tolist() traj.append(localized_point) - print('Planned trajectory:', traj) - else: - print('[FAILURE]', res.reason) if traj is not None: origins = [] @@ -330,16 +322,24 @@ def process_text(self, text, start_pose): rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) self.write_to_pickle() - return traj + with self.voxel_map_lock: + self.traj = (text, traj, start_pose) + + def process_text(self, text, start_pose): + if self.traj is not None: + saved_text, traj, saved_start_pose = self.traj + if saved_text == text and np.linalg.norm(saved_start_pose - start_pose) <= 0.25: + return traj + with self.voxel_map_lock: + self.text = text + self.compute_path(start_pose) + return self.traj[1] - # def recv_text(self): - # text = self.text_socket.recv_string() - # self.text_socket.send_string('Text recevied, waiting for robot pose') - # start_pose = recv_array(self.text_socket) + # def process_text(self, text, start_pose): # if self.rerun: - # if not self.static: - # rr.set_time_sequence("frame", self.obs_count) + # # if not self.static: + # # rr.set_time_sequence("frame", self.obs_count) # rr.log('/object', rr.Clear(recursive = True), static = self.static) # rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) # rr.log('/direction', rr.Clear(recursive = True), static = self.static) @@ -347,12 +347,12 @@ def process_text(self, text, start_pose): # rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) # if not self.static: # rr.connect('100.108.67.79:9876') - + # debug_text = '' # mode = 'navigation' # obs = None # # Do visual grounding - # if text != '': + # if text is not None and text != '': # with self.voxel_map_lock: # localized_point, debug_text, obs, 
pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) # if localized_point is not None: @@ -364,21 +364,19 @@ def process_text(self, text, start_pose): # mode = 'exploration' # rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) # print('\n', localized_point, '\n') - + # if localized_point is None: - # print('Unable to find any target point, some exception might happen') - # send_array(self.text_socket, []) - # return + # return [] # if len(localized_point) == 2: # localized_point = np.array([localized_point[0], localized_point[1], 0]) - # point = self.sample_navigation(start_pose, localized_point) - # if mode == 'navigation' and np.min(np.linalg.norm(np.asarray(point)[:2] - np.asarray(pointcloud)[:, :2], axis = -1)) > 1.2: - # localized_point = self.sample_frontier(start_pose, None) - # mode = 'exploration' - # point = self.sample_navigation(start_pose, localized_point) - # debug_text += '## All reachable points of robot are too far from the target object, explore to find new paths. \n' + # point = self.sample_navigation(start_pose, localized_point, mode = mode) + # # if mode == 'navigation' and torch.linalg.norm(torch.Tensor(point)[:2] - torch.linalg.norm(localized_point[:2])) > 2.0: + # # localized_point = self.sample_frontier(start_pose, None) + # # mode = 'exploration' + # # point = self.sample_navigation(start_pose, localized_point) + # # debug_text += '## All reachable points of robot are too far from the target object, explore to find new paths. \n' # if self.rerun: # buf = BytesIO() @@ -409,11 +407,10 @@ def process_text(self, text, start_pose): # cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) # else: # rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) - # traj = None + # traj = [] # waypoints = None # if point is None: # print('Unable to find any target point, some exception might happen') - # send_array(self.text_socket, []) # else: # print('Target point is', point) # res = self.planner.plan(start_pose, point) @@ -421,12 +418,12 @@ def process_text(self, text, start_pose): # waypoints = [pt.state for pt in res.trajectory] # # If we are navigating to some object of interst, send (x, y, z) of # # the object so that we can make sure the robot looks at the object after navigation - # # print(waypoints[-1][:2], start_pose[:2]) - # finished = len(waypoints) <= 10 and mode == 'navigation' + # print(waypoints) + # # finished = (len(waypoints) <= 4 or torch.linalg.norm(torch.Tensor(point)[:2] - torch.Tensor(start_pose[:2])) > 0.8) and mode == 'navigation' # # finished = mode == 'navigation' + # finished = len(waypoints) <= 4 and mode == 'navigation' # if not finished: - # waypoints = waypoints[:7] - # print(waypoints) + # waypoints = waypoints[:4] # traj = self.planner.clean_path_for_xy(waypoints) # # traj = traj[1:] # if finished: @@ -435,10 +432,8 @@ def process_text(self, text, start_pose): # localized_point = localized_point.tolist() # traj.append(localized_point) # print('Planned trajectory:', traj) - # send_array(self.text_socket, traj) # else: # print('[FAILURE]', res.reason) - # send_array(self.text_socket, []) # if traj is not None: # origins = [] @@ -451,25 +446,26 @@ def process_text(self, text, start_pose): # rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) # self.write_to_pickle() + # return traj - def sample_navigation(self, start, 
point): - plt.clf() + def sample_navigation(self, start, point, mode = 'navigation'): + # plt.clf() obstacles, _ = self.voxel_map.get_2d_map() - plt.imshow(obstacles) + # plt.imshow(obstacles) if point is None: start_pt = self.planner.to_pt(start) - plt.scatter(start_pt[1], start_pt[0], s = 10) + # plt.scatter(start_pt[1], start_pt[0], s = 10) return None - goal = self.space.sample_target_point(start, point, self.planner) + goal = self.space.sample_target_point(start, point, self.planner, exploration = mode != 'navigation') print("point:", point, "goal:", goal) obstacles, explored = self.voxel_map.get_2d_map() start_pt = self.planner.to_pt(start) - plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') + # plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') point_pt = self.planner.to_pt(point) - plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'g') + # plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'g') if goal is not None: goal_pt = self.planner.to_pt(goal) - plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'r') + # plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'r') return goal def sample_frontier(self, start_pose = [0, 0, 0], text = None): @@ -481,9 +477,9 @@ def sample_frontier(self, start_pose = [0, 0, 0], text = None): alignments_heuristics = time_heuristics obstacles, explored = self.voxel_map.get_2d_map() - plt.clf() - plt.imshow(obstacles * 0.5 + explored * 0.5) - plt.scatter(index[1], index[0], s = 20, c = 'r') + # plt.clf() + # plt.imshow(obstacles * 0.5 + explored * 0.5) + # plt.scatter(index[1], index[0], s = 20, c = 'r') return self.voxel_map.grid_coords_to_xyt(torch.tensor([index[0], index[1]])) @@ -684,7 +680,7 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): self.obs_count += 1 world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) - # cv2.imwrite('debug/rgb' + str(self.obs_count) + '.jpg', rgb[:, :, [2, 1, 0]]) + cv2.imwrite(self.log + '/rgb' + str(self.obs_count) + '.jpg', rgb[:, :, [2, 1, 0]]) np.save(self.log + '/rgb' + str(self.obs_count) + '.npy', rgb) np.save(self.log + '/depth' + str(self.obs_count) + '.npy', depth) np.save(self.log + '/intrinsics' + str(self.obs_count) + '.npy', intrinsics) @@ -720,8 +716,8 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): ) obs, exp = self.voxel_map.get_2d_map() if self.rerun: - if not self.static: - rr.set_time_sequence("frame", self.obs_count) + # if not self.static: + # rr.set_time_sequence("frame", self.obs_count) # rr.log('robot_pov', rr.Image(rgb.permute(1, 2, 0)), static = self.static) if self.voxel_map.voxel_pcd._points is not None: rr.log("Obstalce_map/pointcloud", rr.Points3D(self.voxel_map.voxel_pcd._points.detach().cpu(), \ From 4bdd5772506f591fd7132b1f06a2da1f2dfd855a Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Thu, 5 Sep 2024 20:11:05 -0400 Subject: [PATCH 031/410] debug exploration --- src/stretch/dynav/mapping_utils/voxel_map.py | 5 +-- src/stretch/dynav/voxel_map_server.py | 37 +++++++++++--------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py index 8cd14c743..2f457a5a1 100644 --- a/src/stretch/dynav/mapping_utils/voxel_map.py +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -385,7 +385,7 @@ def sample_target_point( # print('Target is valid:', target_is_valid) if not target_is_valid: continue - if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) < 0.35: + if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) < 0.4: continue elif 
np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: print('OBSTACLE AVOIDANCE') @@ -678,8 +678,9 @@ def _time_heuristic(self, history_soft, outside_frontier, time_smooth = 0.1, tim time_heuristics = 1 / (1 + np.exp(-time_smooth * (time_heuristics - time_threshold))) index = np.unravel_index(np.argmax(time_heuristics), history_soft.shape) # return index + debug = True if debug: - plt.clf() + # plt.clf() plt.title('time') plt.imshow(time_heuristics) plt.scatter(index[1], index[0], s = 15, c = 'r') diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index 2c9e57c20..aad1196bd 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -305,7 +305,7 @@ def process_text(self, text, start_pose): print(waypoints) # finished = (len(waypoints) <= 4 or torch.linalg.norm(torch.Tensor(point)[:2] - torch.Tensor(start_pose[:2])) > 0.8) and mode == 'navigation' # finished = mode == 'navigation' - finished = len(waypoints) <= 4 and mode == 'navigation' + finished = len(waypoints) <= 5 and mode == 'navigation' if not finished: waypoints = waypoints[:8] traj = self.planner.clean_path_for_xy(waypoints) @@ -831,19 +831,22 @@ def write_to_pickle(self): pickle.dump(data, f) # @hydra.main(version_base="1.2", config_path=".", config_name="config.yaml") -# def main(cfg): -# torch.manual_seed(1) -# imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) -# imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) -# if not cfg.pickle_file_name is None: -# imageProcessor.read_from_pickle(cfg.pickle_file_name) -# print(imageProcessor.voxel_map_localizer.voxel_pcd._points) -# if cfg.open_communication: -# while True: -# imageProcessor.recv_text() -# for i in range(8, 16): -# imageProcessor.read_from_pickle('debug/debug_2024-09-02_16-22-55.pkl', i) -# imageProcessor.space.sample_exploration(xyt = [0, 0, 0], planner = imageProcessor.planner, text = None) - -# if __name__ == "__main__": -# main(None) \ No newline at end of file +def main(cfg): + torch.manual_seed(1) + imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) + # imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) + # if not cfg.pickle_file_name is None: + # imageProcessor.read_from_pickle(cfg.pickle_file_name) + # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) + # if cfg.open_communication: + # while True: + # imageProcessor.recv_text() + for i in range(8, 47): + imageProcessor.read_from_pickle('debug/debug_2024-09-05_18-13-12.pkl', i) + obs, exp = imageProcessor.voxel_map.get_2d_map() + plt.imshow(obs + exp) + # plt.imshow(exp) + imageProcessor.space.sample_exploration(xyt = [0, 0, 0], planner = imageProcessor.planner, text = None) + +if __name__ == "__main__": + main(None) \ No newline at end of file From f5e7b790981c2b0418db3c293e51877eb0aca15b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 6 Sep 2024 13:28:43 -0400 Subject: [PATCH 032/410] Fixing issues with collisions and exploration --- src/stretch/agent/robot_agent.py | 2 ++ src/stretch/simulation/mujoco_server.py | 14 ++++++-------- src/stretch/utils/voxel.py | 2 ++ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 220542132..9b08e187b 100644 --- a/src/stretch/agent/robot_agent.py 
+++ b/src/stretch/agent/robot_agent.py @@ -439,6 +439,8 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): if self._sweep_head_on_update: self.robot.head_to(0, tilt, blocking=True) + x, y, theta = self.robot.get_base_pose() + self.voxel_map.delete_obstacles(radius=0.3, point=np.array([x, y])) if self.use_scene_graph: self._update_scene_graph() diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index 4836404d4..a54886ad2 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -113,6 +113,7 @@ def __init__( self.servo_report_steps = 1000 self._camera_data = None + self._status = None # Controller stuff # Is the velocity controller active? @@ -189,10 +190,10 @@ def _control_loop_thread(self): def control_loop_callback(self): """Actual controller timer callback""" - if self.robot_status is None: + if self._status is None: vel_odom = [0, 0] else: - vel_odom = self.robot_status["base"]["x_vel"], self.robot_status["base"]["theta_vel"] + vel_odom = self._status["base"]["x_vel"], self._status["base"]["theta_vel"] if self.debug_control_loop: print("Control loop callback: ", self.active, self.xyt_goal, vel_odom) @@ -243,7 +244,7 @@ def base_controller_at_goal(self): def get_joint_state(self): """Get the joint state of the robot.""" - status = self.robot_status + status = self._status positions = np.zeros(constants.stretch_degrees_of_freedom) velocities = np.zeros(constants.stretch_degrees_of_freedom) @@ -379,12 +380,9 @@ def start(self, robocasa: bool = False): while self.is_running(): self._camera_data = self.robot_sim.pull_camera_data() + self._status = self.robot_sim._pull_status() time.sleep(1 / self.simulation_rate) - @property - def robot_status(self) -> Dict[str, Any]: - return self.robot_sim.status - @override def handle_action(self, action: Dict[str, Any]): """Handle the action received from the client.""" @@ -602,7 +600,7 @@ def main( scene_model = None if use_robocasa: - scene_model, scene_xml, objects_info = model_generation_wizard( + scene_model, scene_xml = model_generation_wizard( task=robocasa_task, style=robocasa_style, layout=robocasa_layout, diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index 14a73a4fd..2b3411f97 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -121,6 +121,8 @@ def remove( if self._weights is not None: self._weights = self._weights[mask] self._rgb = self._rgb[mask] + else: + raise ValueError("Must specify either bounds or both point and radius to remove points") def add( self, From 077087ea7164ded3dfa7d8771f096541ca9d2726 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 6 Sep 2024 13:44:56 -0400 Subject: [PATCH 033/410] updates to mapping and server --- src/stretch/mapping/voxel/voxel.py | 3 ++- src/stretch/simulation/mujoco_server.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index c9de234ca..39f77b96a 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -551,7 +551,8 @@ def add( if world_xyz.nelement() > 0: self.voxel_pcd.add(world_xyz, features=feats, rgb=rgb, weights=None) - if self._add_local_radius_points: + if self._add_local_radius_points and len(self.observations) == 0: + # Only do this at the first step, never after it. # TODO: just get this from camera_pose? 
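            # _update_visited marks the configured local radius around this
            # point as explored in the 2D map (the "area around the robot to
            # mark as explored" option in the planner config).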
self._update_visited(camera_pose[:3, 3].to(self.map_2d_device)) if base_pose is not None: diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index a54886ad2..8acfb37b4 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -600,7 +600,7 @@ def main( scene_model = None if use_robocasa: - scene_model, scene_xml = model_generation_wizard( + scene_model, scene_xml, objects_info = model_generation_wizard( task=robocasa_task, style=robocasa_style, layout=robocasa_layout, From 537d496707a6a771398a5cb85dc4f1539dfb6d75 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 6 Sep 2024 14:03:33 -0400 Subject: [PATCH 034/410] update --- src/stretch/agent/robot_agent.py | 11 ++++++----- src/stretch/app/query.py | 17 +++++++++++------ 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 9b08e187b..5d05a56e6 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -324,19 +324,20 @@ def grasp_object(self, object_goal: Optional[str] = None, **kwargs) -> bool: return self.grasp_client.try_grasping(object_goal=object_goal, **kwargs) def rotate_in_place( - self, steps: int = 12, visualize: bool = False, verbose: bool = True + self, steps: Optional[int] = -1, visualize: bool = False, verbose: bool = True ) -> bool: """Simple helper function to make the robot rotate in place. Do a 360 degree turn to get some observations (this helps debug the robot and create a nice map). Args: - steps(int): number of steps to rotate (each step is 360 degrees / steps). Default is 12. + steps(int): number of steps to rotate (each step is 360 degrees / steps). If steps <= 0, use the default number of steps from the parameters. visualize(bool): show the map as we rotate. Default is False. 
Returns: executed(bool): false if we did not actually do any rotations""" logger.info("Rotate in place") - if steps <= 0: - return False + if steps is None or steps <= 0: + # Read the number of steps from the parameters + steps = self.parameters["agent"]["in_place_rotation_steps"] step_size = 2 * np.pi / steps i = 0 @@ -1356,7 +1357,7 @@ def load_map(self, filename: str, color_weight: float = 0.5) -> None: loaded_voxel_map = self._create_voxel_map(self.parameters) loaded_voxel_map.read_from_pickle(filename, perception=self.semantic_sensor) - xyz1, rgb1, _, _ = self.voxel_pcd.get_pointcloud() + xyz1, rgb1, _, _ = self.voxel_map.get_pointcloud() xyz2, rgb2, _, _ = loaded_voxel_map.get_pointcloud() tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) diff --git a/src/stretch/app/query.py b/src/stretch/app/query.py index 1b587e656..ed19623da 100644 --- a/src/stretch/app/query.py +++ b/src/stretch/app/query.py @@ -35,7 +35,7 @@ @click.option("--spin", default=False, is_flag=True) @click.option("--reset", is_flag=True) @click.option( - "--input_file", default="", type=str, help="Path to input file used instead of robot data" + "-i", "--input-path", default="", type=str, help="Path to input file used instead of robot data" ) @click.option( "--write-instance-images", @@ -48,7 +48,6 @@ @click.option("--frame", default=-1, help="Final frame to read from input file") @click.option("--text", default="", help="Text to encode") @click.option("-y", "--yes", is_flag=True, help="Skip confirmation") -@click.option("-l", "--load", type=str, default="", help="Load a saved voxel map") @click.option( "--all-matches", is_flag=True, @@ -61,6 +60,7 @@ is_flag=True, help="Don't move the robot to the instance, if using real robot instead of offline data", ) +@click.option("-o", "--offline", is_flag=True, help="Run code offline on stored data.") def main( device_id: int = 0, verbose: bool = True, @@ -74,13 +74,14 @@ def main( output_filename: str = "stretch_output", spin: bool = False, write_instance_images: bool = False, - input_file: str = "", + input_path: str = "", frame: int = -1, text: str = "", yes: bool = False, stationary: bool = False, all_matches: bool = False, threshold: float = 0.5, + offline: bool = False, ): print("- Load parameters") @@ -91,7 +92,7 @@ def main( verbose=verbose, ) - if len(input_file) == 0 or input_file is None: + if not offline: real_robot = True current_datetime = datetime.datetime.now() formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S") @@ -112,11 +113,15 @@ def main( if spin: # Rotate and capture many frames - agent.rotate_in_place(steps=8, visualize=False) + agent.rotate_in_place(visualize=False) else: # Just capture a single frame agent.update() + # Load the input file + if input_path is not None and len(input_path) > 0: + agent.load_map(input_path) + if explore_iter > 0: raise NotImplementedError("Exploration not implemented yet") @@ -127,7 +132,7 @@ def main( real_robot = False dummy_robot = DummyStretchClient() agent = RobotAgent(dummy_robot, parameters, semantic_sensor) - agent.voxel_map.read_from_pickle(input_file, num_frames=frame) + agent.voxel_map.read_from_pickle(input_path, num_frames=frame) try: if len(text) == 0: From fa8a753f5a4299315cf23b6a06dd04be47ff5b36 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 6 Sep 2024 14:04:26 -0400 Subject: [PATCH 035/410] update voxel point cloud code --- src/stretch/mapping/voxel/voxel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py 
index c9de234ca..49e3c02f6 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -777,6 +777,10 @@ def recompute_map(self): **frame.info, ) + def get_pointcloud(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Get the current point cloud""" + return self.voxel_pcd.get_pointcloud() + def get_2d_map(self, debug: bool = False) -> Tuple[np.ndarray, np.ndarray]: """Get 2d map with explored area and frontiers.""" From d399262486a708abdbdead0f388a49c3745bab32 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 6 Sep 2024 14:11:10 -0400 Subject: [PATCH 036/410] various updates --- src/stretch/mapping/voxel/voxel.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 39f77b96a..b22ef3e2e 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -554,9 +554,12 @@ def add( if self._add_local_radius_points and len(self.observations) == 0: # Only do this at the first step, never after it. # TODO: just get this from camera_pose? - self._update_visited(camera_pose[:3, 3].to(self.map_2d_device)) - if base_pose is not None: - self._update_visited(base_pose.to(self.map_2d_device)) + # Add local radius points to the map around base + if base_pose is not None: + self._update_visited(base_pose.to(self.map_2d_device)) + else: + # Camera only + self._update_visited(camera_pose[:3, 3].to(self.map_2d_device)) # Increment sequence counter self._seq += 1 From 9642b919733c4bb3070841eaaf164c470edaf090 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 6 Sep 2024 14:12:05 -0400 Subject: [PATCH 037/410] update --- src/stretch/agent/robot_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 5d05a56e6..fcd3045e5 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1360,8 +1360,8 @@ def load_map(self, filename: str, color_weight: float = 0.5) -> None: xyz1, rgb1, _, _ = self.voxel_map.get_pointcloud() xyz2, rgb2, _, _ = loaded_voxel_map.get_pointcloud() - tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) breakpoint() + tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) def get_detections(self, **kwargs) -> List[Instance]: """Get the current detections.""" From c060569d368327607708d2b06d1d1c4b086df8c1 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 6 Sep 2024 14:18:09 -0400 Subject: [PATCH 038/410] update point cloud stuff --- src/stretch/agent/robot_agent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index fcd3045e5..135664c81 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1357,11 +1357,11 @@ def load_map(self, filename: str, color_weight: float = 0.5) -> None: loaded_voxel_map = self._create_voxel_map(self.parameters) loaded_voxel_map.read_from_pickle(filename, perception=self.semantic_sensor) - xyz1, rgb1, _, _ = self.voxel_map.get_pointcloud() - xyz2, rgb2, _, _ = loaded_voxel_map.get_pointcloud() + xyz1, _, _, rgb1 = self.voxel_map.get_pointcloud() + xyz2, _, _, rgb2 = loaded_voxel_map.get_pointcloud() - breakpoint() tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) + breakpoint() def get_detections(self, **kwargs) -> List[Instance]: """Get the current detections.""" From 08cfe7bf53f5b70c703acd905fa4bd6c8857f50d Mon Sep 17 00:00:00 2001 
From: Chris Paxton Date: Fri, 6 Sep 2024 14:38:57 -0400 Subject: [PATCH 039/410] Update point cloud stuff --- src/stretch/utils/point_cloud.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/stretch/utils/point_cloud.py b/src/stretch/utils/point_cloud.py index f77155f3d..3e3dda771 100644 --- a/src/stretch/utils/point_cloud.py +++ b/src/stretch/utils/point_cloud.py @@ -11,11 +11,12 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import cv2 import numpy as np import open3d as o3d +import torch import trimesh.transformations as tra @@ -425,7 +426,13 @@ def dropout_random_ellipses(depth_img, dropout_mean, gamma_shape=10000, gamma_sc def find_se3_transform( - cloud1, cloud2, rgb1, rgb2, color_weight=0.5, max_iterations=50, tolerance=1e-5 + cloud1: Union[torch.Tensor, np.ndarray], + cloud2: Union[torch.Tensor, np.ndarray], + rgb1: Union[torch.Tensor, np.ndarray], + rgb2: Union[torch.Tensor, np.ndarray], + color_weight=0.5, + max_iterations=50, + tolerance=1e-5, ): """ Find the SE(3) transformation between two colorized point clouds. @@ -450,6 +457,15 @@ def find_se3_transform( # # R, t = find_se3_transform(cloud1, cloud2, rgb1, rgb2) + if isinstance(cloud1, torch.Tensor): + cloud1 = cloud1.cpu().numpy() + if isinstance(cloud2, torch.Tensor): + cloud2 = cloud2.cpu().numpy + if isinstance(rgb1, torch.Tensor): + rgb1 = rgb1.cpu().numpy() + if isinstance(rgb2, torch.Tensor): + rgb2 = rgb2.cpu().numpy() + def best_fit_transform(A, B): """ Calculates the least-squares best-fit transform between corresponding 3D points A->B From 0f36872436d2024e9722c255a2e39ef5e4e930e2 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 6 Sep 2024 14:58:38 -0400 Subject: [PATCH 040/410] only add explored at beginning --- src/stretch/mapping/voxel/voxel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 694a0b6f2..ee51bdc72 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -551,7 +551,7 @@ def add( if world_xyz.nelement() > 0: self.voxel_pcd.add(world_xyz, features=feats, rgb=rgb, weights=None) - if self._add_local_radius_points and len(self.observations) == 0: + if self._add_local_radius_points and len(self.observations) < 2: # Only do this at the first step, never after it. # TODO: just get this from camera_pose? 
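            # Note: the `< 2` check covers the first two observations, not
            # just the very first one.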
# Add local radius points to the map around base From 7dfeffcfa8b73563116dcb0e29f0bc436bd4fe8f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 6 Sep 2024 15:13:31 -0400 Subject: [PATCH 041/410] updating config --- src/stretch/agent/robot_agent.py | 4 +++- src/stretch/mapping/voxel/voxel.py | 3 ++- src/stretch/utils/voxel.py | 9 ++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 5d05a56e6..3b28358b4 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -441,7 +441,9 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): if self._sweep_head_on_update: self.robot.head_to(0, tilt, blocking=True) x, y, theta = self.robot.get_base_pose() - self.voxel_map.delete_obstacles(radius=0.3, point=np.array([x, y])) + self.voxel_map.delete_obstacles( + radius=0.3, point=np.array([x, y]), min_height=self.voxel_map.obs_min_height + ) if self.use_scene_graph: self._update_scene_graph() diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index ee51bdc72..8c3a5c42c 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -1106,9 +1106,10 @@ def delete_obstacles( bounds: Optional[np.ndarray] = None, point: Optional[np.ndarray] = None, radius: Optional[float] = None, + min_height: Optional[float] = None, ) -> None: """Delete obstacles from the map""" - self.voxel_pcd.remove(bounds, point, radius) + self.voxel_pcd.remove(bounds, point, radius, min_height=min_height) # Force recompute of 2d map self.get_2d_map() diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index 2b3411f97..0aa0818d3 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -86,8 +86,13 @@ def remove( bounds: Optional[np.ndarray] = None, point: Optional[np.ndarray] = None, radius: Optional[float] = None, + min_height: Optional[float] = None, ): """Deletes points within a certain radius of a point, or optionally within certain bounds.""" + + if min_height is None: + min_height = -np.inf + if point is not None and radius is not None: # We will do a radius removal assert bounds is None, "Cannot do both radius and bounds removal" @@ -97,7 +102,9 @@ def remove( dists = torch.norm(self._points[:, :2] - torch.tensor(point[:2]), dim=1) else: dists = torch.norm(self._points - torch.tensor(point), dim=1) - mask = dists > radius + radius_mask = dists > radius + height_ok = self._points[:, 2] < min_height + mask = radius_mask | height_ok self._points = self._points[mask] if self._features is not None: self._features = self._features[mask] From 4ed59062db17071ebd57ee9b03660773af2657aa Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 6 Sep 2024 15:55:51 -0400 Subject: [PATCH 042/410] update point cloud setup --- src/stretch/agent/robot_agent.py | 21 +++++++++++++++++---- src/stretch/utils/point_cloud.py | 2 +- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 135664c81..838169904 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1346,7 +1346,7 @@ def wave(self, **kwargs) -> bool: self.robot.switch_to_navigation_mode() - def load_map(self, filename: str, color_weight: float = 0.5) -> None: + def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False) -> None: """Load a map from a PKL file. 
Creates a new voxel map and loads the data from the file into this map. Then uses RANSAC to figure out where the current map and the loaded map overlap, computes a transform, and applies this transform to the loaded map to align it with the current map. Args: @@ -1357,11 +1357,24 @@ def load_map(self, filename: str, color_weight: float = 0.5) -> None: loaded_voxel_map = self._create_voxel_map(self.parameters) loaded_voxel_map.read_from_pickle(filename, perception=self.semantic_sensor) - xyz1, _, _, rgb1 = self.voxel_map.get_pointcloud() - xyz2, _, _, rgb2 = loaded_voxel_map.get_pointcloud() + xyz1, _, _, rgb1 = loaded_voxel_map.get_pointcloud() + xyz2, _, _, rgb2 = self.voxel_map.get_pointcloud() tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) - breakpoint() + + # Apply the transform to the loaded map + xyz1 = xyz1 @ tform[0].T + tform[1] + + if debug: + # for visualization + from stretch.utils.point_cloud import show_point_cloud + + _xyz = np.concatenate([xyz1.cpu().numpy(), xyz2.cpu().numpy()], axis=0) + _rgb = np.concatenate([rgb1.cpu().numpy(), rgb2.cpu().numpy() * 0.5], axis=0) + show_point_cloud(_xyz, _rgb, orig=np.zeros(3)) + + # Add the loaded map to the current map + self.voxel_map.add_pointcloud(xyz1, rgb1) def get_detections(self, **kwargs) -> List[Instance]: """Get the current detections.""" diff --git a/src/stretch/utils/point_cloud.py b/src/stretch/utils/point_cloud.py index 3e3dda771..713ec9bef 100644 --- a/src/stretch/utils/point_cloud.py +++ b/src/stretch/utils/point_cloud.py @@ -460,7 +460,7 @@ def find_se3_transform( if isinstance(cloud1, torch.Tensor): cloud1 = cloud1.cpu().numpy() if isinstance(cloud2, torch.Tensor): - cloud2 = cloud2.cpu().numpy + cloud2 = cloud2.cpu().numpy() if isinstance(rgb1, torch.Tensor): rgb1 = rgb1.cpu().numpy() if isinstance(rgb2, torch.Tensor): From e84cabfec99229196b98a26b95d0b811a8e604cd Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Fri, 6 Sep 2024 19:34:05 -0400 Subject: [PATCH 043/410] llm codes --- src/stretch/config/dynav_config.yaml | 2 +- src/stretch/dynav/llm_localizer.py | 257 ++++++++++ src/stretch/dynav/llm_server.py | 483 +++++++++++++++++++ src/stretch/dynav/mapping_utils/a_star.py | 12 +- src/stretch/dynav/mapping_utils/voxel.py | 1 - src/stretch/dynav/mapping_utils/voxel_map.py | 6 +- src/stretch/dynav/ok_robot_hw/robot.py | 6 + src/stretch/dynav/robot_agent_manip_mdp.py | 23 +- src/stretch/dynav/run_ok_nav.py | 20 +- src/stretch/dynav/voxel_map_localizer.py | 6 - src/stretch/dynav/voxel_map_server.py | 40 +- 11 files changed, 799 insertions(+), 57 deletions(-) create mode 100644 src/stretch/dynav/llm_localizer.py create mode 100644 src/stretch/dynav/llm_server.py diff --git a/src/stretch/config/dynav_config.yaml b/src/stretch/config/dynav_config.yaml index 57dd6a808..6a474c65f 100644 --- a/src/stretch/config/dynav_config.yaml +++ b/src/stretch/config/dynav_config.yaml @@ -10,7 +10,7 @@ voxel_size: 0.1 obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.5 # Ignore things over this height (eg ceilings) obs_min_density: 1 # This many points makes it an obstacle -exp_min_density: 1 +exp_min_density: 0 # Padding pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py new file mode 100644 index 000000000..bf48198f9 --- /dev/null +++ b/src/stretch/dynav/llm_localizer.py @@ -0,0 +1,257 @@ +from io import BytesIO +from concurrent.futures import ThreadPoolExecutor, as_completed 
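+# LLM_Localizer (defined below) keeps a VoxelizedPointcloud of past RGB-D
+# observations, asks a multimodal LLM (GPT-4o or Gemini 1.5) which stored
+# frames most clearly show a queried object, then runs OWLv2 open-vocabulary
+# detection on those frames and back-projects the best box through the stored
+# depth and camera pose to obtain a 3D target point.
+#
+# Rough usage sketch (illustrative only; the query string is an assumption):
+#   localizer = LLM_Localizer(voxel_map, exist_model='gpt-4o', loc_model='owlv2')
+#   point, debug_text, obs_id, xyz = localizer.localize_A('mug', return_debug=True)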
+import time +import os + +import torch +import numpy as np +from PIL import Image + +from typing import Optional +from torch import Tensor + +from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates +from stretch.dynav.mapping_utils import VoxelizedPointcloud + +from transformers import AutoProcessor +from transformers import Owlv2ForObjectDetection + +import google.generativeai as genai +from openai import OpenAI +import base64 +genai.configure(api_key=os.getenv('GOOGLE_API_KEY')) +generation_config = genai.GenerationConfig(temperature=0) +safety_settings = [ + { + "category": "HARM_CATEGORY_DANGEROUS", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "threshold": "BLOCK_NONE", + }, +] +class LLM_Localizer(): + def __init__(self, voxel_map_wrapper = None, exist_model = 'gpt-4o', loc_model = 'owlv2', device = 'cuda'): + self.voxel_map_wrapper = voxel_map_wrapper + self.device = device + self.voxel_pcd = VoxelizedPointcloud(voxel_size=0.2).to(self.device) + # self.exist_model = YOLOWorld("yolov8l-worldv2.pt") + self.existence_checking_model = exist_model + if exist_model == 'gpt-4o': + print('WE ARE USING OPENAI GPT4o') + self.gpt_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY')) + elif exist_model == 'gemini-1.5-pro': + print('WE ARE USING GEMINI 1.5 PRO') + + elif exist_model == 'gemini-1.5-flash': + print('WE ARE USING GEMINI 1.5 FLASH') + else: + print('YOU ARE USING NOTHING!') + self.location_checking_model = loc_model + if loc_model == 'owlv2': + self.exist_processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") + self.exist_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(self.device) + print('WE ARE USING OWLV2 FOR LOCALIZATION!') + else: + print('YOU ARE USING VOXEL MAP FOR LOCALIZATION!') + + def add(self, + points: Tensor, + features: Optional[Tensor], + rgb: Optional[Tensor], + weights: Optional[Tensor] = None, + obs_count: Optional[Tensor] = None, + ): + points = points.to(self.device) + if features is not None: + features = features.to(self.device) + if rgb is not None: + rgb = rgb.to(self.device) + if weights is not None: + weights = weights.to(self.device) + self.voxel_pcd.add(points = points, + features = features, + rgb = rgb, + weights = weights, + obs_count = obs_count) + + def owl_locater(self, A, encoded_image, timestamps_lst): + for i in timestamps_lst: + image_info = encoded_image[i][-1] + image = image_info['image'] + box = None + + inputs = self.exist_processor(text=A, images=image, return_tensors="pt").to(self.device) + + with torch.no_grad(): + outputs = self.exist_model(**inputs) + + target_sizes = torch.tensor([image.size[::-1]]) + results = self.exist_processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.2)[0] + + if len(results["scores"]) > 0: + cur_score = torch.max(results["scores"]).item() + max_score_index = torch.argmax(results["scores"]) + box = results["boxes"][max_score_index].tolist() + if box is not None: + xmin, ymin, xmax, ymax = map(int, box) + mask = np.zeros(image_info['depth'].shape, dtype=np.uint8) + mask[ymin:ymax, xmin:xmax] = 255 + xyz = image_info['xyz'] + masked_xyz = xyz[mask.flatten() > 0] + centroid = 
np.stack([torch.mean(masked_xyz[:, 0]), torch.mean(masked_xyz[:, 1]), torch.mean(masked_xyz[:, 2])]).T + debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' + return centroid, debug_text, i, masked_xyz + debug_text = '#### - All instances are not the target! Maybe target object has not been observed yet. **😭**\n' + return None, debug_text, None, None + + def gpt_chunk(self, chunk, sys_prompt, user_prompt): + for i in range(10): + try: + response = self.gpt_client.chat.completions.create( + model=self.existence_checking_model, + messages=[ + {"role": "system", "content": sys_prompt}, + {"role": "user", "content": user_prompt}, + {"role": "user", "content": chunk} + ], + temperature=0.0, + ) + timestamps = response.choices[0].message.content + if 'None' in timestamps: + return None + else: + return list(map(int, timestamps.replace(' ', '').split(':')[1].split(','))) + except Exception as e: + print(f"Error: {e}") + time.sleep(15) + return "Execution Failed" + + def gemini_chunk(self, chunk, sys_prompt, user_prompt): + if self.existence_checking_model == 'gemini-1.5-pro': + model_name="models/gemini-1.5-pro-exp-0827" + elif self.existence_checking_model == 'gemini-1.5-flash': + model_name="models/gemini-1.5-flash-exp-0827" + + for i in range(3): + try: + model = genai.GenerativeModel(model_name=model_name, system_instruction=sys_prompt) + timestamps = model.generate_content(chunk + [user_prompt], generation_config=generation_config, safety_settings=safety_settings).text + timestamps = timestamps.split('\n')[0] + if 'None' in timestamps: + return None + else: + return list(map(int, timestamps.replace(' ', '').split(':')[1].split(','))) + except Exception as e: + print(f"Error: {e}") + time.sleep(10) + return "Execution Failed" + + + + def llm_locator(self, A, encoded_image, process_chunk, context_length = 30): + timestamps_lst = [] + + sys_prompt = f""" + For each object query provided, list at most 10 timestamps that the object is most clearly shown. If the object does not appear, simply output the object name and the word "None" for the timestamp. Avoid any unnecessary explanations or additional formatting. 
+ + Example: + Input: + cat + + Output: + cat: 1,4,6,9 + + Input: + car + + Output: + car: None + """ + + user_prompt = f"""The object you need to find is {A}""" + if 'gpt' in self.existence_checking_model: + content = [item for sublist in list(encoded_image.values()) for item in sublist[:2]][-100:] # adjust to [-60:] for taking only the last 30 and have faster speed + elif 'gemini' in self.existence_checking_model: + content = [item for sublist in list(encoded_image.values()) for item in sublist[0]][-100:] + content_chunks = [content[i:i + 2 * context_length] for i in range(0, len(content), 2 * context_length)] + + with ThreadPoolExecutor(max_workers=2) as executor: + future_to_chunk = {executor.submit(process_chunk, chunk, sys_prompt, user_prompt): chunk for chunk in content_chunks} + + for future in as_completed(future_to_chunk): + chunk = future_to_chunk[future] + try: + result = future.result() + if result: + timestamps_lst.extend(result) + except Exception as e: + print(f"Exception occurred: {e}") + timestamps_lst = sorted(timestamps_lst, reverse=True) + # print(A, timestamps_lst) + return self.owl_locater(A, encoded_image, timestamps_lst) + + def localize_A(self, A, debug = True, return_debug = False, count_threshold = 5): + encoded_image = {} + + counts = torch.bincount(self.voxel_map_wrapper.voxel_pcd._obs_counts) + cur_obs = max(self.voxel_map_wrapper.voxel_pcd._obs_counts) + filtered_obs = (counts > count_threshold).nonzero(as_tuple=True)[0].tolist() + # filtered_obs = sorted(set(filtered_obs + [i for i in range(cur_obs-10, cur_obs+1)])) + filtered_obs = sorted(filtered_obs) + + # filtered_obs = (counts <= count_threshold).nonzero(as_tuple=True)[0].tolist() + # filtered_obs = [obs for obs in filtered_obs if (cur_obs - obs) >= 10] + + if 'gemini' in self.existence_checking_model: + process_chunk = self.gemini_chunk + context_length = 100 + elif 'gpt' in self.existence_checking_model: + process_chunk = self.gpt_chunk + context_length = 30 + + for obs_id in filtered_obs: + rgb = self.voxel_map_wrapper.observations[obs_id - 1].rgb.numpy() + depth = self.voxel_map_wrapper.observations[obs_id - 1].depth + camera_pose = self.voxel_map_wrapper.observations[obs_id - 1].camera_pose + camera_K = self.voxel_map_wrapper.observations[obs_id - 1].camera_K + + full_world_xyz = unproject_masked_depth_to_xyz_coordinates( # Batchable! 
+ depth=depth.unsqueeze(0).unsqueeze(1), + pose=camera_pose.unsqueeze(0), + inv_intrinsics=torch.linalg.inv(camera_K[:3, :3]).unsqueeze(0), + ) + depth = depth.numpy() + rgb[depth > 2.5] = [0, 0, 0] + image = Image.fromarray(rgb.astype(np.uint8), mode='RGB') + if 'gemini' in self.existence_checking_model: + encoded_image[obs_id] = [[f"Following is the image took on timestep {obs_id}: ", image], {'image':image, 'xyz':full_world_xyz, 'depth':depth}] + elif 'gpt' in self.existence_checking_model: + buffered = BytesIO() + image.save(buffered, format="PNG") + img_bytes = buffered.getvalue() + base64_encoded = base64.b64encode(img_bytes).decode('utf-8') + encoded_image[obs_id] = [{"type": "text", "text": f"Following is the image took on timestep {obs_id}"}, + {"type": "image_url", "image_url": { + "url": f"data:image/png;base64,{base64_encoded}"} + }, {'image':image, 'xyz':full_world_xyz, 'depth':depth}] + target_point, debug_text, obs, point = self.llm_locator(A, encoded_image, process_chunk, context_length) + if not debug: + return target_point + elif not return_debug: + return target_point, debug_text + else: + return target_point, debug_text, obs, point \ No newline at end of file diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py new file mode 100644 index 000000000..31bced34d --- /dev/null +++ b/src/stretch/dynav/llm_server.py @@ -0,0 +1,483 @@ +import zmq + +from stretch.dynav.scannet import CLASS_LABELS_200 + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +import torchvision.transforms.functional as V + +import clip +from torchvision import transforms + +import os +# import wget +import time + +import open3d as o3d + +from matplotlib import pyplot as plt +import pickle +from pathlib import Path +from stretch.dynav.llm_localizer import LLM_Localizer +from stretch.core import get_parameters +from stretch.dynav.mapping_utils import ( + SparseVoxelMap, + SparseVoxelMapNavigationSpace, + AStar, + VoxelizedPointcloud +) + +import datetime + +import threading +import scipy +import hydra + +from transformers import AutoProcessor, AutoModel +import rerun as rr + +from io import BytesIO +from PIL import Image + +from stretch.dynav.communication_util import load_socket, send_array, recv_array, send_rgb_img, recv_rgb_img, send_depth_img, recv_depth_img, send_everything, recv_everything + +def get_inv_intrinsics(intrinsics): + # return intrinsics.double().inverse().to(intrinsics) + fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + inv_intrinsics = torch.zeros_like(intrinsics) + inv_intrinsics[..., 0, 0] = 1.0 / fx + inv_intrinsics[..., 1, 1] = 1.0 / fy + inv_intrinsics[..., 0, 2] = -ppx / fx + inv_intrinsics[..., 1, 2] = -ppy / fy + inv_intrinsics[..., 2, 2] = 1.0 + return inv_intrinsics + +def get_xyz(depth, pose, intrinsics): + """Returns the XYZ coordinates for a set of points. 
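+    Each depth pixel is back-projected through the inverse camera intrinsics,
+    scaled by its depth, and transformed into the world frame by the pose.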
+ + Args: + depth: The depth array, with shape (B, 1, H, W) + pose: The pose array, with shape (B, 4, 4) + intrinsics: The intrinsics array, with shape (B, 3, 3) + + Returns: + The XYZ coordinates of the projected points, with shape (B, H, W, 3) + """ + if not isinstance(depth, torch.Tensor): + depth = torch.from_numpy(depth) + if not isinstance(pose, torch.Tensor): + pose = torch.from_numpy(pose) + if not isinstance(intrinsics, torch.Tensor): + intrinsics = torch.from_numpy(intrinsics) + while depth.ndim < 4: + depth = depth.unsqueeze(0) + while pose.ndim < 3: + pose = pose.unsqueeze(0) + while intrinsics.ndim < 3: + intrinsics = intrinsics.unsqueeze(0) + (bsz, _, height, width), device, dtype = depth.shape, depth.device, intrinsics.dtype + + # Gets the pixel grid. + xs, ys = torch.meshgrid( + torch.arange(0, width, device=device, dtype=dtype), + torch.arange(0, height, device=device, dtype=dtype), + indexing="xy", + ) + xy = torch.stack([xs, ys], dim=-1).flatten(0, 1).unsqueeze(0).repeat_interleave(bsz, 0) + xyz = torch.cat((xy, torch.ones_like(xy[..., :1])), dim=-1) + + # Applies intrinsics and extrinsics. + # xyz = xyz @ intrinsics.inverse().transpose(-1, -2) + xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) + xyz = xyz * depth.flatten(1).unsqueeze(-1) + xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] + + xyz = xyz.unflatten(1, (height, width)) + + return xyz + +class ImageProcessor: + def __init__(self, + vision_method = 'flash_owl', + siglip = True, + device = 'cuda', + min_depth = 0.25, + max_depth = 2.5, + img_port = 5555, + text_port = 5556, + open_communication = True, + rerun = True, + static = True + ): + self.static = static + self.siglip = siglip + current_datetime = datetime.datetime.now() + self.log = 'debug_' + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + self.rerun = rerun + if self.rerun: + rr.init(self.log, spawn = True) + # if self.static: + # rr.init(self.log, spawn = False) + # rr.connect('100.108.67.79:9876') + # else: + # rr.init(self.log, spawn = True) + self.min_depth = min_depth + self.max_depth = max_depth + self.obs_count = 0 + # There are three vision methods: + # 1. 'mask*lip' Following the idea of https://arxiv.org/abs/2112.01071, remove the last layer of any VLM and obtain the dense features + # 2. 
'mask&*lip' Following the idea of https://mahis.life/clip-fields/, extract segmentation mask and assign a vision-language feature to it + self.vision_method = vision_method + # If cuda is not available, then device will be forced to be cpu + if not torch.cuda.is_available(): + device = 'cpu' + self.device = device + + self.create_obstacle_map() + self.create_vision_model() + + self.voxel_map_lock = threading.Lock() # Create a lock for synchronizing access to `self.voxel_map_localizer` + + if open_communication: + self.img_socket = load_socket(img_port) + self.text_socket = load_socket(text_port) + + self.img_thread = threading.Thread(target=self._recv_image) + self.img_thread.daemon = True + self.img_thread.start() + + def create_obstacle_map(self): + print("- Load parameters") + parameters = get_parameters("dynav_config.yaml") + self.default_expand_frontier_size = parameters["default_expand_frontier_size"] + self.voxel_map = SparseVoxelMap( + resolution=parameters["voxel_size"], + local_radius=parameters["local_radius"], + obs_min_height=parameters["obs_min_height"], + obs_max_height=parameters["obs_max_height"], + obs_min_density = parameters["obs_min_density"], + exp_min_density = parameters["exp_min_density"], + min_depth=self.min_depth, + max_depth=self.max_depth, + pad_obstacles=parameters["pad_obstacles"], + add_local_radius_points=parameters.get( + "add_local_radius_points", default=True + ), + remove_visited_from_obstacles=parameters.get( + "remove_visited_from_obstacles", default=False + ), + smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), + use_median_filter=parameters.get("filters/use_median_filter", False), + median_filter_size=parameters.get("filters/median_filter_size", 5), + median_filter_max_error=parameters.get( + "filters/median_filter_max_error", 0.01 + ), + use_derivative_filter=parameters.get( + "filters/use_derivative_filter", False + ), + derivative_filter_threshold=parameters.get( + "filters/derivative_filter_threshold", 0.5 + ) + ) + self.space = SparseVoxelMapNavigationSpace( + self.voxel_map, + # step_size=parameters["step_size"], + rotation_step_size=parameters["rotation_step_size"], + dilate_frontier_size=parameters[ + "dilate_frontier_size" + ], # 0.6 meters back from every edge = 12 * 0.02 = 0.24 + dilate_obstacle_size=parameters["dilate_obstacle_size"], + ) + + # Create a simple motion planner + self.planner = AStar(self.space) + + def create_vision_model(self): + if self.vision_method == 'gpt_owl': + self.voxel_map_localizer = LLM_Localizer(self.voxel_map, exist_model = 'gpt-4o', loc_model = 'owlv2', device = self.device) + elif self.vision_method == 'flash_owl': + self.voxel_map_localizer = LLM_Localizer(self.voxel_map, exist_model = 'gemini-1.5-flash', loc_model = 'owlv2', device = self.device) + elif self.vision_method == 'pro_owl': + self.voxel_map_localizer = LLM_Localizer(self.voxel_map, exist_model = 'gemini-1.5-pro', loc_model = 'owlv2', device = self.device) + + def process_text(self, text, start_pose): + if self.rerun: + rr.log('/object', rr.Clear(recursive = True), static = self.static) + rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) + rr.log('/direction', rr.Clear(recursive = True), static = self.static) + rr.log('robot_monologue', rr.Clear(recursive = True), static = self.static) + rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) + if not self.static: + rr.connect('100.108.67.79:9876') + + start_time = time.time() + + debug_text = '' + mode = 
'navigation' + obs = None + # Do visual grounding + if text is not None and text != '': + with self.voxel_map_lock: + localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) + if localized_point is not None: + rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) + # Do Frontier based exploration + if text is None or text == '' or localized_point is None: + debug_text += '## Navigation fails, so robot starts exploring environments.\n' + localized_point = self.sample_frontier(start_pose, text) + mode = 'exploration' + rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) + print('\n', localized_point, '\n') + + if localized_point is None: + return [] + + breakpoint1 = time.time() + print('Determining localized point', breakpoint1 - start_time) + + if len(localized_point) == 2: + localized_point = np.array([localized_point[0], localized_point[1], 0]) + + point = self.sample_navigation(start_pose, localized_point) + + breakpoint2 = time.time() + print('Determining target point', breakpoint2 - start_time) + + if self.rerun: + buf = BytesIO() + plt.savefig(buf, format='png') + buf.seek(0) + img = Image.open(buf) + img = np.array(img) + buf.close() + rr.log('2d_map', rr.Image(img), static = self.static) + else: + if text != '': + plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') + else: + plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') + plt.clf() + + if self.rerun: + if text is not None and text != '': + debug_text = '### The goal is to navigate to ' + text + '.\n' + debug_text + else: + debug_text = '### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n' + debug_text = '# Robot\'s monologue: \n' + debug_text + rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) + + if obs is not None and mode == 'navigation': + rgb = self.voxel_map.observations[obs - 1].rgb + if not self.rerun: + if isinstance(rgb, torch.Tensor): + rgb = np.array(rgb) + cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) + else: + rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) + traj = [] + waypoints = None + + breakpoint3 = time.time() + print('Start planning', breakpoint3 - start_time) + + if point is None: + print('Unable to find any target point, some exception might happen') + else: + print('Target point is', point) + res = self.planner.plan(start_pose, point) + if res.success: + waypoints = [pt.state for pt in res.trajectory] + # If we are navigating to some object of interst, send (x, y, z) of + # the object so that we can make sure the robot looks at the object after navigation + finished = len(waypoints) <= 5 and mode == 'navigation' + if not finished: + waypoints = waypoints[:8] + traj = self.planner.clean_path_for_xy(waypoints) + # traj = traj[1:] + if finished: + traj.append([np.nan, np.nan, np.nan]) + if isinstance(localized_point, torch.Tensor): + localized_point = localized_point.tolist() + traj.append(localized_point) + print('Planned trajectory:', traj) + else: + print('[FAILURE]', res.reason) + + breakpoint4 = time.time() + print('Planning stops', breakpoint4 - start_time) + + if traj is not None: + origins = [] + vectors = [] + for idx in range(len(traj)): + 
if idx != len(traj) - 1: + origins.append([traj[idx][0], traj[idx][1], 1.5]) + vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) + rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) + rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) + + breakpoint5 = time.time() + print('Logging', breakpoint5 - start_time) + + # self.write_to_pickle() + + # breakpoint6 = time.time() + # print('Writing to pickle', breakpoint6 - start_time) + + return traj + + def sample_navigation(self, start, point): + plt.clf() + obstacles, _ = self.voxel_map.get_2d_map() + plt.imshow(obstacles) + if point is None: + start_pt = self.planner.to_pt(start) + plt.scatter(start_pt[1], start_pt[0], s = 10) + return None + goal = self.space.sample_target_point(start, point, self.planner) + print("point:", point, "goal:", goal) + obstacles, explored = self.voxel_map.get_2d_map() + start_pt = self.planner.to_pt(start) + plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') + point_pt = self.planner.to_pt(point) + plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'g') + if goal is not None: + goal_pt = self.planner.to_pt(goal) + plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'r') + return goal + + def sample_frontier(self, start_pose = [0, 0, 0], text = None): + with self.voxel_map_lock: + if text is not None and text != '': + index, time_heuristics, alignments_heuristics, total_heuristics = self.space.sample_exploration(start_pose, self.planner, None, None, debug = False) + else: + index, time_heuristics, _, total_heuristics = self.space.sample_exploration(start_pose, self.planner, None, None, debug = False) + alignments_heuristics = time_heuristics + + obstacles, explored = self.voxel_map.get_2d_map() + plt.clf() + plt.imshow(obstacles * 0.5 + explored * 0.5) + plt.scatter(index[1], index[0], s = 20, c = 'r') + return self.voxel_map.grid_coords_to_xyt(torch.tensor([index[0], index[1]])) + + def _recv_image(self): + while True: + # data = recv_array(self.img_socket) + rgb, depth, intrinsics, pose = recv_everything(self.img_socket) + start_time = time.time() + self.process_rgbd_images(rgb, depth, intrinsics, pose) + process_time = time.time() - start_time + print('Image processing takes', process_time, 'seconds') + + def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights = None, threshold = 0.95): + # Adding all points to voxelizedPointCloud is useless and expensive, we should exclude threshold of all points + selected_indices = torch.randperm(len(valid_xyz))[:int((1 - threshold) * len(valid_xyz))] + if len(selected_indices) == 0: + return + if valid_xyz is not None: + valid_xyz = valid_xyz[selected_indices] + if feature is not None: + feature = feature[selected_indices] + if valid_rgb is not None: + valid_rgb = valid_rgb[selected_indices] + if weights is not None: + weights = weights[selected_indices] + with self.voxel_map_lock: + self.voxel_map_localizer.add(points = valid_xyz, + features = feature, + rgb = valid_rgb, + weights = weights, + obs_count = self.obs_count) + + def process_rgbd_images(self, rgb, depth, intrinsics, pose): + if not os.path.exists(self.log): + os.mkdir(self.log) + self.obs_count += 1 + world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) + + cv2.imwrite('debug/rgb' + str(self.obs_count) + '.jpg', rgb[:, :, [2, 1, 0]]) + + rgb, depth = torch.from_numpy(rgb), 
torch.from_numpy(depth) + rgb = rgb.permute(2, 0, 1).to(torch.uint8) + + median_depth = torch.from_numpy( + scipy.ndimage.median_filter(depth, size=5) + ) + median_filter_error = (depth - median_depth).abs() + valid_depth = torch.logical_and(depth < self.max_depth, depth > self.min_depth) + valid_depth = ( + valid_depth + & (median_filter_error < 0.01).bool() + ) + + with self.voxel_map_lock: + self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) + self.voxel_map.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) + + if '_owl' in self.vision_method: + self.run_llm_owl(rgb, ~valid_depth, world_xyz) + + self.voxel_map.add( + camera_pose = torch.Tensor(pose), + rgb = torch.Tensor(rgb).permute(1, 2, 0), + depth = torch.Tensor(depth), + camera_K = torch.Tensor(intrinsics) + ) + obs, exp = self.voxel_map.get_2d_map() + if self.rerun: + if self.voxel_map.voxel_pcd._points is not None: + rr.log("Obstalce_map/pointcloud", rr.Points3D(self.voxel_map.voxel_pcd._points.detach().cpu(), \ + colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + if self.voxel_map_localizer.voxel_pcd._points is not None: + rr.log("Semantic_memory/pointcloud", rr.Points3D(self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), \ + colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + else: + cv2.imwrite(self.log + '/debug_' + str(self.obs_count) + '.jpg', np.asarray(obs.int() * 127 + exp.int() * 127)) + + def run_llm_owl(self, rgb, mask, world_xyz): + valid_xyz = world_xyz[~mask] + valid_rgb = rgb.permute(1, 2, 0)[~mask] + if len(valid_xyz) != 0: + self.add_to_voxel_pcd(valid_xyz, None, valid_rgb) + + def write_to_pickle(self): + """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" + if not os.path.exists('debug'): + os.mkdir('debug') + filename = 'debug/' + self.log + '.pkl' + data = {} + data["camera_poses"] = [] + data["camera_K"] = [] + data["base_poses"] = [] + data["xyz"] = [] + data["world_xyz"] = [] + data["rgb"] = [] + data["depth"] = [] + data["feats"] = [] + for frame in self.voxel_map.observations: + # add it to pickle + # TODO: switch to using just Obs struct? 
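# Standalone sketch of the depth cleanup used in process_rgbd_images above: a pixel
# is kept only if its depth lies inside (min_depth, max_depth) and agrees with a
# 5x5 median-filtered copy to within 1 cm. The array values are made up for the demo.
import numpy as np
import scipy.ndimage
import torch

depth = np.full((8, 8), 1.0, dtype=np.float32)
depth[4, 4] = 3.0                              # a flying-pixel style outlier
depth = torch.from_numpy(depth)

median_depth = torch.from_numpy(scipy.ndimage.median_filter(depth.numpy(), size=5))
median_filter_error = (depth - median_depth).abs()

min_depth, max_depth = 0.25, 2.5
valid_depth = (depth < max_depth) & (depth > min_depth) & (median_filter_error < 0.01)
print(valid_depth[4, 4].item())                # False: the outlier is masked out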
+ data["camera_poses"].append(frame.camera_pose) + data["base_poses"].append(frame.base_pose) + data["camera_K"].append(frame.camera_K) + data["xyz"].append(frame.xyz) + data["world_xyz"].append(frame.full_world_xyz) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + data["feats"].append(frame.feats) + for k, v in frame.info.items(): + if k not in data: + data[k] = [] + data[k].append(v) + ( + data["combined_xyz"], + data["combined_feats"], + data["combined_weights"], + data["combined_rgb"], + ) = self.voxel_map_localizer.voxel_pcd.get_pointcloud() + data["obs_id"] = self.voxel_map_localizer.voxel_pcd._obs_counts + data["entity_id"] = self.voxel_map_localizer.voxel_pcd._entity_ids + with open(filename, "wb") as f: + pickle.dump(data, f) diff --git a/src/stretch/dynav/mapping_utils/a_star.py b/src/stretch/dynav/mapping_utils/a_star.py index 853401b53..ee70bd817 100644 --- a/src/stretch/dynav/mapping_utils/a_star.py +++ b/src/stretch/dynav/mapping_utils/a_star.py @@ -50,10 +50,10 @@ def compute_theta(self, cur_x, cur_y, end_x, end_y): return theta def reset(self): - print('loading the up to date navigable map') + # print('loading the up to date navigable map') print('Wait') obs, exp = self.space.voxel_map.get_2d_map() - print('up to date navigable map loaded') + # print('up to date navigable map loaded') self._navigable = ~obs & exp self.start_time = time.time() @@ -238,7 +238,7 @@ def run_astar( # start_pt = self.get_unoccupied_neighbor(start_pt, end_pt) start_pt = self.get_unoccupied_neighbor(start_pt) end_pt = self.get_unoccupied_neighbor(end_pt, start_pt) - print('A* formally starts ', time.time() - self.start_time, ' seconds after path planning starts') + # print('A* formally starts ', time.time() - self.start_time, ' seconds after path planning starts') if start_pt is None or end_pt is None: return None @@ -299,9 +299,9 @@ def plan(self, start, goal, verbose: bool = True) -> PlanResult: print("[Planner] invalid goal") return PlanResult(False, reason = "[Planner] invalid goal") # Add start to the tree - print('Start running A* ', time.time() - self.start_time, ' seconds after path planning starts') + # print('Start running A* ', time.time() - self.start_time, ' seconds after path planning starts') waypoints = self.run_astar(start[:2], goal[:2]) - print('Finish running A* ', time.time() - self.start_time, ' seconds after path planning starts') + # print('Finish running A* ', time.time() - self.start_time, ' seconds after path planning starts') if waypoints is None: if verbose: @@ -312,5 +312,5 @@ def plan(self, start, goal, verbose: bool = True) -> PlanResult: theta = self.compute_theta(waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1]) trajectory.append(Node([waypoints[i][0], waypoints[i][1], float(theta)])) trajectory.append(Node([waypoints[-1][0], waypoints[-1][1], goal[-1]])) - print('Finish computing theta ', time.time() - self.start_time, ' seconds after path planning starts') + # print('Finish computing theta ', time.time() - self.start_time, ' seconds after path planning starts') return PlanResult(True, trajectory = trajectory) \ No newline at end of file diff --git a/src/stretch/dynav/mapping_utils/voxel.py b/src/stretch/dynav/mapping_utils/voxel.py index b06733eb1..91771d5f8 100644 --- a/src/stretch/dynav/mapping_utils/voxel.py +++ b/src/stretch/dynav/mapping_utils/voxel.py @@ -717,7 +717,6 @@ def get_2d_map( # ), # self.smooth_kernel, # )[0, 0].bool() - if debug: import matplotlib.pyplot as plt diff --git 
a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py index 2f457a5a1..21798297e 100644 --- a/src/stretch/dynav/mapping_utils/voxel_map.py +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -388,8 +388,8 @@ def sample_target_point( if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) < 0.4: continue elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: - print('OBSTACLE AVOIDANCE') - print(selected_target[0].int(), selected_target[1].int()) + # print('OBSTACLE AVOIDANCE') + # print(selected_target[0].int(), selected_target[1].int()) i = (point[0] - selected_target[0]) // abs(point[0] - selected_target[0]) j = (point[1] - selected_target[1]) // abs(point[1] - selected_target[1]) index_i = int(selected_target[0].int() + i) @@ -678,7 +678,7 @@ def _time_heuristic(self, history_soft, outside_frontier, time_smooth = 0.1, tim time_heuristics = 1 / (1 + np.exp(-time_smooth * (time_heuristics - time_threshold))) index = np.unravel_index(np.argmax(time_heuristics), history_soft.shape) # return index - debug = True + # debug = True if debug: # plt.clf() plt.title('time') diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index 6f588194a..13ca87be8 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -200,14 +200,20 @@ def move_to_joints(self, joints, gripper, mode=0): joints['joint_wrist_pitch'], joints['joint_wrist_roll']] + # print('pan tilt before', self.robot.get_pan_tilt()) + # Moving only the lift first if mode == 1: target1 = state target1[1] = target_state[1] self.robot.arm_to(target1, blocking = True, head = np.array([self.pan, self.tilt])) + # print('pan tilt after', self.robot.get_pan_tilt()) # print('pan tilt before', self.robot.get_pan_tilt()) + self.robot.arm_to(target_state, blocking = True, head = np.array([self.pan, self.tilt])) + self.robot.head_to(head_tilt = self.tilt, head_pan = self.pan, blocking = True) + # print('pan tilt after', self.robot.get_pan_tilt()) # print(f"current state {self.robot.get_six_joints()}") # print(f"target state {target_state}") diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 562a49663..a9fb8e413 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -25,7 +25,8 @@ from stretch.dynav.ok_robot_hw.camera import RealSenseCamera from stretch.dynav.ok_robot_hw.utils.grasper_utils import pickup, move_to_point, capture_and_process_image from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array -from stretch.dynav.voxel_map_server import ImageProcessor +# from stretch.dynav.voxel_map_server import ImageProcessor +from stretch.dynav.llm_server import ImageProcessor import cv2 @@ -112,10 +113,10 @@ def update(self): self.obs_history.append(obs) self.obs_count += 1 rgb, depth, K, camera_pose = obs.rgb, obs.depth, obs.camera_K, obs.camera_pose - start_time = time.time() + # start_time = time.time() self.image_processor.process_rgbd_images(rgb, depth, K, camera_pose) - end_time = time.time() - print('Image processing takes', end_time - start_time, 'seconds.') + # end_time = time.time() + # print('Image processing takes', end_time - start_time, 'seconds.') def execute_action( self, @@ -123,22 +124,22 @@ def execute_action( ): start_time = time.time() - # self.robot.look_front() - self.look_around() + self.robot.look_front() + # self.look_around() # 
self.robot.look_front() self.robot.switch_to_navigation_mode() start = self.robot.get_base_pose() - print(" Start:", start) + # print(" Start:", start) # res = self.image_sender.query_text(text, start) res = self.image_processor.process_text(text, start) look_around_finish = time.time() look_around_take = look_around_finish - start_time - print('Looking around takes ', look_around_take, ' seconds.') - self.look_around_times.append(look_around_take) - print(self.look_around_times) - print(sum(self.look_around_times) / len(self.look_around_times)) + print('Path planning takes ', look_around_take, ' seconds.') + # self.look_around_times.append(look_around_take) + # print(self.look_around_times) + # print(sum(self.look_around_times) / len(self.look_around_times)) if len(res) > 0: print("Plan successful!") diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 3bee44f06..41f870e30 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -16,6 +16,8 @@ from stretch.agent import RobotClient import cv2 +import time +import threading def compute_tilt(camera_xyz, target_xyz): ''' @@ -75,17 +77,21 @@ def main( else: demo.image_processor.read_from_pickle(input_path) - # def keep_looking_around(): - # while True: - # demo.look_around() + def keep_looking_around(): + while True: + time.sleep(0.5) + if robot.get_six_joints()[2] > 0.7 or not robot.in_navigation_mode(): + continue + demo.update() - # img_thread = threading.Thread(target=keep_looking_around) - # img_thread.daemon = True - # img_thread.start() + img_thread = threading.Thread(target=keep_looking_around) + img_thread.daemon = True + img_thread.start() while True: mode = input('select mode? E/N/S') if mode == 'S': + demo.image_processor.write_to_pickle() break if mode == 'E': robot.switch_to_navigation_mode() @@ -93,7 +99,7 @@ def main( print('\n', 'Exploration epoch ', epoch, '\n') if not demo.run_exploration(): print('Exploration failed! 
Quitting!') - break + continue else: robot.move_to_nav_posture() robot.switch_to_navigation_mode() diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index 7408e54ed..94f04214c 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -184,18 +184,12 @@ def compute_coord(self, text, obs_id, threshold = 0.2): tl_x, tl_y, br_x, br_y = bbox w, h = depth.shape - # if w > h: - # tl_x, br_x = tl_x * w / h, br_x * w / h - # else: - # tl_y, br_y = tl_y * h / w, br_y * h / w tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) pose = self.voxel_map_wrapper.observations[obs_id - 1].camera_pose K = self.voxel_map_wrapper.observations[obs_id - 1].camera_K xyzs = get_xyz(depth, pose, K)[0] if torch.median(depth[tl_y: br_y, tl_x: br_x].reshape(-1)) < 3: return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values - # if depth[(tl_y + br_y) // 2, (tl_x + br_x) // 2] < 3.: - # return xyzs[(tl_y + br_y) // 2, (tl_x + br_x) // 2] return None def localize_A(self, A, debug = True, return_debug = False): diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index aad1196bd..417cfc890 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -288,6 +288,8 @@ def process_text(self, text, start_pose): if obs is not None and mode == 'navigation': rgb = self.voxel_map.observations[obs - 1].rgb if not self.rerun: + if isinstance(rgb, torch.Tensor): + rgb = np.array(rgb) cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) else: rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) @@ -302,9 +304,6 @@ def process_text(self, text, start_pose): waypoints = [pt.state for pt in res.trajectory] # If we are navigating to some object of interst, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation - print(waypoints) - # finished = (len(waypoints) <= 4 or torch.linalg.norm(torch.Tensor(point)[:2] - torch.Tensor(start_pose[:2])) > 0.8) and mode == 'navigation' - # finished = mode == 'navigation' finished = len(waypoints) <= 5 and mode == 'navigation' if not finished: waypoints = waypoints[:8] @@ -831,22 +830,19 @@ def write_to_pickle(self): pickle.dump(data, f) # @hydra.main(version_base="1.2", config_path=".", config_name="config.yaml") -def main(cfg): - torch.manual_seed(1) - imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) - # imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) - # if not cfg.pickle_file_name is None: - # imageProcessor.read_from_pickle(cfg.pickle_file_name) - # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) - # if cfg.open_communication: - # while True: - # imageProcessor.recv_text() - for i in range(8, 47): - imageProcessor.read_from_pickle('debug/debug_2024-09-05_18-13-12.pkl', i) - obs, exp = imageProcessor.voxel_map.get_2d_map() - plt.imshow(obs + exp) - # plt.imshow(exp) - imageProcessor.space.sample_exploration(xyt = [0, 0, 0], planner = imageProcessor.planner, text = None) - -if __name__ == "__main__": - main(None) \ No newline at end of file +# def main(cfg): +# torch.manual_seed(1) +# imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) +# # imageProcessor = 
ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) +# # if not cfg.pickle_file_name is None: +# # imageProcessor.read_from_pickle(cfg.pickle_file_name) +# # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) +# # if cfg.open_communication: +# # while True: +# # imageProcessor.recv_text() +# for i in range(10, 20): +# imageProcessor.read_from_pickle('debug/debug_2024-09-05_18-13-12.pkl', i) +# imageProcessor.space.sample_exploration(xyt = [0, 0, 0], planner = imageProcessor.planner, text = None) + +# if __name__ == "__main__": +# main(None) \ No newline at end of file From b70865e6f965c4ab846276aebc686fae57b88b74 Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Sat, 7 Sep 2024 07:27:45 -0400 Subject: [PATCH 044/410] attempt --- src/stretch/dynav/llm_localizer.py | 153 ++++++++++--- src/stretch/dynav/llm_server.py | 22 -- .../dynav/mapping_utils/voxelized_pcd.py | 30 ++- .../dynav/ok_robot_hw/utils/grasper_utils.py | 2 +- src/stretch/dynav/robot_agent_manip_mdp.py | 2 + src/stretch/dynav/voxel_map_localizer.py | 212 +++++++++++++++--- 6 files changed, 331 insertions(+), 90 deletions(-) diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py index bf48198f9..9eb796ea6 100644 --- a/src/stretch/dynav/llm_localizer.py +++ b/src/stretch/dynav/llm_localizer.py @@ -43,6 +43,62 @@ "threshold": "BLOCK_NONE", }, ] + +def get_inv_intrinsics(intrinsics): + # return intrinsics.double().inverse().to(intrinsics) + fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + inv_intrinsics = torch.zeros_like(intrinsics) + inv_intrinsics[..., 0, 0] = 1.0 / fx + inv_intrinsics[..., 1, 1] = 1.0 / fy + inv_intrinsics[..., 0, 2] = -ppx / fx + inv_intrinsics[..., 1, 2] = -ppy / fy + inv_intrinsics[..., 2, 2] = 1.0 + return inv_intrinsics + +def get_xyz(depth, pose, intrinsics): + """Returns the XYZ coordinates for a set of points. + + Args: + depth: The depth array, with shape (B, 1, H, W) + pose: The pose array, with shape (B, 4, 4) + intrinsics: The intrinsics array, with shape (B, 3, 3) + + Returns: + The XYZ coordinates of the projected points, with shape (B, H, W, 3) + """ + if not isinstance(depth, torch.Tensor): + depth = torch.from_numpy(depth) + if not isinstance(pose, torch.Tensor): + pose = torch.from_numpy(pose) + if not isinstance(intrinsics, torch.Tensor): + intrinsics = torch.from_numpy(intrinsics) + while depth.ndim < 4: + depth = depth.unsqueeze(0) + while pose.ndim < 3: + pose = pose.unsqueeze(0) + while intrinsics.ndim < 3: + intrinsics = intrinsics.unsqueeze(0) + (bsz, _, height, width), device, dtype = depth.shape, depth.device, intrinsics.dtype + + # Gets the pixel grid. + xs, ys = torch.meshgrid( + torch.arange(0, width, device=device, dtype=dtype), + torch.arange(0, height, device=device, dtype=dtype), + indexing="xy", + ) + xy = torch.stack([xs, ys], dim=-1).flatten(0, 1).unsqueeze(0).repeat_interleave(bsz, 0) + xyz = torch.cat((xy, torch.ones_like(xy[..., :1])), dim=-1) + + # Applies intrinsics and extrinsics. 
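# Worked one-pixel example (illustrative only) of the intrinsics/extrinsics step
# described above. With fx = fy = 600 and principal point (320, 240), pixel
# (u, v) = (400, 300) at depth 1.5 m back-projects to the camera-frame point below;
# a pose that translates the camera 1 m along +x shifts the world-frame result.
import torch

K = torch.tensor([[600.0, 0.0, 320.0], [0.0, 600.0, 240.0], [0.0, 0.0, 1.0]])
pose = torch.eye(4)
pose[0, 3] = 1.0

uv1 = torch.tensor([400.0, 300.0, 1.0])
xyz_cam = torch.linalg.inv(K) @ uv1 * 1.5       # tensor([0.2000, 0.1500, 1.5000])
xyz_world = pose[:3, :3] @ xyz_cam + pose[:3, 3]
print(xyz_world)                                # tensor([1.2000, 0.1500, 1.5000])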
+ # xyz = xyz @ intrinsics.inverse().transpose(-1, -2) + xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) + xyz = xyz * depth.flatten(1).unsqueeze(-1) + xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] + + xyz = xyz.unflatten(1, (height, width)) + + return xyz + class LLM_Localizer(): def __init__(self, voxel_map_wrapper = None, exist_model = 'gpt-4o', loc_model = 'owlv2', device = 'cuda'): self.voxel_map_wrapper = voxel_map_wrapper @@ -88,33 +144,62 @@ def add(self, weights = weights, obs_count = obs_count) + def compute_coord(self, text, image_info, threshold = 0.25): + rgb = image_info['image'] + inputs = self.exist_processor(text=[['a photo of a ' + text]], images=rgb, return_tensors="pt") + for input in inputs: + inputs[input] = inputs[input].to('cuda') + + with torch.no_grad(): + outputs = self.exist_model(**inputs) + + target_sizes = torch.Tensor([rgb.size[::-1]]).to(self.device) + results = self.exist_processor.image_processor.post_process_object_detection( + outputs, threshold=threshold, target_sizes=target_sizes + )[0] + depth = image_info['depth'] + xyzs = image_info['xyz'] + for idx, (score, bbox) in enumerate(sorted(zip(results['scores'], results['boxes']), key=lambda x: x[0], reverse=True)): + + tl_x, tl_y, br_x, br_y = bbox + w, h = depth.shape + tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) + if np.median(depth[tl_y: br_y, tl_x: br_x].reshape(-1)) < 3: + return torch.from_numpy(np.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), axis = 0)) + return None + def owl_locater(self, A, encoded_image, timestamps_lst): for i in timestamps_lst: image_info = encoded_image[i][-1] - image = image_info['image'] - box = None + res = self.compute_coord(A, image_info) + if res is not None: + debug_text = '#### - Obejct is detected in observations where instance ' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' + return res, debug_text, i, None + # image = image_info['image'] + + # box = None - inputs = self.exist_processor(text=A, images=image, return_tensors="pt").to(self.device) - - with torch.no_grad(): - outputs = self.exist_model(**inputs) - - target_sizes = torch.tensor([image.size[::-1]]) - results = self.exist_processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.2)[0] - - if len(results["scores"]) > 0: - cur_score = torch.max(results["scores"]).item() - max_score_index = torch.argmax(results["scores"]) - box = results["boxes"][max_score_index].tolist() - if box is not None: - xmin, ymin, xmax, ymax = map(int, box) - mask = np.zeros(image_info['depth'].shape, dtype=np.uint8) - mask[ymin:ymax, xmin:xmax] = 255 - xyz = image_info['xyz'] - masked_xyz = xyz[mask.flatten() > 0] - centroid = np.stack([torch.mean(masked_xyz[:, 0]), torch.mean(masked_xyz[:, 1]), torch.mean(masked_xyz[:, 2])]).T - debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. 
**😃** Directly navigate to it.\n' - return centroid, debug_text, i, masked_xyz + # inputs = self.exist_processor(text= 'a photo of ' + A, images=image, return_tensors="pt").to(self.device) + + # with torch.no_grad(): + # outputs = self.exist_model(**inputs) + + # target_sizes = torch.tensor([image.size[::-1]]) + # results = self.exist_processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.2)[0] + + # if len(results["scores"]) > 0: + # cur_score = torch.max(results["scores"]).item() + # max_score_index = torch.argmax(results["scores"]) + # box = results["boxes"][max_score_index].tolist() + # if box is not None: + # xmin, ymin, xmax, ymax = map(int, box) + # mask = np.zeros(image_info['depth'].shape, dtype=np.uint8) + # mask[ymin:ymax, xmin:xmax] = 255 + # xyz = image_info['xyz'] + # masked_xyz = xyz[mask.flatten() > 0] + # centroid = np.stack([torch.mean(masked_xyz[:, 0]), torch.mean(masked_xyz[:, 1]), torch.mean(masked_xyz[:, 2])]).T + # debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' + # return centroid, debug_text, i, masked_xyz debug_text = '#### - All instances are not the target! Maybe target object has not been observed yet. **😭**\n' return None, debug_text, None, None @@ -207,8 +292,8 @@ def llm_locator(self, A, encoded_image, process_chunk, context_length = 30): def localize_A(self, A, debug = True, return_debug = False, count_threshold = 5): encoded_image = {} - counts = torch.bincount(self.voxel_map_wrapper.voxel_pcd._obs_counts) - cur_obs = max(self.voxel_map_wrapper.voxel_pcd._obs_counts) + counts = torch.bincount(self.voxel_pcd._obs_counts) + cur_obs = max(self.voxel_pcd._obs_counts) filtered_obs = (counts > count_threshold).nonzero(as_tuple=True)[0].tolist() # filtered_obs = sorted(set(filtered_obs + [i for i in range(cur_obs-10, cur_obs+1)])) filtered_obs = sorted(filtered_obs) @@ -229,16 +314,18 @@ def localize_A(self, A, debug = True, return_debug = False, count_threshold = 5) camera_pose = self.voxel_map_wrapper.observations[obs_id - 1].camera_pose camera_K = self.voxel_map_wrapper.observations[obs_id - 1].camera_K - full_world_xyz = unproject_masked_depth_to_xyz_coordinates( # Batchable! - depth=depth.unsqueeze(0).unsqueeze(1), - pose=camera_pose.unsqueeze(0), - inv_intrinsics=torch.linalg.inv(camera_K[:3, :3]).unsqueeze(0), - ) + # full_world_xyz = unproject_masked_depth_to_xyz_coordinates( # Batchable! 
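# Minimal sketch of the detect-then-unproject step that compute_coord implements
# above: run open-vocabulary detection, then take the median depth / median world
# coordinate inside the highest-scoring box. The OWLv2 checkpoint name is an
# assumption for illustration; the patch wires up self.exist_processor and
# self.exist_model elsewhere, and the depth / xyz arrays here are dummy values.
import numpy as np
import torch
from PIL import Image
from transformers import Owlv2ForObjectDetection, Owlv2Processor

processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8), mode="RGB")
depth = np.ones((480, 640), dtype=np.float32)            # metres
xyz = np.zeros((480, 640, 3), dtype=np.float32)          # per-pixel world coordinates

inputs = processor(text=[["a photo of a cup"]], images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

target_sizes = torch.tensor([image.size[::-1]])
results = processor.image_processor.post_process_object_detection(
    outputs, threshold=0.2, target_sizes=target_sizes
)[0]

for score, box in sorted(zip(results["scores"], results["boxes"]), key=lambda x: x[0], reverse=True):
    tl_x, tl_y, br_x, br_y = [int(max(0.0, v)) for v in box.tolist()]
    patch_depth = depth[tl_y:br_y, tl_x:br_x]
    if patch_depth.size and np.median(patch_depth) < 3:  # ignore far-away detections
        print("candidate 3D point:", np.median(xyz[tl_y:br_y, tl_x:br_x].reshape(-1, 3), axis=0))
        break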
+ # depth=depth.unsqueeze(0).unsqueeze(1), + # pose=camera_pose.unsqueeze(0), + # inv_intrinsics=torch.linalg.inv(camera_K[:3, :3]).unsqueeze(0), + # ) + xyz = get_xyz(depth, camera_pose, camera_K)[0] + # print(full_world_xyz.shape) depth = depth.numpy() - rgb[depth > 2.5] = [0, 0, 0] + # rgb[depth > 2.5] = [0, 0, 0] image = Image.fromarray(rgb.astype(np.uint8), mode='RGB') if 'gemini' in self.existence_checking_model: - encoded_image[obs_id] = [[f"Following is the image took on timestep {obs_id}: ", image], {'image':image, 'xyz':full_world_xyz, 'depth':depth}] + encoded_image[obs_id] = [[f"Following is the image took on timestep {obs_id}: ", image], {'image':image, 'xyz': xyz, 'depth':depth}] elif 'gpt' in self.existence_checking_model: buffered = BytesIO() image.save(buffered, format="PNG") @@ -247,7 +334,7 @@ def localize_A(self, A, debug = True, return_debug = False, count_threshold = 5) encoded_image[obs_id] = [{"type": "text", "text": f"Following is the image took on timestep {obs_id}"}, {"type": "image_url", "image_url": { "url": f"data:image/png;base64,{base64_encoded}"} - }, {'image':image, 'xyz':full_world_xyz, 'depth':depth}] + }, {'image':image, 'xyz':xyz, 'depth':depth}] target_point, debug_text, obs, point = self.llm_locator(A, encoded_image, process_chunk, context_length) if not debug: return target_point diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py index 31bced34d..ee4989922 100644 --- a/src/stretch/dynav/llm_server.py +++ b/src/stretch/dynav/llm_server.py @@ -211,9 +211,6 @@ def process_text(self, text, start_pose): rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) if not self.static: rr.connect('100.108.67.79:9876') - - start_time = time.time() - debug_text = '' mode = 'navigation' obs = None @@ -234,17 +231,11 @@ def process_text(self, text, start_pose): if localized_point is None: return [] - breakpoint1 = time.time() - print('Determining localized point', breakpoint1 - start_time) - if len(localized_point) == 2: localized_point = np.array([localized_point[0], localized_point[1], 0]) point = self.sample_navigation(start_pose, localized_point) - breakpoint2 = time.time() - print('Determining target point', breakpoint2 - start_time) - if self.rerun: buf = BytesIO() plt.savefig(buf, format='png') @@ -279,9 +270,6 @@ def process_text(self, text, start_pose): traj = [] waypoints = None - breakpoint3 = time.time() - print('Start planning', breakpoint3 - start_time) - if point is None: print('Unable to find any target point, some exception might happen') else: @@ -304,9 +292,6 @@ def process_text(self, text, start_pose): print('Planned trajectory:', traj) else: print('[FAILURE]', res.reason) - - breakpoint4 = time.time() - print('Planning stops', breakpoint4 - start_time) if traj is not None: origins = [] @@ -317,15 +302,8 @@ def process_text(self, text, start_pose): vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) - - breakpoint5 = time.time() - print('Logging', breakpoint5 - start_time) # self.write_to_pickle() - - # breakpoint6 = time.time() - # print('Writing to pickle', breakpoint6 - start_time) - return traj def sample_navigation(self, start, point): diff --git 
a/src/stretch/dynav/mapping_utils/voxelized_pcd.py b/src/stretch/dynav/mapping_utils/voxelized_pcd.py index 09d3a4196..6f4486a8e 100644 --- a/src/stretch/dynav/mapping_utils/voxelized_pcd.py +++ b/src/stretch/dynav/mapping_utils/voxelized_pcd.py @@ -44,6 +44,8 @@ from stretch.utils.torch_geometric import consecutive_cluster, voxel_grid from stretch.utils.torch_scatter import scatter +from sklearn.cluster import DBSCAN + def project_points(points_3d, K, pose): if not isinstance(K, torch.Tensor): @@ -169,11 +171,11 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None): ), dim = 0) - if self._entity_ids is not None: - removed_entities, removed_counts = torch.unique(self._entity_ids.detach().cpu()[~indices], return_counts = True) - total_counts = torch.bincount(self._entity_ids.detach().cpu()) - entities_to_be_removed = removed_entities[(removed_counts > total_counts[removed_entities] * 0.75) | (total_counts[removed_entities] - removed_counts < 5)] - indices = indices & ~torch.isin(self._entity_ids.detach().cpu(), entities_to_be_removed) + # if self._entity_ids is not None: + # removed_entities, removed_counts = torch.unique(self._entity_ids.detach().cpu()[~indices], return_counts = True) + # total_counts = torch.bincount(self._entity_ids.detach().cpu()) + # entities_to_be_removed = removed_entities[(removed_counts > total_counts[removed_entities] * 0.6) | (total_counts[removed_entities] - removed_counts < 5)] + # indices = indices & ~torch.isin(self._entity_ids.detach().cpu(), entities_to_be_removed) # print('Clearing non valid points...') # print('Removing ' + str((~indices).sum().item()) + ' points.') @@ -190,6 +192,24 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None): if self._entity_ids is not None: self._entity_ids = self._entity_ids[indices] + if self._entity_ids is not None: + dbscan = DBSCAN(eps=self.voxel_size * 4, min_samples=10) + cluster_vertices = torch.cat((self._points.detach().cpu(), self._entity_ids.detach().cpu().reshape(-1,1) * 1000), -1).numpy() + clusters = dbscan.fit(cluster_vertices) + labels = clusters.labels_ + indices = labels != -1 + self._points = self._points[indices] + if self._features is not None: + self._features = self._features[indices] + if self._weights is not None: + self._weights= self._weights[indices] + if self._rgb is not None: + self._rgb = self._rgb[indices] + if self._obs_counts is not None: + self._obs_counts = self._obs_counts[indices] + if self._entity_ids is not None: + self._entity_ids = self._entity_ids[indices] + def add( self, points: Tensor, diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index aa5c06b63..ff5c7872a 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -31,7 +31,7 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): hello_robot.move_to_position(base_trans=base_trans, head_pan=head_pan, head_tilt=head_tilt) - # time.sleep(4) + time.sleep(2) elif (retry_flag !=0 and side_retries == 3): print("Tried in all angles but couldn't succed") diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index a9fb8e413..11556b0e5 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -133,6 +133,8 @@ def execute_action( # print(" Start:", start) # res = self.image_sender.query_text(text, start) res = 
self.image_processor.process_text(text, start) + if len(res) == 0 and text != '' and text is not None: + res = self.image_processor.process_text('', start) look_around_finish = time.time() look_around_take = look_around_finish - start_time diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index 94f04214c..3f5cc76ed 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -79,6 +79,54 @@ def get_xyz(depth, pose, intrinsics): return xyz +def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs = None): + # Calculate the number of top values directly + top_positions = vertices + # top_values = probability_over_all_points[top_indices].flatten() + + # Apply DBSCAN clustering + dbscan = DBSCAN(eps=0.25, min_samples=3) + clusters = dbscan.fit(top_positions) + labels = clusters.labels_ + + # Initialize empty lists to store centroids and extends of each cluster + centroids = [] + extends = [] + similarity_max_list = [] + points = [] + obs_max_list = [] + + for cluster_id in set(labels): + if cluster_id == -1: # Ignore noise + continue + + members = top_positions[labels == cluster_id] + centroid = np.mean(members, axis=0) + + similarity_values = similarity[labels == cluster_id] + simiarity_max = np.max(similarity_values) + + if obs is not None: + obs_values = obs[labels == cluster_id] + obs_max = np.max(obs_values) + + sx = np.max(members[:, 0]) - np.min(members[:, 0]) + sy = np.max(members[:, 1]) - np.min(members[:, 1]) + sz = np.max(members[:, 2]) - np.min(members[:, 2]) + + # Append centroid and extends to the lists + centroids.append(centroid) + extends.append((sx, sy, sz)) + similarity_max_list.append(simiarity_max) + points.append(members) + if obs is not None: + obs_max_list.append(obs_max) + + if obs is not None: + return centroids, extends, similarity_max_list, points, obs_max_list + else: + return centroids, extends, similarity_max_list, points + class VoxelMapLocalizer(): def __init__(self, voxel_map_wrapper = None, exist_model = True, clip_model = None, processor = None, device = 'cuda', siglip = True): print('Localizer V3') @@ -163,10 +211,35 @@ def find_obs_id_for_A(self, A): alignments = self.find_alignment_over_model(A).cpu() return obs_counts[alignments.argmax(dim = -1)].detach().cpu() - def compute_coord(self, text, obs_id, threshold = 0.2): + def compute_coord(self, text, obs_id, threshold = 0.25, point = None): if obs_id <= 0: return None rgb = self.voxel_map_wrapper.observations[obs_id - 1].rgb + pose = self.voxel_map_wrapper.observations[obs_id - 1].camera_pose + depth = self.voxel_map_wrapper.observations[obs_id - 1].depth + K = self.voxel_map_wrapper.observations[obs_id - 1].camera_K + xyzs = get_xyz(depth, pose, K)[0] + + if point is not None: + distances = torch.linalg.norm(xyzs - point, dim=2) + cx, cy = torch.unravel_index(torch.argmin(distances), distances.shape) + + height, width = rgb.shape[0], rgb.shape[1] + crop_size = (int(0.4 * height), int(0.4 * width)) + + half_width = crop_size[0] // 2 + half_height = crop_size[1] // 2 + + x_min = max(cx - half_width, 0) + y_min = max(cy - half_height, 0) + x_max = min(cx + half_width, height) + y_max = min(cy + half_height, width) + + rgb = rgb[x_min:x_max, y_min:y_max] + depth = depth[x_min:x_max, y_min:y_max] + xyzs = xyzs[x_min:x_max, y_min:y_max] + # cv2.imwrite(text + '.jpg', rgb.numpy()[:, :, [2, 1, 0]]) + rgb = rgb.permute(2, 0, 1).to(torch.uint8) inputs = self.exist_processor(text=[['a photo of a ' + text]], 
images=rgb, return_tensors="pt") for input in inputs: @@ -179,49 +252,130 @@ def compute_coord(self, text, obs_id, threshold = 0.2): results = self.exist_processor.image_processor.post_process_object_detection( outputs, threshold=threshold, target_sizes=target_sizes )[0] - depth = self.voxel_map_wrapper.observations[obs_id - 1].depth for idx, (score, bbox) in enumerate(sorted(zip(results['scores'], results['boxes']), key=lambda x: x[0], reverse=True)): tl_x, tl_y, br_x, br_y = bbox w, h = depth.shape tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) - pose = self.voxel_map_wrapper.observations[obs_id - 1].camera_pose - K = self.voxel_map_wrapper.observations[obs_id - 1].camera_K - xyzs = get_xyz(depth, pose, K)[0] + if torch.median(depth[tl_y: br_y, tl_x: br_x].reshape(-1)) < 3: - return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values + return xyzs[(tl_y + br_y) // 2, (tl_x + br_x) // 2] + # return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values return None def localize_A(self, A, debug = True, return_debug = False): - points, _, _, _ = self.voxel_pcd.get_pointcloud() - alignments = self.find_alignment_over_model(A).cpu() - point = points[alignments.argmax(dim = -1)].detach().cpu().squeeze() - obs_counts = self.voxel_pcd._obs_counts - image_id = obs_counts[alignments.argmax(dim = -1)].detach().cpu() - debug_text = '' + centroids, extends, similarity_max_list, points, obs_max_list, debug_text = self.find_clusters_for_A(A, return_obs_counts = True, debug = debug) + if len(centroids) == 0: + if not debug: + return None + else: + return None, debug_text target_point = None + obs = None + similarity = None + point = None + for idx, (centroid, obs, similarity, point) in enumerate(sorted(zip(centroids, obs_max_list, similarity_max_list, points), key=lambda x: x[2], reverse=True)): + + + res = self.compute_coord(A, obs, point = centroid) + if res is not None: + target_point = res - res = self.compute_coord(A, image_id) - # res = None - if res is not None: - target_point = res - debug_text += '#### - Obejct is detected in observations . **😃** Directly navigate to it.\n' - else: - # debug_text += '#### - Directly ignore this instance is the target. **😞** \n' - if self.siglip: - cosine_similarity_check = alignments.max().item() > 0.14 - else: - cosine_similarity_check = alignments.max().item() > 0.3 - if cosine_similarity_check: - target_point = point + debug_text += '#### - Obejct is detected in observations where instance' + str(idx + 1) + ' comes from. **😃** Directly navigate to it.\n' - debug_text += '#### - The point has high cosine similarity. **😃** Directly navigate to it.\n' + break else: - debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' - # print('target_point', target_point) + if self.siglip: + cosine_similarity_check = similarity > 0.14 + else: + cosine_similarity_check = similarity > 0.3 + + if cosine_similarity_check: + + debug_text += '#### - Instance ' + str(idx + 1) + ' has high cosine similarity (' + str(round(similarity, 3)) + '). **😃** Directly navigate to it.\n' + + target_point = centroid + break + + if target_point is None: + debug_text += '#### - All instances are not the target! Maybe target object has not been observed yet. 
**😭**\n' if not debug: return target_point elif not return_debug: return target_point, debug_text else: - return target_point, debug_text, image_id, point \ No newline at end of file + return target_point, debug_text, obs, point + + # def localize_A(self, A, debug = True, return_debug = False): + # points, _, _, _ = self.voxel_pcd.get_pointcloud() + # alignments = self.find_alignment_over_model(A).cpu() + # point = points[alignments.argmax(dim = -1)].detach().cpu().squeeze() + # obs_counts = self.voxel_pcd._obs_counts + # image_id = obs_counts[alignments.argmax(dim = -1)].detach().cpu() + # debug_text = '' + # target_point = None + + # res = self.compute_coord(A, image_id, point = point) + # # res = None + # if res is not None: + # target_point = res + # debug_text += '#### - Obejct is detected in observations . **😃** Directly navigate to it.\n' + # else: + # # debug_text += '#### - Directly ignore this instance is the target. **😞** \n' + # if self.siglip: + # cosine_similarity_check = alignments.max().item() > 0.14 + # else: + # cosine_similarity_check = alignments.max().item() > 0.3 + # if cosine_similarity_check: + # target_point = point + + # debug_text += '#### - The point has high cosine similarity. **😃** Directly navigate to it.\n' + # else: + # debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' + # # print('target_point', target_point) + # if not debug: + # return target_point + # elif not return_debug: + # return target_point, debug_text + # else: + # return target_point, debug_text, image_id, point + + def find_clusters_for_A(self, A, return_obs_counts = False, debug = False): + + debug_text = '' + + points, features, _, _ = self.voxel_pcd.get_pointcloud() + alignments = self.find_alignment_over_model(A).cpu().reshape(-1).detach().numpy() + # turning_point = max(np.percentile(alignments, 99), 0.08) + if self.siglip: + turning_point = min(0.14, alignments[np.argsort(alignments)[-20]]) + else: + turning_point = min(0.3, alignments[np.argsort(alignments)[-20]]) + mask = alignments >= turning_point + alignments = alignments[mask] + points = points[mask] + if len(points) == 0: + + debug_text += '### - No instance found! Maybe target object has not been observed yet. 
**😭**\n' + + output = [[], [], [], []] + if return_obs_counts: + output.append([]) + if debug: + output.append(debug_text) + + return output + else: + if return_obs_counts: + obs_ids = self.voxel_pcd._obs_counts.detach().cpu().numpy()[mask] + centroids, extends, similarity_max_list, points, obs_max_list = find_clusters(points.detach().cpu().numpy(), alignments, obs = obs_ids) + output = [centroids, extends, similarity_max_list, points, obs_max_list] + else: + centroids, extends, similarity_max_list, points = find_clusters(points.detach().cpu().numpy(), alignments, obs = None) + output = [centroids, extends, similarity_max_list, points] + + debug_text += '### - Found ' + str(len(centroids)) + ' instances that might be target object.\n' + if debug: + output.append(debug_text) + + return output \ No newline at end of file From b07556a5425975d076c44666981b879a99a2eef5 Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Sat, 7 Sep 2024 17:00:38 -0400 Subject: [PATCH 045/410] restore setting --- src/stretch/dynav/voxel_map_localizer.py | 205 +++-------------------- 1 file changed, 25 insertions(+), 180 deletions(-) diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index 3f5cc76ed..481f50f5b 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -16,8 +16,6 @@ from transformers import AutoProcessor, AutoModel -from sklearn.cluster import DBSCAN - # from ultralytics import YOLOWorld from transformers import Owlv2Processor, Owlv2ForObjectDetection @@ -79,54 +77,6 @@ def get_xyz(depth, pose, intrinsics): return xyz -def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs = None): - # Calculate the number of top values directly - top_positions = vertices - # top_values = probability_over_all_points[top_indices].flatten() - - # Apply DBSCAN clustering - dbscan = DBSCAN(eps=0.25, min_samples=3) - clusters = dbscan.fit(top_positions) - labels = clusters.labels_ - - # Initialize empty lists to store centroids and extends of each cluster - centroids = [] - extends = [] - similarity_max_list = [] - points = [] - obs_max_list = [] - - for cluster_id in set(labels): - if cluster_id == -1: # Ignore noise - continue - - members = top_positions[labels == cluster_id] - centroid = np.mean(members, axis=0) - - similarity_values = similarity[labels == cluster_id] - simiarity_max = np.max(similarity_values) - - if obs is not None: - obs_values = obs[labels == cluster_id] - obs_max = np.max(obs_values) - - sx = np.max(members[:, 0]) - np.min(members[:, 0]) - sy = np.max(members[:, 1]) - np.min(members[:, 1]) - sz = np.max(members[:, 2]) - np.min(members[:, 2]) - - # Append centroid and extends to the lists - centroids.append(centroid) - extends.append((sx, sy, sz)) - similarity_max_list.append(simiarity_max) - points.append(members) - if obs is not None: - obs_max_list.append(obs_max) - - if obs is not None: - return centroids, extends, similarity_max_list, points, obs_max_list - else: - return centroids, extends, similarity_max_list, points - class VoxelMapLocalizer(): def __init__(self, voxel_map_wrapper = None, exist_model = True, clip_model = None, processor = None, device = 'cuda', siglip = True): print('Localizer V3') @@ -211,7 +161,7 @@ def find_obs_id_for_A(self, A): alignments = self.find_alignment_over_model(A).cpu() return obs_counts[alignments.argmax(dim = -1)].detach().cpu() - def compute_coord(self, text, obs_id, threshold = 0.25, point = None): + def compute_coord(self, text, obs_id, threshold = 
0.25): if obs_id <= 0: return None rgb = self.voxel_map_wrapper.observations[obs_id - 1].rgb @@ -219,27 +169,6 @@ def compute_coord(self, text, obs_id, threshold = 0.25, point = None): depth = self.voxel_map_wrapper.observations[obs_id - 1].depth K = self.voxel_map_wrapper.observations[obs_id - 1].camera_K xyzs = get_xyz(depth, pose, K)[0] - - if point is not None: - distances = torch.linalg.norm(xyzs - point, dim=2) - cx, cy = torch.unravel_index(torch.argmin(distances), distances.shape) - - height, width = rgb.shape[0], rgb.shape[1] - crop_size = (int(0.4 * height), int(0.4 * width)) - - half_width = crop_size[0] // 2 - half_height = crop_size[1] // 2 - - x_min = max(cx - half_width, 0) - y_min = max(cy - half_height, 0) - x_max = min(cx + half_width, height) - y_max = min(cy + half_height, width) - - rgb = rgb[x_min:x_max, y_min:y_max] - depth = depth[x_min:x_max, y_min:y_max] - xyzs = xyzs[x_min:x_max, y_min:y_max] - # cv2.imwrite(text + '.jpg', rgb.numpy()[:, :, [2, 1, 0]]) - rgb = rgb.permute(2, 0, 1).to(torch.uint8) inputs = self.exist_processor(text=[['a photo of a ' + text]], images=rgb, return_tensors="pt") for input in inputs: @@ -259,123 +188,39 @@ def compute_coord(self, text, obs_id, threshold = 0.25, point = None): tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) if torch.median(depth[tl_y: br_y, tl_x: br_x].reshape(-1)) < 3: - return xyzs[(tl_y + br_y) // 2, (tl_x + br_x) // 2] - # return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values + return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values return None def localize_A(self, A, debug = True, return_debug = False): - centroids, extends, similarity_max_list, points, obs_max_list, debug_text = self.find_clusters_for_A(A, return_obs_counts = True, debug = debug) - if len(centroids) == 0: - if not debug: - return None - else: - return None, debug_text + points, _, _, _ = self.voxel_pcd.get_pointcloud() + alignments = self.find_alignment_over_model(A).cpu() + point = points[alignments.argmax(dim = -1)].detach().cpu().squeeze() + obs_counts = self.voxel_pcd._obs_counts + image_id = obs_counts[alignments.argmax(dim = -1)].detach().cpu() + debug_text = '' target_point = None - obs = None - similarity = None - point = None - for idx, (centroid, obs, similarity, point) in enumerate(sorted(zip(centroids, obs_max_list, similarity_max_list, points), key=lambda x: x[2], reverse=True)): - - - res = self.compute_coord(A, obs, point = centroid) - if res is not None: - target_point = res - debug_text += '#### - Obejct is detected in observations where instance' + str(idx + 1) + ' comes from. **😃** Directly navigate to it.\n' - - break + res = self.compute_coord(A, image_id) + # res = None + if res is not None: + target_point = res + debug_text += '#### - Obejct is detected in observations . **😃** Directly navigate to it.\n' + else: + # debug_text += '#### - Directly ignore this instance is the target. **😞** \n' + if self.siglip: + cosine_similarity_check = alignments.max().item() > 0.14 else: - if self.siglip: - cosine_similarity_check = similarity > 0.14 - else: - cosine_similarity_check = similarity > 0.3 - - if cosine_similarity_check: + cosine_similarity_check = alignments.max().item() > 0.3 + if cosine_similarity_check: + target_point = point - debug_text += '#### - Instance ' + str(idx + 1) + ' has high cosine similarity (' + str(round(similarity, 3)) + '). 
**😃** Directly navigate to it.\n' - - target_point = centroid - break - - if target_point is None: - debug_text += '#### - All instances are not the target! Maybe target object has not been observed yet. **😭**\n' + debug_text += '#### - The point has high cosine similarity. **😃** Directly navigate to it.\n' + else: + debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' + # print('target_point', target_point) if not debug: return target_point elif not return_debug: return target_point, debug_text else: - return target_point, debug_text, obs, point - - # def localize_A(self, A, debug = True, return_debug = False): - # points, _, _, _ = self.voxel_pcd.get_pointcloud() - # alignments = self.find_alignment_over_model(A).cpu() - # point = points[alignments.argmax(dim = -1)].detach().cpu().squeeze() - # obs_counts = self.voxel_pcd._obs_counts - # image_id = obs_counts[alignments.argmax(dim = -1)].detach().cpu() - # debug_text = '' - # target_point = None - - # res = self.compute_coord(A, image_id, point = point) - # # res = None - # if res is not None: - # target_point = res - # debug_text += '#### - Obejct is detected in observations . **😃** Directly navigate to it.\n' - # else: - # # debug_text += '#### - Directly ignore this instance is the target. **😞** \n' - # if self.siglip: - # cosine_similarity_check = alignments.max().item() > 0.14 - # else: - # cosine_similarity_check = alignments.max().item() > 0.3 - # if cosine_similarity_check: - # target_point = point - - # debug_text += '#### - The point has high cosine similarity. **😃** Directly navigate to it.\n' - # else: - # debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' - # # print('target_point', target_point) - # if not debug: - # return target_point - # elif not return_debug: - # return target_point, debug_text - # else: - # return target_point, debug_text, image_id, point - - def find_clusters_for_A(self, A, return_obs_counts = False, debug = False): - - debug_text = '' - - points, features, _, _ = self.voxel_pcd.get_pointcloud() - alignments = self.find_alignment_over_model(A).cpu().reshape(-1).detach().numpy() - # turning_point = max(np.percentile(alignments, 99), 0.08) - if self.siglip: - turning_point = min(0.14, alignments[np.argsort(alignments)[-20]]) - else: - turning_point = min(0.3, alignments[np.argsort(alignments)[-20]]) - mask = alignments >= turning_point - alignments = alignments[mask] - points = points[mask] - if len(points) == 0: - - debug_text += '### - No instance found! Maybe target object has not been observed yet. 
**😭**\n' - - output = [[], [], [], []] - if return_obs_counts: - output.append([]) - if debug: - output.append(debug_text) - - return output - else: - if return_obs_counts: - obs_ids = self.voxel_pcd._obs_counts.detach().cpu().numpy()[mask] - centroids, extends, similarity_max_list, points, obs_max_list = find_clusters(points.detach().cpu().numpy(), alignments, obs = obs_ids) - output = [centroids, extends, similarity_max_list, points, obs_max_list] - else: - centroids, extends, similarity_max_list, points = find_clusters(points.detach().cpu().numpy(), alignments, obs = None) - output = [centroids, extends, similarity_max_list, points] - - debug_text += '### - Found ' + str(len(centroids)) + ' instances that might be target object.\n' - if debug: - output.append(debug_text) - - return output \ No newline at end of file + return target_point, debug_text, image_id, point From f39adda417c1c5bf0a813cf8bd83ed0e1bd0aff4 Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Sat, 7 Sep 2024 18:24:56 -0400 Subject: [PATCH 046/410] fix real world bugs --- src/stretch/dynav/llm_localizer.py | 56 +++-------- src/stretch/dynav/llm_server.py | 98 +++++++++++++++++-- .../dynav/mapping_utils/voxelized_pcd.py | 6 +- src/stretch/dynav/robot_agent_manip_mdp.py | 19 ++-- src/stretch/dynav/run_ok_nav.py | 11 ++- src/stretch/dynav/voxel_map_server.py | 78 ++++++++++----- 6 files changed, 179 insertions(+), 89 deletions(-) diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py index 9eb796ea6..589b3b858 100644 --- a/src/stretch/dynav/llm_localizer.py +++ b/src/stretch/dynav/llm_localizer.py @@ -171,35 +171,11 @@ def compute_coord(self, text, image_info, threshold = 0.25): def owl_locater(self, A, encoded_image, timestamps_lst): for i in timestamps_lst: image_info = encoded_image[i][-1] - res = self.compute_coord(A, image_info) + res = self.compute_coord(A, image_info, threshold=0.2) if res is not None: - debug_text = '#### - Obejct is detected in observations where instance ' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' + debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' return res, debug_text, i, None - # image = image_info['image'] - - # box = None - # inputs = self.exist_processor(text= 'a photo of ' + A, images=image, return_tensors="pt").to(self.device) - - # with torch.no_grad(): - # outputs = self.exist_model(**inputs) - - # target_sizes = torch.tensor([image.size[::-1]]) - # results = self.exist_processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.2)[0] - - # if len(results["scores"]) > 0: - # cur_score = torch.max(results["scores"]).item() - # max_score_index = torch.argmax(results["scores"]) - # box = results["boxes"][max_score_index].tolist() - # if box is not None: - # xmin, ymin, xmax, ymax = map(int, box) - # mask = np.zeros(image_info['depth'].shape, dtype=np.uint8) - # mask[ymin:ymax, xmin:xmax] = 255 - # xyz = image_info['xyz'] - # masked_xyz = xyz[mask.flatten() > 0] - # centroid = np.stack([torch.mean(masked_xyz[:, 0]), torch.mean(masked_xyz[:, 1]), torch.mean(masked_xyz[:, 2])]).T - # debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' - # return centroid, debug_text, i, masked_xyz debug_text = '#### - All instances are not the target! Maybe target object has not been observed yet. 
**😭**\n' return None, debug_text, None, None @@ -222,7 +198,7 @@ def gpt_chunk(self, chunk, sys_prompt, user_prompt): return list(map(int, timestamps.replace(' ', '').split(':')[1].split(','))) except Exception as e: print(f"Error: {e}") - time.sleep(15) + time.sleep(10) return "Execution Failed" def gemini_chunk(self, chunk, sys_prompt, user_prompt): @@ -255,23 +231,18 @@ def llm_locator(self, A, encoded_image, process_chunk, context_length = 30): Example: Input: - cat + The object you need to find are cat, car Output: cat: 1,4,6,9 - - Input: - car - - Output: car: None """ - user_prompt = f"""The object you need to find is {A}""" + user_prompt = f"""The object you need to find are {A}, car, bottle, shoes, watch, clothes, desk, chair, shoes, cup""" if 'gpt' in self.existence_checking_model: - content = [item for sublist in list(encoded_image.values()) for item in sublist[:2]][-100:] # adjust to [-60:] for taking only the last 30 and have faster speed + content = [item for sublist in list(encoded_image.values()) for item in sublist[:2]][-120:] # adjust to [-60:] for taking only the last 30 and have faster speed elif 'gemini' in self.existence_checking_model: - content = [item for sublist in list(encoded_image.values()) for item in sublist[0]][-100:] + content = [item for sublist in list(encoded_image.values()) for item in sublist[0]][-120:] content_chunks = [content[i:i + 2 * context_length] for i in range(0, len(content), 2 * context_length)] with ThreadPoolExecutor(max_workers=2) as executor: @@ -289,8 +260,8 @@ def llm_locator(self, A, encoded_image, process_chunk, context_length = 30): # print(A, timestamps_lst) return self.owl_locater(A, encoded_image, timestamps_lst) - def localize_A(self, A, debug = True, return_debug = False, count_threshold = 5): - encoded_image = {} + def localize_A(self, A, debug = True, return_debug = False, count_threshold = 3): + encoded_image = OrderedDict() counts = torch.bincount(self.voxel_pcd._obs_counts) cur_obs = max(self.voxel_pcd._obs_counts) @@ -309,10 +280,11 @@ def localize_A(self, A, debug = True, return_debug = False, count_threshold = 5) context_length = 30 for obs_id in filtered_obs: - rgb = self.voxel_map_wrapper.observations[obs_id - 1].rgb.numpy() - depth = self.voxel_map_wrapper.observations[obs_id - 1].depth - camera_pose = self.voxel_map_wrapper.observations[obs_id - 1].camera_pose - camera_K = self.voxel_map_wrapper.observations[obs_id - 1].camera_K + obs_id -= 1 + rgb = self.voxel_map_wrapper.observations[obs_id].rgb.numpy() + depth = self.voxel_map_wrapper.observations[obs_id].depth + camera_pose = self.voxel_map_wrapper.observations[obs_id].camera_pose + camera_K = self.voxel_map_wrapper.observations[obs_id].camera_K # full_world_xyz = unproject_masked_depth_to_xyz_coordinates( # Batchable! 
# depth=depth.unsqueeze(0).unsqueeze(1), diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py index ee4989922..704ef3f17 100644 --- a/src/stretch/dynav/llm_server.py +++ b/src/stretch/dynav/llm_server.py @@ -233,6 +233,9 @@ def process_text(self, text, start_pose): if len(localized_point) == 2: localized_point = np.array([localized_point[0], localized_point[1], 0]) + + if torch.linalg.norm((torch.tensor(localized_point) - torch.tensor(start_pose))[:2]) < 0.7: + return [[np.nan, np.nan, np.nan], localized_point] point = self.sample_navigation(start_pose, localized_point) @@ -391,7 +394,7 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): ) with self.voxel_map_lock: - self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) + self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose), min_samples_clear = 20) self.voxel_map.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) if '_owl' in self.vision_method: @@ -420,6 +423,66 @@ def run_llm_owl(self, rgb, mask, world_xyz): if len(valid_xyz) != 0: self.add_to_voxel_pcd(valid_xyz, None, valid_rgb) + def read_from_pickle(self, pickle_file_name, num_frames: int = -1): + if isinstance(pickle_file_name, str): + pickle_file_name = Path(pickle_file_name) + assert pickle_file_name.exists(), f"No file found at {pickle_file_name}" + with pickle_file_name.open("rb") as f: + data = pickle.load(f) + for i, ( + camera_pose, + xyz, + rgb, + feats, + depth, + base_pose, + K, + world_xyz, + ) in enumerate( + zip( + data["camera_poses"], + data["xyz"], + data["rgb"], + data["feats"], + data["depth"], + data["base_poses"], + data["camera_K"], + data["world_xyz"], + ) + ): + # Handle the case where we dont actually want to load everything + if num_frames > 0 and i >= num_frames: + break + + if rgb is None: + continue + + camera_pose = self.voxel_map.fix_data_type(camera_pose) + xyz = self.voxel_map.fix_data_type(xyz) + rgb = self.voxel_map.fix_data_type(rgb) + depth = self.voxel_map.fix_data_type(depth) + if feats is not None: + feats = self.voxel_map.fix_data_type(feats) + base_pose = self.voxel_map.fix_data_type(base_pose) + self.voxel_map.add( + camera_pose=camera_pose, + xyz=xyz, + rgb=rgb, + feats=feats, + depth=depth, + base_pose=base_pose, + camera_K=K, + ) + self.obs_count += 1 + self.voxel_map_localizer.voxel_pcd._points = data["combined_xyz"] + self.voxel_map_localizer.voxel_pcd._features = data["combined_feats"] + self.voxel_map_localizer.voxel_pcd._weights = data["combined_weights"] + self.voxel_map_localizer.voxel_pcd._rgb = data["combined_rgb"] + self.voxel_map_localizer.voxel_pcd._obs_counts = data["obs_id"] + self.voxel_map_localizer.voxel_pcd._entity_ids = data["entity_id"] + self.voxel_map_localizer.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() + self.voxel_map.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() + def write_to_pickle(self): """Write out to a pickle file. 
This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" if not os.path.exists('debug'): @@ -434,17 +497,31 @@ def write_to_pickle(self): data["rgb"] = [] data["depth"] = [] data["feats"] = [] - for frame in self.voxel_map.observations: + + image_ids = torch.unique(self.voxel_map.voxel_pcd._obs_counts).tolist() + + for idx, frame in enumerate(self.voxel_map.observations): # add it to pickle # TODO: switch to using just Obs struct? - data["camera_poses"].append(frame.camera_pose) - data["base_poses"].append(frame.base_pose) - data["camera_K"].append(frame.camera_K) - data["xyz"].append(frame.xyz) - data["world_xyz"].append(frame.full_world_xyz) - data["rgb"].append(frame.rgb) - data["depth"].append(frame.depth) - data["feats"].append(frame.feats) + if idx in image_ids: + data["camera_poses"].append(frame.camera_pose) + data["camera_K"].append(frame.camera_K) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + else: + data["camera_poses"].append(None) + data["camera_K"].append(None) + data["rgb"].append(None) + data["depth"].append(None) + # We might not need this + # data["xyz"].append(frame.xyz) + # data["world_xyz"].append(frame.full_world_xyz) + # data["feats"].append(frame.feats) + # data["base_poses"].append(frame.base_pose) + data["xyz"].append(None) + data["world_xyz"].append(None) + data["feats"].append(None) + data["base_poses"].append(None) for k, v in frame.info.items(): if k not in data: data[k] = [] @@ -459,3 +536,4 @@ def write_to_pickle(self): data["entity_id"] = self.voxel_map_localizer.voxel_pcd._entity_ids with open(filename, "wb") as f: pickle.dump(data, f) + print('write all data to', filename) diff --git a/src/stretch/dynav/mapping_utils/voxelized_pcd.py b/src/stretch/dynav/mapping_utils/voxelized_pcd.py index 6f4486a8e..56ca55e4b 100644 --- a/src/stretch/dynav/mapping_utils/voxelized_pcd.py +++ b/src/stretch/dynav/mapping_utils/voxelized_pcd.py @@ -136,7 +136,7 @@ def reset(self): self._maxs = self.dim_maxs self.obs_count = 1 - def clear_points(self, depth, intrinsics, pose, depth_is_valid = None): + def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_samples_clear = None): if self._points is not None: xys = project_points(self._points.detach().cpu(), intrinsics, pose).int() xys = xys[:, [1, 0]] @@ -192,8 +192,8 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None): if self._entity_ids is not None: self._entity_ids = self._entity_ids[indices] - if self._entity_ids is not None: - dbscan = DBSCAN(eps=self.voxel_size * 4, min_samples=10) + if self._entity_ids is not None and min_samples_clear is not None: + dbscan = DBSCAN(eps=self.voxel_size * 4, min_samples=min_samples_clear) cluster_vertices = torch.cat((self._points.detach().cpu(), self._entity_ids.detach().cpu().reshape(-1,1) * 1000), -1).numpy() clusters = dbscan.fit(cluster_vertices) labels = clusters.labels_ diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 11556b0e5..803fa14dd 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -145,14 +145,15 @@ def execute_action( if len(res) > 0: print("Plan successful!") - if len(res) > 2 and np.isnan(res[-2]).all(): + if len(res) >= 2 and np.isnan(res[-2]).all(): # blocking = text != '' - self.robot.execute_trajectory( - res[:-2], - pos_err_threshold=self.pos_err_threshold, - rot_err_threshold=self.rot_err_threshold, - 
blocking = True - ) + if len(res) > 2: + self.robot.execute_trajectory( + res[:-2], + pos_err_threshold=self.pos_err_threshold, + rot_err_threshold=self.rot_err_threshold, + blocking = True + ) execution_finish = time.time() execution_take = execution_finish - look_around_finish @@ -313,6 +314,10 @@ def manipulate(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NO return True + def save(self): + with self.image_sender.voxel_map_lock: + self.image_sender.write_to_pickle() + def send_array(socket, A, flags=0, copy=True, track=False): """send a numpy array with metadata""" A = np.array(A) diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 41f870e30..f2d82c606 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -79,7 +79,8 @@ def main( def keep_looking_around(): while True: - time.sleep(0.5) + # We don't want too many images in our memory + time.sleep(0.8) if robot.get_six_joints()[2] > 0.7 or not robot.in_navigation_mode(): continue demo.update() @@ -119,6 +120,9 @@ def keep_looking_around(): camera_xyz = robot.get_head_pose()[:3, 3] theta = compute_tilt(camera_xyz, point) demo.manipulate(text, theta) + xyt = robot.get_base_pose() + xyt[2] = xyt[2] - np.pi / 2 + robot.navigate_to(xyt, blocking = True) robot.switch_to_navigation_mode() if input('You want to run placing: y/n') == 'n': @@ -139,6 +143,11 @@ def keep_looking_around(): camera_xyz = robot.get_head_pose()[:3, 3] theta = compute_tilt(camera_xyz, point) demo.place(text, theta) + xyt = robot.get_base_pose() + xyt[2] = xyt[2] - np.pi / 2 + robot.navigate_to(xyt, blocking = True) + + demo.save() if __name__ == "__main__": diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index 417cfc890..a668161cc 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -254,6 +254,9 @@ def process_text(self, text, start_pose): if len(localized_point) == 2: localized_point = np.array([localized_point[0], localized_point[1], 0]) + + if torch.linalg.norm((torch.tensor(localized_point) - torch.tensor(start_pose))[:2]) < 0.7: + return [[np.nan, np.nan, np.nan], localized_point] point = self.sample_navigation(start_pose, localized_point) # if mode == 'navigation' and torch.linalg.norm(torch.Tensor(point)[:2] - torch.linalg.norm(localized_point[:2])) > 2.0: @@ -703,7 +706,11 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): ) with self.voxel_map_lock: - self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) + if self.vision_method == 'mask&*lip': + min_samples_clear = 5 + else: + min_samples_clear = 10 + self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose), min_samples_clear = min_samples_clear) self.voxel_map.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) if self.vision_method == 'mask&*lip': @@ -765,6 +772,9 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): if num_frames > 0 and i >= num_frames: break + if rgb is None: + continue + camera_pose = self.voxel_map.fix_data_type(camera_pose) xyz = self.voxel_map.fix_data_type(xyz) rgb = self.voxel_map.fix_data_type(rgb) @@ -788,6 +798,8 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): self.voxel_map_localizer.voxel_pcd._rgb = data["combined_rgb"] self.voxel_map_localizer.voxel_pcd._obs_counts = data["obs_id"] 
self.voxel_map_localizer.voxel_pcd._entity_ids = data["entity_id"] + self.voxel_map_localizer.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() + self.voxel_map.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() def write_to_pickle(self): """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" @@ -803,17 +815,31 @@ def write_to_pickle(self): data["rgb"] = [] data["depth"] = [] data["feats"] = [] - for frame in self.voxel_map.observations: + + image_ids = torch.unique(self.voxel_map.voxel_pcd._obs_counts).tolist() + + for idx, frame in enumerate(self.voxel_map.observations): # add it to pickle # TODO: switch to using just Obs struct? - data["camera_poses"].append(frame.camera_pose) - data["base_poses"].append(frame.base_pose) - data["camera_K"].append(frame.camera_K) - data["xyz"].append(frame.xyz) - data["world_xyz"].append(frame.full_world_xyz) - data["rgb"].append(frame.rgb) - data["depth"].append(frame.depth) - data["feats"].append(frame.feats) + if idx in image_ids: + data["camera_poses"].append(frame.camera_pose) + data["camera_K"].append(frame.camera_K) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + else: + data["camera_poses"].append(None) + data["camera_K"].append(None) + data["rgb"].append(None) + data["depth"].append(None) + # We might not need this + # data["xyz"].append(frame.xyz) + # data["world_xyz"].append(frame.full_world_xyz) + # data["feats"].append(frame.feats) + # data["base_poses"].append(frame.base_pose) + data["xyz"].append(None) + data["world_xyz"].append(None) + data["feats"].append(None) + data["base_poses"].append(None) for k, v in frame.info.items(): if k not in data: data[k] = [] @@ -828,21 +854,21 @@ def write_to_pickle(self): data["entity_id"] = self.voxel_map_localizer.voxel_pcd._entity_ids with open(filename, "wb") as f: pickle.dump(data, f) + print('write all data to', filename) # @hydra.main(version_base="1.2", config_path=".", config_name="config.yaml") -# def main(cfg): -# torch.manual_seed(1) -# imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) -# # imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) -# # if not cfg.pickle_file_name is None: -# # imageProcessor.read_from_pickle(cfg.pickle_file_name) -# # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) -# # if cfg.open_communication: -# # while True: -# # imageProcessor.recv_text() -# for i in range(10, 20): -# imageProcessor.read_from_pickle('debug/debug_2024-09-05_18-13-12.pkl', i) -# imageProcessor.space.sample_exploration(xyt = [0, 0, 0], planner = imageProcessor.planner, text = None) - -# if __name__ == "__main__": -# main(None) \ No newline at end of file +def main(cfg): + torch.manual_seed(1) + imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) + # imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) + # if not cfg.pickle_file_name is None: + # imageProcessor.read_from_pickle(cfg.pickle_file_name) + # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) + # if cfg.open_communication: + # while True: + # imageProcessor.recv_text() + imageProcessor.read_from_pickle('debug/debug_2024-09-07_18-15-08.pkl', -1) + imageProcessor.write_to_pickle() + +if 
__name__ == "__main__": + main(None) \ No newline at end of file From eecf6d984bc485a4f5dad41512d876bb17b20603 Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Sun, 8 Sep 2024 06:21:23 -0400 Subject: [PATCH 047/410] update --- src/stretch/agent/zmq_client.py | 2 + src/stretch/config/dynav_config.yaml | 4 +- src/stretch/dynav/llm_localizer.py | 6 +- src/stretch/dynav/llm_server.py | 3 +- src/stretch/dynav/mapping_utils/voxel.py | 6 +- src/stretch/dynav/mapping_utils/voxel_map.py | 71 +++++----------- .../dynav/ok_robot_hw/utils/grasper_utils.py | 10 +-- src/stretch/dynav/robot_agent_manip_mdp.py | 14 ++-- src/stretch/dynav/run_ok_nav.py | 42 ++++++---- src/stretch/dynav/voxel_map_localizer.py | 2 +- src/stretch/dynav/voxel_map_server.py | 84 ++++++------------- src/stretch/visualization/rerun.py | 31 ++++--- 12 files changed, 119 insertions(+), 156 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 66c6c0f61..cac8be384 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -391,6 +391,8 @@ def head_to( whole_body_q[HelloStretchIdx.HEAD_TILT] = float(head_tilt) self._wait_for_head(whole_body_q) + time.sleep(0.3) + def look_front(self, blocking: bool = True, timeout: float = 10.0): """Let robot look to its front.""" self.head_to(constants.look_front[0], constants.look_front[1], blocking = blocking, timeout = timeout) diff --git a/src/stretch/config/dynav_config.yaml b/src/stretch/config/dynav_config.yaml index 6a474c65f..5c6914fba 100644 --- a/src/stretch/config/dynav_config.yaml +++ b/src/stretch/config/dynav_config.yaml @@ -9,8 +9,8 @@ voxel_size: 0.1 # 0.05 seems too low obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.5 # Ignore things over this height (eg ceilings) -obs_min_density: 1 # This many points makes it an obstacle -exp_min_density: 0 +obs_min_density: 5 # This many points makes it an obstacle +exp_min_density: 1 # Padding pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py index 589b3b858..6ba142412 100644 --- a/src/stretch/dynav/llm_localizer.py +++ b/src/stretch/dynav/llm_localizer.py @@ -19,6 +19,8 @@ import google.generativeai as genai from openai import OpenAI import base64 +from collections import OrderedDict + genai.configure(api_key=os.getenv('GOOGLE_API_KEY')) generation_config = genai.GenerationConfig(temperature=0) safety_settings = [ @@ -171,11 +173,11 @@ def compute_coord(self, text, image_info, threshold = 0.25): def owl_locater(self, A, encoded_image, timestamps_lst): for i in timestamps_lst: image_info = encoded_image[i][-1] - res = self.compute_coord(A, image_info, threshold=0.2) + res = self.compute_coord(A, image_info, threshold=0.15) if res is not None: debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' return res, debug_text, i, None - + debug_text = '#### - All instances are not the target! Maybe target object has not been observed yet. 
**😭**\n' return None, debug_text, None, None diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py index 704ef3f17..8d0ab64bd 100644 --- a/src/stretch/dynav/llm_server.py +++ b/src/stretch/dynav/llm_server.py @@ -42,6 +42,7 @@ from PIL import Image from stretch.dynav.communication_util import load_socket, send_array, recv_array, send_rgb_img, recv_rgb_img, send_depth_img, recv_depth_img, send_everything, recv_everything +from collections import OrderedDict def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) @@ -282,7 +283,7 @@ def process_text(self, text, start_pose): waypoints = [pt.state for pt in res.trajectory] # If we are navigating to some object of interst, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation - finished = len(waypoints) <= 5 and mode == 'navigation' + finished = len(waypoints) <= 6 and mode == 'navigation' if not finished: waypoints = waypoints[:8] traj = self.planner.clean_path_for_xy(waypoints) diff --git a/src/stretch/dynav/mapping_utils/voxel.py b/src/stretch/dynav/mapping_utils/voxel.py index 91771d5f8..b9098f43e 100644 --- a/src/stretch/dynav/mapping_utils/voxel.py +++ b/src/stretch/dynav/mapping_utils/voxel.py @@ -54,7 +54,7 @@ VALID_FRAMES = ["camera", "world"] -DEFAULT_GRID_SIZE = [200, 200] +DEFAULT_GRID_SIZE = [145, 145] logger = logging.getLogger(__name__) @@ -632,8 +632,8 @@ def get_2d_map( # Gets the xyz correctly - for now everything is assumed to be within the correct distance of origin xyz, _, counts, _ = self.voxel_pcd.get_pointcloud() # print(counts) - if xyz is not None: - counts = torch.ones(xyz.shape[0]) + # if xyz is not None: + # counts = torch.ones(xyz.shape[0]) obs_ids = self.voxel_pcd._obs_counts if xyz is None: xyz = torch.zeros((0, 3)) diff --git a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py index 95ba395ca..1ffe278b2 100644 --- a/src/stretch/dynav/mapping_utils/voxel_map.py +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -369,14 +369,21 @@ def sample_target_point( # TODO: was this: # expanded_mask = expanded_mask & less_explored & ~obstacles - scores = [] - points = [] - for selected_target in selected_targets: selected_x, selected_y = planner.to_xy([selected_target[0], selected_target[1]]) theta = self.compute_theta(selected_x, selected_y, point[0], point[1]) + # if debug and self.is_valid([selected_x, selected_y, theta]): + # import matplotlib.pyplot as plt + + # obstacles, explored = self.voxel_map.get_2d_map() + # plt.scatter(ys, xs, s = 1) + # plt.scatter(selected_target[1], selected_target[0], s = 10) + # plt.scatter(target_y, target_x, s = 10) + # plt.imshow(obstacles) target_is_valid = self.is_valid([selected_x, selected_y, theta]) + # print('Target:', [selected_x, selected_y, theta]) + # print('Target is valid:', target_is_valid) if not target_is_valid: continue if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) < 0.4: @@ -389,53 +396,17 @@ def sample_target_point( index_i = int(selected_target[0].int() + i) index_j = int(selected_target[1].int() + j) if obstacles[index_i][index_j]: - score += 0.5 - - scores.append(score) - - if len(points) == 0: - return None - else: - return points[np.argmax(scores)] - - # for selected_target in selected_targets: - # selected_x, selected_y = planner.to_xy([selected_target[0], selected_target[1]]) - # theta = self.compute_theta(selected_x, selected_y, point[0], point[1]) - - # # if debug and 
self.is_valid([selected_x, selected_y, theta]): - # # import matplotlib.pyplot as plt - - # # obstacles, explored = self.voxel_map.get_2d_map() - # # plt.scatter(ys, xs, s = 1) - # # plt.scatter(selected_target[1], selected_target[0], s = 10) - # # plt.scatter(target_y, target_x, s = 10) - # # plt.imshow(obstacles) - # target_is_valid = self.is_valid([selected_x, selected_y, theta]) - # # print('Target:', [selected_x, selected_y, theta]) - # # print('Target is valid:', target_is_valid) - # if not target_is_valid: - # continue - # if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) < 0.35: - # continue - # elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: - # print('OBSTACLE AVOIDANCE') - # print(selected_target[0].int(), selected_target[1].int()) - # i = (point[0] - selected_target[0]) // abs(point[0] - selected_target[0]) - # j = (point[1] - selected_target[1]) // abs(point[1] - selected_target[1]) - # index_i = int(selected_target[0].int() + i) - # index_j = int(selected_target[1].int() + j) - # if obstacles[index_i][index_j]: - # target_is_valid = False - # # elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: - # # for i in [-1, 0, 1]: - # # for j in [-1, 0, 1]: - # # if obstacles[selected_target[0] + i][selected_target[1] + j]: - # # target_is_valid = False - # if not target_is_valid: - # continue + target_is_valid = False + # elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: + # for i in [-1, 0, 1]: + # for j in [-1, 0, 1]: + # if obstacles[selected_target[0] + i][selected_target[1] + j]: + # target_is_valid = False + if not target_is_valid: + continue - # return np.array([selected_x, selected_y, theta]) - # return None + return np.array([selected_x, selected_y, theta]) + return None def sample_near_mask( self, @@ -689,7 +660,7 @@ def sample_exploration( plt.show() return index, time_heuristics, alignments_heuristics, total_heuristics - def _alignment_heuristic(self, alignments, outside_frontier, alignment_smooth = 100, alignment_threshold = 0.13, debug = False): + def _alignment_heuristic(self, alignments, outside_frontier, alignment_smooth = 20, alignment_threshold = 0.13, debug = False): alignments = np.ma.masked_array(alignments, ~outside_frontier) alignment_heuristics = 1 / (1 + np.exp(-alignment_smooth * (alignments - alignment_threshold))) index = np.unravel_index(np.argmax(alignment_heuristics), alignments.shape) diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index ff5c7872a..968cc9a72 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -14,7 +14,7 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): image_publisher = ImagePublisher(camera, socket) # Centering the object - head_tilt_angles = [0, -0.1] + head_tilt_angles = [0.1, 0, -0.1] tilt_retries, side_retries = 1, 0 retry_flag = True head_tilt = INIT_HEAD_TILT @@ -31,14 +31,14 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): hello_robot.move_to_position(base_trans=base_trans, head_pan=head_pan, head_tilt=head_tilt) - time.sleep(2) + time.sleep(1) - elif (retry_flag !=0 and side_retries == 3): + elif (retry_flag !=0 and side_retries == 2): print("Tried in all angles but couldn't succed") - time.sleep(2) + time.sleep(1) return None, None, None, None - elif (side_retries == 2 and tilt_retries == 3): + elif (side_retries == 2 and tilt_retries == 
2): hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) side_retries = 3 diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 6871ca501..6e6ad3d7b 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -25,8 +25,8 @@ from stretch.dynav.ok_robot_hw.camera import RealSenseCamera from stretch.dynav.ok_robot_hw.utils.grasper_utils import pickup, move_to_point, capture_and_process_image from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array -# from stretch.dynav.voxel_map_server import ImageProcessor -from stretch.dynav.llm_server import ImageProcessor +from stretch.dynav.voxel_map_server import ImageProcessor +# from stretch.dynav.llm_server import ImageProcessor import cv2 @@ -121,11 +121,11 @@ def execute_action( self, text: str, ): - # start_time = time.time() + start_time = time.time() self.robot.look_front() - # self.look_around() - # self.robot.look_front() + self.look_around() + self.robot.look_front() self.robot.switch_to_navigation_mode() start = self.robot.get_base_pose() @@ -314,8 +314,8 @@ def manipulate(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NO return True def save(self): - with self.image_sender.voxel_map_lock: - self.image_sender.write_to_pickle() + with self.image_processor.voxel_map_lock: + self.image_processor.write_to_pickle() def send_array(socket, A, flags=0, copy=True, track=False): """send a numpy array with metadata""" diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index f2d82c606..38d19e9e3 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -25,6 +25,10 @@ def compute_tilt(camera_xyz, target_xyz): - camera_xyz: estimated (x, y, z) coordinates of camera - target_xyz: estimated (x, y, z) coordinates of the target object ''' + if not isinstance(camera_xyz, np.ndarray): + camera_xyz = np.array(camera_xyz) + if not isinstance(target_xyz, np.ndarray): + target_xyz = np.array(target_xyz) vector = camera_xyz - target_xyz return -np.arctan2(vector[2], np.linalg.norm(vector[:2])) @@ -77,17 +81,19 @@ def main( else: demo.image_processor.read_from_pickle(input_path) - def keep_looking_around(): - while True: - # We don't want too many images in our memory - time.sleep(0.8) - if robot.get_six_joints()[2] > 0.7 or not robot.in_navigation_mode(): - continue - demo.update() + demo.save() + + # def keep_looking_around(): + # while True: + # # We don't want too many images in our memory + # time.sleep(0.8) + # if robot.get_six_joints()[2] > 0.7 or not robot.in_navigation_mode(): + # continue + # demo.update() - img_thread = threading.Thread(target=keep_looking_around) - img_thread.daemon = True - img_thread.start() + # img_thread = threading.Thread(target=keep_looking_around) + # img_thread.daemon = True + # img_thread.start() while True: mode = input('select mode? 
E/N/S') @@ -120,9 +126,11 @@ def keep_looking_around(): camera_xyz = robot.get_head_pose()[:3, 3] theta = compute_tilt(camera_xyz, point) demo.manipulate(text, theta) - xyt = robot.get_base_pose() - xyt[2] = xyt[2] - np.pi / 2 - robot.navigate_to(xyt, blocking = True) + # robot.switch_to_navigation_mode() + # xyt = robot.get_base_pose() + # xyt[2] = xyt[2] - np.pi / 2 + robot.look_front() + # robot.navigate_to(xyt, blocking = True) robot.switch_to_navigation_mode() if input('You want to run placing: y/n') == 'n': @@ -143,9 +151,11 @@ def keep_looking_around(): camera_xyz = robot.get_head_pose()[:3, 3] theta = compute_tilt(camera_xyz, point) demo.place(text, theta) - xyt = robot.get_base_pose() - xyt[2] = xyt[2] - np.pi / 2 - robot.navigate_to(xyt, blocking = True) + # robot.switch_to_navigation_mode() + # xyt = robot.get_base_pose() + # xyt[2] = xyt[2] - np.pi / 2 + robot.move_to_nav_posture() + # robot.navigate_to(xyt, blocking = True) demo.save() diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index 481f50f5b..373aab0c8 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -161,7 +161,7 @@ def find_obs_id_for_A(self, A): alignments = self.find_alignment_over_model(A).cpu() return obs_counts[alignments.argmax(dim = -1)].detach().cpu() - def compute_coord(self, text, obs_id, threshold = 0.25): + def compute_coord(self, text, obs_id, threshold = 0.2): if obs_id <= 0: return None rgb = self.voxel_map_wrapper.observations[obs_id - 1].rgb diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index 0076cf7d6..49adcdcc6 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -152,9 +152,6 @@ def __init__(self, self.img_thread = threading.Thread(target=self._recv_image) self.img_thread.daemon = True self.img_thread.start() - - self.path = None - self.text = None def create_obstacle_map(self): print("- Load parameters") @@ -223,25 +220,25 @@ def create_vision_model(self): self.yolo_model.set_classes(self.texts) self.voxel_map_localizer = VoxelMapLocalizer(self.voxel_map, clip_model = self.clip_model, processor = self.clip_preprocess, device = self.device, siglip = self.siglip) - def compute_path(self, start_pose): + def process_text(self, text, start_pose): if self.rerun: rr.log('/object', rr.Clear(recursive = True), static = self.static) rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) rr.log('/direction', rr.Clear(recursive = True), static = self.static) rr.log('robot_monologue', rr.Clear(recursive = True), static = self.static) rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) - + if not self.static: + rr.connect('100.108.67.79:9876') debug_text = '' mode = 'navigation' obs = None - with self.voxel_map_lock: - text = self.text # Do visual grounding if text is not None and text != '': - localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) + with self.voxel_map_lock: + localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) if localized_point is not None: rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) - # Do exploration + # Do Frontier based exploration if text is None or text == '' or localized_point is None: 
debug_text += '## Navigation fails, so robot starts exploring environments.\n' localized_point = self.sample_frontier(start_pose, text) @@ -250,22 +247,15 @@ def compute_path(self, start_pose): print('\n', localized_point, '\n') if localized_point is None: - with self.voxel_map_lock: - self.traj = (text, None, start_pose) - return + return [] if len(localized_point) == 2: localized_point = np.array([localized_point[0], localized_point[1], 0]) - if torch.linalg.norm((torch.tensor(localized_point) - torch.tensor(start_pose))[:2]) < 0.7: - return [[np.nan, np.nan, np.nan], localized_point] + # if torch.linalg.norm((torch.tensor(localized_point) - torch.tensor(start_pose))[:2]) < 0.7: + # return [[np.nan, np.nan, np.nan], localized_point] point = self.sample_navigation(start_pose, localized_point) - # if mode == 'navigation' and torch.linalg.norm(torch.Tensor(point)[:2] - torch.linalg.norm(localized_point[:2])) > 2.0: - # localized_point = self.sample_frontier(start_pose, None) - # mode = 'exploration' - # point = self.sample_navigation(start_pose, localized_point) - # debug_text += '## All reachable points of robot are too far from the target object, explore to find new paths. \n' if self.rerun: buf = BytesIO() @@ -300,6 +290,7 @@ def compute_path(self, start_pose): rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) traj = [] waypoints = None + if point is None: print('Unable to find any target point, some exception might happen') else: @@ -309,9 +300,9 @@ def compute_path(self, start_pose): waypoints = [pt.state for pt in res.trajectory] # If we are navigating to some object of interst, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation - finished = len(waypoints) <= 5 and mode == 'navigation' + finished = len(waypoints) <= 10 and mode == 'navigation' if not finished: - waypoints = waypoints[:4] + waypoints = waypoints[:8] traj = self.planner.clean_path_for_xy(waypoints) # traj = traj[1:] if finished: @@ -319,6 +310,9 @@ def compute_path(self, start_pose): if isinstance(localized_point, torch.Tensor): localized_point = localized_point.tolist() traj.append(localized_point) + print('Planned trajectory:', traj) + else: + print('[FAILURE]', res.reason) if traj is not None: origins = [] @@ -329,20 +323,10 @@ def compute_path(self, start_pose): vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) - - self.write_to_pickle() - with self.voxel_map_lock: - self.traj = (text, traj, start_pose) - def process_text(self, text, start_pose): - if self.traj is not None: - saved_text, traj, saved_start_pose = self.traj - if saved_text == text and np.linalg.norm(saved_start_pose - start_pose) <= 0.25: - return traj - with self.voxel_map_lock: - self.text = text - self.compute_path(start_pose) - return self.traj[1] + # self.write_to_pickle() + return traj + # def process_text(self, text, start_pose): @@ -460,7 +444,7 @@ def process_text(self, text, start_pose): def sample_navigation(self, start, point, mode = 'navigation'): # plt.clf() obstacles, _ = self.voxel_map.get_2d_map() - # plt.imshow(obstacles) + plt.imshow(obstacles) if point is None: start_pt = self.planner.to_pt(start) # plt.scatter(start_pt[1], 
start_pt[0], s = 10) @@ -469,12 +453,12 @@ def sample_navigation(self, start, point, mode = 'navigation'): print("point:", point, "goal:", goal) obstacles, explored = self.voxel_map.get_2d_map() start_pt = self.planner.to_pt(start) - # plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') + plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') point_pt = self.planner.to_pt(point) - # plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'g') + plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'g') if goal is not None: goal_pt = self.planner.to_pt(goal) - # plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'r') + plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'r') return goal def sample_frontier(self, start_pose = [0, 0, 0], text = None): @@ -775,9 +759,6 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): if num_frames > 0 and i >= num_frames: break - if rgb is None: - continue - camera_pose = self.voxel_map.fix_data_type(camera_pose) xyz = self.voxel_map.fix_data_type(xyz) rgb = self.voxel_map.fix_data_type(rgb) @@ -819,26 +800,13 @@ def write_to_pickle(self): data["depth"] = [] data["feats"] = [] - image_ids = torch.unique(self.voxel_map.voxel_pcd._obs_counts).tolist() - for idx, frame in enumerate(self.voxel_map.observations): # add it to pickle # TODO: switch to using just Obs struct? - if idx in image_ids: - data["camera_poses"].append(frame.camera_pose) - data["camera_K"].append(frame.camera_K) - data["rgb"].append(frame.rgb) - data["depth"].append(frame.depth) - else: - data["camera_poses"].append(None) - data["camera_K"].append(None) - data["rgb"].append(None) - data["depth"].append(None) - # We might not need this - # data["xyz"].append(frame.xyz) - # data["world_xyz"].append(frame.full_world_xyz) - # data["feats"].append(frame.feats) - # data["base_poses"].append(frame.base_pose) + data["xyz"].append(frame.xyz) + data["world_xyz"].append(frame.full_world_xyz) + data["feats"].append(frame.feats) + data["base_poses"].append(frame.base_pose) data["xyz"].append(None) data["world_xyz"].append(None) data["feats"].append(None) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 89a146089..1f6e5bff3 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -94,6 +94,7 @@ def __init__(self): vectors=[[1, 0, 0], [0, 1, 0], [0, 0, 1]], colors=[[255, 0, 0], [0, 255, 0], [0, 0, 255]], ), + static = True ) self.bbox_colors_memory = {} @@ -113,10 +114,10 @@ def setup_blueprint(self): def log_head_camera(self, obs): """Log head camera pose and images""" rr.set_time_seconds("realtime", time.time()) - rr.log("world/head_camera/rgb", rr.Image(obs["rgb"])) - rr.log("world/head_camera/depth", rr.DepthImage(obs["depth"])) + rr.log("world/head_camera/rgb", rr.Image(obs["rgb"]), static = True) + rr.log("world/head_camera/depth", rr.DepthImage(obs["depth"]), static = True) rot, trans = decompose_homogeneous_matrix(obs["camera_pose"]) - rr.log("world/head_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3)) + rr.log("world/head_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static = True) rr.log( "world/head_camera", rr.Pinhole( @@ -124,6 +125,7 @@ def log_head_camera(self, obs): image_from_camera=obs["camera_K"], image_plane_distance=0.35, ), + static = True ) def log_robot_xyt(self, obs): @@ -138,8 +140,8 @@ def log_robot_xyt(self, obs): labels="robot", colors=[255, 0, 0, 255], ) - rr.log("world/robot/arrow", rb_arrow) - rr.log("world/robot/blob", rr.Points3D([0, 0, 0], 
colors=[255, 0, 0, 255], radii=0.13)) + rr.log("world/robot/arrow", rb_arrow, static = True) + rr.log("world/robot/blob", rr.Points3D([0, 0, 0], colors=[255, 0, 0, 255], radii=0.13), static = True) rr.log( "world/robot", rr.Transform3D( @@ -147,6 +149,7 @@ def log_robot_xyt(self, obs): rotation=rr.RotationAxisAngle(axis=[0, 0, 1], radians=theta), axis_length=0.7, ), + static = True ) def log_ee_frame(self, obs): @@ -160,8 +163,8 @@ def log_ee_frame(self, obs): ee_arrow = rr.Arrows3D( origins=[0, 0, 0], vectors=[0.2, 0, 0], radii=0.02, labels="ee", colors=[0, 255, 0, 255] ) - rr.log("world/ee/arrow", ee_arrow) - rr.log("world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3)) + rr.log("world/ee/arrow", ee_arrow, static = True) + rr.log("world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static = True) def log_ee_camera(self, servo): """Log end effector camera pose and images @@ -170,10 +173,10 @@ def log_ee_camera(self, servo): """ rr.set_time_seconds("realtime", time.time()) # EE Camera - rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb)) - rr.log("world/ee_camera/depth", rr.DepthImage(servo.ee_depth)) + rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb), static = True) + rr.log("world/ee_camera/depth", rr.DepthImage(servo.ee_depth), static = True) rot, trans = decompose_homogeneous_matrix(servo.ee_camera_pose) - rr.log("world/ee_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3)) + rr.log("world/ee_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static = True) rr.log( "world/ee_camera", rr.Pinhole( @@ -181,6 +184,7 @@ def log_ee_camera(self, servo): image_from_camera=servo.ee_camera_K, image_plane_distance=0.35, ), + static = True ) def log_robot_state(self, obs): @@ -188,7 +192,7 @@ def log_robot_state(self, obs): rr.set_time_seconds("realtime", time.time()) state = obs["joint"] for k in HelloStretchIdx.name_to_idx: - rr.log(f"robot_state/joint_pose/{k}", rr.Scalar(state[HelloStretchIdx.name_to_idx[k]])) + rr.log(f"robot_state/joint_pose/{k}", rr.Scalar(state[HelloStretchIdx.name_to_idx[k]]), static = True) def update_voxel_map(self, space: SparseVoxelMapNavigationSpace): """Log voxel map @@ -203,6 +207,7 @@ def update_voxel_map(self, space: SparseVoxelMapNavigationSpace): rr.log( "world/point_cloud", rr.Points3D(positions=points, radii=np.ones(rgb.shape[0]) * 0.01, colors=np.int64(rgb)), + static = True ) grid_origin = space.voxel_map.grid_origin obstacles, explored = space.voxel_map.get_2d_map() @@ -216,6 +221,7 @@ def update_voxel_map(self, space: SparseVoxelMapNavigationSpace): rr.Points3D( positions=obs_points, radii=np.ones(points.shape[0]) * 0.025, colors=[255, 0, 0] ), + static = True ) rr.log( "world/explored", @@ -224,6 +230,7 @@ def update_voxel_map(self, space: SparseVoxelMapNavigationSpace): radii=np.ones(points.shape[0]) * 0.01, colors=[255, 255, 255], ), + static = True ) def update_scene_graph( @@ -252,6 +259,7 @@ def update_scene_graph( rr.log( f"world/{instance.id}_{name}", rr.Points3D(positions=point_cloud_rgb, colors=np.int64(pcd_rgb)), + static = True ) half_sizes = [(b[0] - b[1]) / 2 for b in bbox_bounds] bounds.append(half_sizes) @@ -269,6 +277,7 @@ def update_scene_graph( radii=0.01, colors=colors, ), + static = True ) def step(self, obs, servo): From 87a184d33d9fc6ea0dff0ba9a22da4867ac4edda Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 10:50:37 -0400 Subject: [PATCH 048/410] Update --- src/stretch/version.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/stretch/version.py b/src/stretch/version.py index 8126e484a..ae3ecf3ff 100644 --- a/src/stretch/version.py +++ b/src/stretch/version.py @@ -12,7 +12,7 @@ # 2) we can import it in setup.py for the same reason # 3) we can import it into your module -__version__ = "0.0.10" +__version__ = "0.0.11" __stretchpy_protocol__ = "spp0" if __name__ == "__main__": From de45ef2c65f27ed9ef912472e0951445281dd43e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 10:52:08 -0400 Subject: [PATCH 049/410] run a new mapping test; remove some printouts --- src/stretch/agent/zmq_client.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 7557efaae..021ab74ad 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -451,7 +451,9 @@ def reset(self): self._finish = False self._last_step = -1 - def open_gripper(self, blocking: bool = True, timeout: float = 10.0) -> bool: + def open_gripper( + self, blocking: bool = True, timeout: float = 10.0, verbose: bool = False + ) -> bool: """Open the gripper based on hard-coded presets.""" gripper_target = self._robot_model.GRIPPER_OPEN print("[ZMQ CLIENT] Opening gripper to", gripper_target) @@ -463,7 +465,8 @@ def open_gripper(self, blocking: bool = True, timeout: float = 10.0) -> bool: joint_state = self.get_joint_positions() if joint_state is None: continue - print("Opening gripper:", joint_state[HelloStretchIdx.GRIPPER]) + if verbose: + print("Opening gripper:", joint_state[HelloStretchIdx.GRIPPER]) gripper_err = np.abs(joint_state[HelloStretchIdx.GRIPPER] - gripper_target) if gripper_err < 0.1: return True @@ -477,7 +480,11 @@ def open_gripper(self, blocking: bool = True, timeout: float = 10.0) -> bool: return True def close_gripper( - self, loose: bool = False, blocking: bool = True, timeout: float = 10.0 + self, + loose: bool = False, + blocking: bool = True, + timeout: float = 10.0, + verbose: bool = False, ) -> bool: """Close the gripper based on hard-coded presets.""" gripper_target = ( @@ -492,7 +499,8 @@ def close_gripper( if joint_state is None: continue gripper_err = np.abs(joint_state[HelloStretchIdx.GRIPPER] - gripper_target) - print("Closing gripper:", gripper_err, gripper_target) + if verbose: + print("Closing gripper:", gripper_err, gripper_target) if gripper_err < 0.1: return True t1 = timeit.default_timer() From 100daab0c8d5ce147ab5fd74dbe844fcb165e9de Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 11:12:16 -0400 Subject: [PATCH 050/410] update motion planning params --- src/stretch/agent/robot_agent.py | 9 +++++---- src/stretch/config/default_planner.yaml | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index f2c903e31..9d1865f7d 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -120,10 +120,10 @@ def __init__( self.space = SparseVoxelMapNavigationSpace( self.voxel_map, self.robot.get_robot_model(), - step_size=parameters["step_size"], - rotation_step_size=parameters["rotation_step_size"], - dilate_frontier_size=parameters["dilate_frontier_size"], - dilate_obstacle_size=parameters["dilate_obstacle_size"], + step_size=parameters["motion_planner"]["step_size"], + rotation_step_size=parameters["motion_planner"]["rotation_step_size"], + dilate_frontier_size=parameters["motion_planner"]["frontier"]["dilate_frontier_size"], + 
dilate_obstacle_size=parameters["motion_planner"]["frontier"]["dilate_obstacle_size"], grid=self.voxel_map.grid, ) @@ -1035,6 +1035,7 @@ def plan_to_frontier( sampler = self.space.sample_closest_frontier( start, verbose=False, + expand_size=self._default_expand_frontier_size, min_dist=self._frontier_min_dist, step_dist=self._frontier_step_dist, ) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 7557062d4..cfabf3d97 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -96,16 +96,16 @@ scene_graph: max_on_height: 0.2 # Navigation space - used for motion planning and computing goals. -step_size: 0.1 -rotation_step_size: 0.2 -dilate_frontier_size: 3 # Used to shrink the frontier back from the edges of the world -dilate_obstacle_size: 4 # Used when selecting goals and computing what the "frontier" is motion_planner: + step_size: 0.1 + rotation_step_size: 0.2 simplify_plans: True shortcut_plans: True shortcut_iter: 100 # Parameters for frontier exploration using the motion planner. frontier: + dilate_frontier_size: 3 # Used to shrink the frontier back from the edges of the world + dilate_obstacle_size: 5 # Used when selecting goals and computing what the "frontier" is default_expand_frontier_size: 10 # margin along the frontier where final robot position can be # Distance away you search for frontier points min_dist: 0.1 From 042d7d9ac989f1770815cbdaa25875fc65891d35 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 11:38:06 -0400 Subject: [PATCH 051/410] Update code to make sure we can get better tforms --- src/stretch/agent/robot_agent.py | 19 +++-- src/stretch/mapping/instance/instance_map.py | 52 ++++++++------ src/stretch/mapping/voxel/voxel.py | 27 ++++--- src/stretch/utils/point_cloud.py | 76 ++++++++++++++++++++ 4 files changed, 135 insertions(+), 39 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 9d1865f7d..e9cc7a7bc 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -34,7 +34,7 @@ from stretch.perception.encoders import BaseImageTextEncoder, get_encoder from stretch.perception.wrapper import OvmmPerception from stretch.utils.geometry import angle_difference -from stretch.utils.point_cloud import find_se3_transform +from stretch.utils.point_cloud import ransac_transform class RobotAgent: @@ -1125,10 +1125,12 @@ def run_exploration( else: # if it fails, try again or just quit if self._retry_on_fail: - print("Failed. Try again!") + print("Exploration failed. Try again!") continue else: - print("Failed. Quitting!") + print("Start = ", start) + print("Reason = ", res.reason) + print("Exploration failed. 
Quitting!") break # Error handling @@ -1356,6 +1358,8 @@ def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False filename(str): the name of the file to load the map from """ + debug = True + # Load the map from the file loaded_voxel_map = self._create_voxel_map(self.parameters) loaded_voxel_map.read_from_pickle(filename, perception=self.semantic_sensor) @@ -1363,7 +1367,8 @@ def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False xyz1, _, _, rgb1 = loaded_voxel_map.get_pointcloud() xyz2, _, _, rgb2 = self.voxel_map.get_pointcloud() - tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) + # tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) + tform = ransac_transform(xyz1, xyz2, visualize=debug) # Apply the transform to the loaded map xyz1 = xyz1 @ tform[0].T + tform[1] @@ -1373,11 +1378,13 @@ def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False from stretch.utils.point_cloud import show_point_cloud _xyz = np.concatenate([xyz1.cpu().numpy(), xyz2.cpu().numpy()], axis=0) - _rgb = np.concatenate([rgb1.cpu().numpy(), rgb2.cpu().numpy() * 0.5], axis=0) + _rgb = np.concatenate([rgb1.cpu().numpy(), rgb2.cpu().numpy() * 0.5], axis=0) / 255 show_point_cloud(_xyz, _rgb, orig=np.zeros(3)) + breakpoint() + # Add the loaded map to the current map - self.voxel_map.add_pointcloud(xyz1, rgb1) + # self.voxel_map.add_pointcloud(xyz1, rgb1) def get_detections(self, **kwargs) -> List[Instance]: """Get the current detections.""" diff --git a/src/stretch/mapping/instance/instance_map.py b/src/stretch/mapping/instance/instance_map.py index 9d1800fdd..63608aa43 100644 --- a/src/stretch/mapping/instance/instance_map.py +++ b/src/stretch/mapping/instance/instance_map.py @@ -576,6 +576,7 @@ def process_instances_for_env( background_instance_labels: List[int] = [0], valid_points: Optional[Tensor] = None, pose: Optional[Tensor] = None, + verbose: bool = False, ): """ Process instance information in the current frame and add instance views to the list of unprocessed views for future association. 
@@ -772,13 +773,15 @@ def process_instances_for_env( volume = float(box3d_volume_from_bounds(bounds).squeeze()) if volume < float(self.min_instance_vol): - logger.info( - f"Skipping box with {n_points} points in cloud and {n_points} points in mask and {volume} volume", - ) + if verbose: + logger.info( + f"Skipping box with {n_points} points in cloud and {n_points} points in mask and {volume} volume", + ) elif volume > float(self.max_instance_vol): - logger.info( - f"Skipping box with {n_points} points in cloud and {n_points} points in mask and {volume} volume", - ) + if verbose: + logger.info( + f"Skipping box with {n_points} points in cloud and {n_points} points in mask and {volume} volume", + ) elif ( min( bounds[0][1] - bounds[0][0], @@ -787,17 +790,20 @@ def process_instances_for_env( ) < self.min_instance_thickness ): - logger.info( - f"Skipping a flat instance with {n_points} points", - ) + if verbose: + logger.info( + f"Skipping a flat instance with {n_points} points", + ) elif (bounds[2][0] + bounds[2][1]) / 2.0 < self.min_instance_height: - logger.info( - f"Skipping a instance with low height: {(bounds[2][0] + bounds[2][1]) / 2.0}", - ) + if verbose: + logger.info( + f"Skipping a instance with low height: {(bounds[2][0] + bounds[2][1]) / 2.0}", + ) elif (bounds[2][0] + bounds[2][1]) / 2.0 > self.max_instance_height: - logger.info( - f"Skipping a instance with high height: {(bounds[2][0] + bounds[2][1]) / 2.0}", - ) + if verbose: + logger.info( + f"Skipping a instance with high height: {(bounds[2][0] + bounds[2][1]) / 2.0}", + ) else: # get instance view instance_view = InstanceView( @@ -820,15 +826,17 @@ def process_instances_for_env( self.unprocessed_views[env_id][instance_id.item()] = instance_view added = True else: - logger.info( - f"Skipping a small instance with {n_mask} pixels", - ) + if verbose: + logger.info( + f"Skipping a small instance with {n_mask} pixels", + ) t1 = timeit.default_timer() - if added: - print(f"Added Instance {instance_id} took {t1-t0} seconds") - else: - print(f"Skipped Instance {instance_id} took {t1-t0} seconds") + if verbose: + if added: + print(f"Added Instance {instance_id} took {t1-t0} seconds") + else: + print(f"Skipped Instance {instance_id} took {t1-t0} seconds") # This timestep should be passable (e.g. 
for Spot we have a Datetime object) self.timesteps[env_id] += 1 diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 8c3a5c42c..da10e078b 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -690,17 +690,22 @@ def read_from_pickle( return False for i, (camera_pose, K, rgb, feats, depth, base_pose, instance) in enumerate( - # TODO: compression of observations - # Right now we just do not support this - # data["obs"], TODO: compression of Observations - zip( - data["camera_poses"], - data["camera_K"], - data["rgb"], - data["feats"], - data["depth"], - data["base_poses"], - instance_data, + tqdm.tqdm( + # TODO: compression of observations + # Right now we just do not support this + # data["obs"], TODO: compression of Observations + zip( + data["camera_poses"], + data["camera_K"], + data["rgb"], + data["feats"], + data["depth"], + data["base_poses"], + instance_data, + ), + ncols=80, + desc="Reading data from pickle", + unit="frame", ) ): # Handle the case where we dont actually want to load everything diff --git a/src/stretch/utils/point_cloud.py b/src/stretch/utils/point_cloud.py index 713ec9bef..e722eb7d7 100644 --- a/src/stretch/utils/point_cloud.py +++ b/src/stretch/utils/point_cloud.py @@ -18,6 +18,7 @@ import open3d as o3d import torch import trimesh.transformations as tra +from scipy.spatial import cKDTree def numpy_to_pcd(xyz: np.ndarray, rgb: np.ndarray = None) -> o3d.geometry.PointCloud: @@ -422,9 +423,84 @@ def dropout_random_ellipses(depth_img, dropout_mean, gamma_shape=10000, gamma_sc import numpy as np +import open3d as o3d from scipy.spatial import cKDTree +def ransac_transform(source_xyz, target_xyz, visualize=False): + """ + Find the transformation between two point clouds using RANSAC. 
+ + Parameters: + source_xyz (np.ndarray): Source point cloud as a Nx3 numpy array + target_xyz (np.ndarray): Target point cloud as a Nx3 numpy array + visualize (bool): If True, visualize the registration result + + Returns: + np.ndarray: 4x4 transformation matrix + """ + # Convert numpy arrays to Open3D point clouds + source = o3d.geometry.PointCloud() + source.points = o3d.utility.Vector3dVector(source_xyz) + target = o3d.geometry.PointCloud() + target.points = o3d.utility.Vector3dVector(target_xyz) + + # Estimate normals + source.estimate_normals( + search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30) + ) + target.estimate_normals( + search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30) + ) + + # Compute FPFH features + source_fpfh = o3d.pipelines.registration.compute_fpfh_feature( + source, o3d.geometry.KDTreeSearchParamHybrid(radius=0.25, max_nn=100) + ) + target_fpfh = o3d.pipelines.registration.compute_fpfh_feature( + target, o3d.geometry.KDTreeSearchParamHybrid(radius=0.25, max_nn=100) + ) + + # Apply RANSAC registration + distance_threshold = 0.01 + + result = o3d.pipelines.registration.registration_ransac_based_on_feature_matching( + source, + target, + source_fpfh, + target_fpfh, + True, + distance_threshold, + o3d.pipelines.registration.TransformationEstimationPointToPoint(False), + 4, + [ + o3d.pipelines.registration.CorrespondenceCheckerBasedOnEdgeLength(0.9), + o3d.pipelines.registration.CorrespondenceCheckerBasedOnDistance(distance_threshold), + ], + o3d.pipelines.registration.RANSACConvergenceCriteria(4000000, 500), + ) + + # Visualize if flag is set + if visualize: + # Apply the transformation to the source point cloud + source_transformed = source.transform(result.transformation) + + # Color the point clouds + source_transformed.paint_uniform_color([1, 0, 0]) # Red for source + target.paint_uniform_color([0, 1, 0]) # Green for target + + # Visualize the result + o3d.visualization.draw_geometries( + [source_transformed, target], + window_name="RANSAC Registration Result", + width=1200, + height=800, + ) + + # Return the transformation matrix + return result.transformation + + def find_se3_transform( cloud1: Union[torch.Tensor, np.ndarray], cloud2: Union[torch.Tensor, np.ndarray], From 09f395d02ab13682884a4c0466e1fd65d92db58e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 11:39:52 -0400 Subject: [PATCH 052/410] udpate --- src/stretch/mapping/instance/matching.py | 7 +++++-- src/stretch/utils/point_cloud.py | 13 ++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/stretch/mapping/instance/matching.py b/src/stretch/mapping/instance/matching.py index b06a06bf1..79bdde7d8 100644 --- a/src/stretch/mapping/instance/matching.py +++ b/src/stretch/mapping/instance/matching.py @@ -171,6 +171,7 @@ def get_similarity( text_embedding1: Optional[Tensor] = None, text_embedding2: Optional[Tensor] = None, view_matching_config: ViewMatchingConfig = ViewMatchingConfig(), + verbose: bool = False, ): """Compute similarity based on bounding boxes for now""" # BBox similarity @@ -180,14 +181,16 @@ def get_similarity( overlap_eps=view_matching_config.box_overlap_eps, mode=view_matching_config.box_match_mode, ) - print(f"geometric similarity score: {overlap_similarity}") + if verbose: + print(f"geometric similarity score: {overlap_similarity}") similarity = overlap_similarity * view_matching_config.box_overlap_weight if view_matching_config.visual_similarity_weight > 0.0: visual_similarity = 
nn.CosineSimilarity(dim=1)( visual_embedding1, torch.stack(visual_embedding2, dim=0) ).unsqueeze(0) - print(f"visual similarity score: {visual_similarity}") + if verbose: + print(f"visual similarity score: {visual_similarity}") # Handle the case where there is no embedding to examine # If we return visual similarity, only then do we use it if visual_similarity is not None: diff --git a/src/stretch/utils/point_cloud.py b/src/stretch/utils/point_cloud.py index e722eb7d7..9148ba6b0 100644 --- a/src/stretch/utils/point_cloud.py +++ b/src/stretch/utils/point_cloud.py @@ -427,17 +427,18 @@ def dropout_random_ellipses(depth_img, dropout_mean, gamma_shape=10000, gamma_sc from scipy.spatial import cKDTree -def ransac_transform(source_xyz, target_xyz, visualize=False): +def ransac_transform(source_xyz, target_xyz, visualize=False, distance_threshold: float = 0.01): """ Find the transformation between two point clouds using RANSAC. Parameters: - source_xyz (np.ndarray): Source point cloud as a Nx3 numpy array - target_xyz (np.ndarray): Target point cloud as a Nx3 numpy array - visualize (bool): If True, visualize the registration result + source_xyz (np.ndarray): Source point cloud as a Nx3 numpy array + target_xyz (np.ndarray): Target point cloud as a Nx3 numpy array + visualize (bool): If True, visualize the registration result + distance_threshold (float): Maximum correspondence distance Returns: - np.ndarray: 4x4 transformation matrix + np.ndarray: 4x4 transformation matrix """ # Convert numpy arrays to Open3D point clouds source = o3d.geometry.PointCloud() @@ -462,8 +463,6 @@ def ransac_transform(source_xyz, target_xyz, visualize=False): ) # Apply RANSAC registration - distance_threshold = 0.01 - result = o3d.pipelines.registration.registration_ransac_based_on_feature_matching( source, target, From 84d1a088f4979db95d69fc1872bc8eebe707902b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 12:12:21 -0400 Subject: [PATCH 053/410] cleaning up terminal stuff --- src/stretch/agent/robot_agent.py | 1 + src/stretch/mapping/voxel/voxel.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index e9cc7a7bc..ea0e02d3a 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1369,6 +1369,7 @@ def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False # tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) tform = ransac_transform(xyz1, xyz2, visualize=debug) + breakpoint() # Apply the transform to the loaded map xyz1 = xyz1 @ tform[0].T + tform[1] diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index da10e078b..5dd229d5a 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -108,6 +108,7 @@ class SparseVoxelMap(object): mask_cropped_instances="False", ) debug_valid_depth: bool = False + debug_instance_memory_processing_time: bool = False def __init__( self, @@ -535,8 +536,9 @@ def add( ) t1 = timeit.default_timer() self.instances.associate_instances_to_memory() - t2 = timeit.default_timer() - print(__file__, ": Instance memory processing time: ", t1 - t0, t2 - t1) + if self.debug_instance_memory_processing_time: + t2 = timeit.default_timer() + print(__file__, ": Instance memory processing time: ", t1 - t0, t2 - t1) if self.prune_detected_objects: valid_depth = valid_depth & (instance_image == self.background_instance_label) From 360ba0a03233c3024afa9cf6dc897b5db1fe1c96 Mon Sep 17 
00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 12:26:09 -0400 Subject: [PATCH 054/410] Update stuff --- src/stretch/agent/robot_agent.py | 16 +++++++++++----- src/stretch/config/default_planner.yaml | 2 +- src/stretch/utils/point_cloud.py | 7 +++++-- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index ea0e02d3a..66e81a018 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1358,8 +1358,6 @@ def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False filename(str): the name of the file to load the map from """ - debug = True - # Load the map from the file loaded_voxel_map = self._create_voxel_map(self.parameters) loaded_voxel_map.read_from_pickle(filename, perception=self.semantic_sensor) @@ -1368,11 +1366,19 @@ def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False xyz2, _, _, rgb2 = self.voxel_map.get_pointcloud() # tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) - tform = ransac_transform(xyz1, xyz2, visualize=debug) - breakpoint() + tform, fitness, inlier_rmse, num_inliers = ransac_transform(xyz1, xyz2, visualize=debug) + if debug: + print("Aligning maps...") + print("RANSAC transform:") + print(tform) + print("Fitness:", fitness) + print("Inlier RMSE:", inlier_rmse) + print("Num inliers:", num_inliers) # Apply the transform to the loaded map - xyz1 = xyz1 @ tform[0].T + tform[1] + xyz1 = xyz1 @ tform[:3, :3].T + tform[:3, 3] + + debug = True if debug: # for visualization diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index cfabf3d97..923d2a16d 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -106,7 +106,7 @@ motion_planner: frontier: dilate_frontier_size: 3 # Used to shrink the frontier back from the edges of the world dilate_obstacle_size: 5 # Used when selecting goals and computing what the "frontier" is - default_expand_frontier_size: 10 # margin along the frontier where final robot position can be + default_expand_frontier_size: 15 # margin along the frontier where final robot position can be # Distance away you search for frontier points min_dist: 0.1 # Subsampling frontier space at this discretization diff --git a/src/stretch/utils/point_cloud.py b/src/stretch/utils/point_cloud.py index 9148ba6b0..6e09aa6c6 100644 --- a/src/stretch/utils/point_cloud.py +++ b/src/stretch/utils/point_cloud.py @@ -427,7 +427,7 @@ def dropout_random_ellipses(depth_img, dropout_mean, gamma_shape=10000, gamma_sc from scipy.spatial import cKDTree -def ransac_transform(source_xyz, target_xyz, visualize=False, distance_threshold: float = 0.01): +def ransac_transform(source_xyz, target_xyz, visualize=False, distance_threshold: float = 0.25): """ Find the transformation between two point clouds using RANSAC. 
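The hunks in this patch bring the registration helper in line with how the agent above now calls it. A minimal usage sketch, assuming only the names shown in this series (the .npy paths are placeholders, not files in the repository):

import numpy as np
from stretch.utils.point_cloud import ransac_transform

# Two overlapping (N, 3) scans of the same scene, in meters.
source_xyz = np.load("map_old_points.npy")  # placeholder path
target_xyz = np.load("map_new_points.npy")  # placeholder path

tform, fitness, inlier_rmse, num_inliers = ransac_transform(
    source_xyz, target_xyz, visualize=False, distance_threshold=0.25
)

# tform is a homogeneous 4x4 matrix; apply it to move source into the target frame.
tform = np.asarray(tform)
aligned_source = source_xyz @ tform[:3, :3].T + tform[:3, 3]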
@@ -439,6 +439,9 @@ def ransac_transform(source_xyz, target_xyz, visualize=False, distance_threshold Returns: np.ndarray: 4x4 transformation matrix + float: Fitness score + float: Inlier RMSE + int: Number of inliers found """ # Convert numpy arrays to Open3D point clouds source = o3d.geometry.PointCloud() @@ -497,7 +500,7 @@ def ransac_transform(source_xyz, target_xyz, visualize=False, distance_threshold ) # Return the transformation matrix - return result.transformation + return result.transformation, result.fitness, result.inlier_rmse, len(result.correspondence_set) def find_se3_transform( From 694c513a21935c24d10c533feacd3a99a0449f7e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 12:28:26 -0400 Subject: [PATCH 055/410] updates --- src/stretch/agent/robot_agent.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 66e81a018..62cbcd474 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -304,6 +304,11 @@ def get_navigation_space(self) -> ConfigurationSpace: """Returns reference to the navigation space.""" return self.space + @property + def navigation_space(self) -> ConfigurationSpace: + """Returns reference to the navigation space.""" + return self.space + def place_object(self, object_goal: Optional[str] = None, **kwargs) -> bool: """Try to place an object.""" if not self.robot.in_manipulation_mode(): @@ -1158,7 +1163,7 @@ def run_exploration( # After doing everything - show where we will move to robot_center = np.zeros(3) robot_center[:2] = self.robot.get_base_pose()[:2] - self.voxel_map.show( + self.navigation_space.show( orig=robot_center, xyt=self.robot.get_base_pose(), footprint=self.robot.get_robot_model().get_footprint(), From 9a3ecfe03c9c8f39cdbd8e95a42ae281b7c8d3de Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 12:46:44 -0400 Subject: [PATCH 056/410] some code cleanup --- src/stretch/agent/robot_agent.py | 2 -- src/stretch/app/read_map.py | 2 -- src/stretch/app/rerun.py | 10 ++++++++++ src/stretch/app/vlm_planning.py | 2 -- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 62cbcd474..5f558148e 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -51,7 +51,6 @@ def __init__( semantic_sensor: Optional[OvmmPerception] = None, grasp_client: Optional[AbstractGraspClient] = None, voxel_map: Optional[SparseVoxelMap] = None, - rpc_stub=None, debug_instances: bool = True, show_instances_detected: bool = False, use_instance_memory: bool = False, @@ -64,7 +63,6 @@ def __init__( else: raise RuntimeError(f"parameters of unsupported type: {type(parameters)}") self.robot = robot - self.rpc_stub = rpc_stub self.grasp_client = grasp_client self.debug_instances = debug_instances self.show_instances_detected = show_instances_detected diff --git a/src/stretch/app/read_map.py b/src/stretch/app/read_map.py index 7d7f33c33..bd6ce5608 100644 --- a/src/stretch/app/read_map.py +++ b/src/stretch/app/read_map.py @@ -182,8 +182,6 @@ def main( dummy_robot, parameters, semantic_sensor=semantic_sensor, - rpc_stub=None, - grasp_client=None, voxel_map=loaded_voxel_map, use_instance_memory=(run_segmentation or show_instances), ) diff --git a/src/stretch/app/rerun.py b/src/stretch/app/rerun.py index 870379ef4..730c5f4ae 100644 --- a/src/stretch/app/rerun.py +++ b/src/stretch/app/rerun.py @@ -13,20 +13,25 @@ import click +from 
stretch.agent import RobotAgent from stretch.agent.zmq_client import HomeRobotZmqClient +from stretch.core import get_parameters @click.command() @click.option("--robot_ip", default="", help="IP address of the robot") +@click.option("--test", is_flag=True, help="Set if we are testing the planner") @click.option( "--local", is_flag=True, + test=False, help="Set if we are executing on the robot and not on a remote computer", ) def main( robot_ip: str = "192.168.1.15", local: bool = False, parameter_file: str = "config/default_planner.yaml", + test: bool = False, ): # Create robot robot = HomeRobotZmqClient( @@ -35,6 +40,11 @@ def main( enable_rerun_server=True, ) robot.start() + if test: + parameters = get_parameters(parameter_file) + agent = RobotAgent(robot, parameters) + robot.update() + robot.update_rerun() try: while True: time.sleep(0.01) diff --git a/src/stretch/app/vlm_planning.py b/src/stretch/app/vlm_planning.py index 1c8a04e07..e7a2067a0 100644 --- a/src/stretch/app/vlm_planning.py +++ b/src/stretch/app/vlm_planning.py @@ -156,8 +156,6 @@ def main( agent = RobotAgent( dummy_robot, parameters, - rpc_stub=None, - grasp_client=None, voxel_map=loaded_voxel_map, semantic_sensor=semantic_sensor, ) From 6964d02401d705c71e42548d9a20a51e5916920c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 12:49:08 -0400 Subject: [PATCH 057/410] update the world --- src/stretch/app/rerun.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/stretch/app/rerun.py b/src/stretch/app/rerun.py index 730c5f4ae..45aa2e28c 100644 --- a/src/stretch/app/rerun.py +++ b/src/stretch/app/rerun.py @@ -10,6 +10,7 @@ # license information maybe found below, if so. import time +import timeit import click @@ -20,18 +21,17 @@ @click.command() @click.option("--robot_ip", default="", help="IP address of the robot") -@click.option("--test", is_flag=True, help="Set if we are testing the planner") @click.option( "--local", is_flag=True, - test=False, help="Set if we are executing on the robot and not on a remote computer", ) +@click.option("--test-svm", is_flag=True, help="Set if we are testing the planner") def main( robot_ip: str = "192.168.1.15", local: bool = False, - parameter_file: str = "config/default_planner.yaml", - test: bool = False, + parameter_file: str = "default_planner.yaml", + test_svm: bool = False, ): # Create robot robot = HomeRobotZmqClient( @@ -40,11 +40,14 @@ def main( enable_rerun_server=True, ) robot.start() - if test: + if test_svm: parameters = get_parameters(parameter_file) agent = RobotAgent(robot, parameters) - robot.update() + agent.update() + t0 = timeit.default_timer() robot.update_rerun() + t1 = timeit.default_timer() + print(f"Time to update rerun: {t1 - t0}") try: while True: time.sleep(0.01) From 808647823e062ee8ecf7609fc85b93011a9fbac1 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 13:10:30 -0400 Subject: [PATCH 058/410] fixing rerun gui while exploring the world --- src/stretch/agent/robot_agent.py | 2 +- src/stretch/app/rerun.py | 2 +- src/stretch/visualization/rerun.py | 74 ++++++++++++++++++++++++------ 3 files changed, 61 insertions(+), 17 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 5f558148e..4500e40c0 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -42,7 +42,7 @@ class RobotAgent: _retry_on_fail: bool = False debug_update_timing: bool = False - update_rerun_every_time: bool = False + update_rerun_every_time: bool = 
True def __init__( self, diff --git a/src/stretch/app/rerun.py b/src/stretch/app/rerun.py index 45aa2e28c..29398d246 100644 --- a/src/stretch/app/rerun.py +++ b/src/stretch/app/rerun.py @@ -45,7 +45,7 @@ def main( agent = RobotAgent(robot, parameters) agent.update() t0 = timeit.default_timer() - robot.update_rerun() + agent.update_rerun() t1 = timeit.default_timer() print(f"Time to update rerun: {t1 - t0}") try: diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 89a146089..8d4d5c5c2 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -10,11 +10,13 @@ # license information maybe found below, if so. import time -from typing import Optional, Tuple +import timeit +from typing import Optional, Tuple, Union import numpy as np import rerun as rr import rerun.blueprint as rrb +import torch from stretch.mapping.scene_graph import SceneGraph from stretch.mapping.voxel.voxel_map import SparseVoxelMapNavigationSpace @@ -41,9 +43,34 @@ def decompose_homogeneous_matrix(homogeneous_matrix: np.ndarray) -> Tuple[np.nda return rotation_matrix, translation_vector +def occupancy_map_to_indices(occupancy_map): + """ + Convert a 2D occupancy map to an Nx3 array of float indices of occupied cells. + + Args: + occupancy_map (np.ndarray): 2D boolean array where True represents occupied cells. + + Returns: + np.ndarray: Nx3 float array where each row is [x, y, 0] of an occupied cell. + """ + # Find the indices of occupied cells + occupied_indices = np.where(occupancy_map) + + # Create the Nx3 array + num_points = len(occupied_indices[0]) + xyz_array = np.zeros((num_points, 3), dtype=float) + + # Fill in x and y coordinates + xyz_array[:, 0] = occupied_indices[0] # x coordinates + xyz_array[:, 1] = occupied_indices[1] # y coordinates + # z coordinates are already 0 + + return xyz_array + + def occupancy_map_to_3d_points( occupancy_map: np.ndarray, - grid_center: np.ndarray, + grid_center: Union[np.ndarray, torch.Tensor], grid_resolution: float, offset: Optional[np.ndarray] = np.zeros(3), ) -> np.ndarray: @@ -62,18 +89,12 @@ def occupancy_map_to_3d_points( rows, cols = occupancy_map.shape center_row, center_col, _ = grid_center - for i in range(rows): - for j in range(cols): - if occupancy_map[i][j]: - x = (i - center_row) * grid_resolution - y = (j - center_col) * grid_resolution - z = 0 # Assuming the map is 2D, so z is always 0 - x = x + offset[0] - y = y + offset[1] - z = z + offset[2] - points.append(np.array([x, y, z])) + if isinstance(grid_center, torch.Tensor): + grid_center = grid_center.cpu().numpy() - return np.array(points) + indices = occupancy_map_to_indices(occupancy_map) + points = (indices - grid_center) * grid_resolution + offset + return points class RerunVsualizer: @@ -190,12 +211,14 @@ def log_robot_state(self, obs): for k in HelloStretchIdx.name_to_idx: rr.log(f"robot_state/joint_pose/{k}", rr.Scalar(state[HelloStretchIdx.name_to_idx[k]])) - def update_voxel_map(self, space: SparseVoxelMapNavigationSpace): - """Log voxel map + def update_voxel_map(self, space: SparseVoxelMapNavigationSpace, debug: bool = False): + """Log voxel map and send it to Rerun visualizer Args: space (SparseVoxelMapNavigationSpace): Voxel map object """ rr.set_time_seconds("realtime", time.time()) + + t0 = timeit.default_timer() points, _, _, rgb = space.voxel_map.voxel_pcd.get_pointcloud() if rgb is None: return @@ -204,13 +227,25 @@ def update_voxel_map(self, space: SparseVoxelMapNavigationSpace): "world/point_cloud", 
rr.Points3D(positions=points, radii=np.ones(rgb.shape[0]) * 0.01, colors=np.int64(rgb)), ) + + t1 = timeit.default_timer() grid_origin = space.voxel_map.grid_origin + t2 = timeit.default_timer() obstacles, explored = space.voxel_map.get_2d_map() + t3 = timeit.default_timer() + + # Get obstacles and explored points grid_resolution = space.voxel_map.grid_resolution obs_points = np.array(occupancy_map_to_3d_points(obstacles, grid_origin, grid_resolution)) + t4 = timeit.default_timer() + + # Get explored points explored_points = np.array( occupancy_map_to_3d_points(explored, grid_origin, grid_resolution) ) + t5 = timeit.default_timer() + + # Log points rr.log( "world/obstacles", rr.Points3D( @@ -225,6 +260,15 @@ def update_voxel_map(self, space: SparseVoxelMapNavigationSpace): colors=[255, 255, 255], ), ) + t6 = timeit.default_timer() + + if debug: + print("Time to get point cloud: ", t1 - t0, "% = ", (t1 - t0) / (t6 - t0)) + print("Time to get grid origin: ", t2 - t1, "% = ", (t2 - t1) / (t6 - t0)) + print("Time to get 2D map: ", t3 - t2, "% = ", (t3 - t2) / (t6 - t0)) + print("Time to get obstacles points: ", t4 - t3, "% = ", (t4 - t3) / (t6 - t0)) + print("Time to get explored points: ", t5 - t4, "% = ", (t5 - t4) / (t6 - t0)) + print("Time to log points: ", t6 - t5, "% = ", (t6 - t5) / (t6 - t0)) def update_scene_graph( self, scene_graph: SceneGraph, semantic_sensor: Optional[OvmmPerception] = None From f1e7794310031772fbc85b3ae1a83ddbad3158cb Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 13:48:57 -0400 Subject: [PATCH 059/410] updates to fix voxel issues --- src/stretch/config/default_planner.yaml | 2 +- src/stretch/mapping/voxel/voxel.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 923d2a16d..695035aaf 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -8,7 +8,7 @@ open_vocab_category_map_file: example_cat_map.json tts_engine: "gTTS" # Sparse Voxel Map parameters -voxel_size: 0.05 +voxel_size: 0.01 # Size of a voxel in meters # 0.05 seems too low obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 5dd229d5a..419175211 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -1155,6 +1155,7 @@ def from_parameters( return SparseVoxelMap( resolution=voxel_size, local_radius=parameters["local_radius"], + grid_resolution=parameters["voxel_size"], obs_min_height=parameters["obs_min_height"], obs_max_height=parameters["obs_max_height"], neg_obs_height=parameters["neg_obs_height"], From ab0b2234d156f65d8dcec69a103feb1d293d001c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 13:52:55 -0400 Subject: [PATCH 060/410] Updating things --- src/stretch/config/default_planner.yaml | 2 +- src/stretch/mapping/voxel/voxel.py | 2 ++ src/stretch/utils/voxel.py | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 695035aaf..a8d840aeb 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -8,7 +8,7 @@ open_vocab_category_map_file: example_cat_map.json tts_engine: "gTTS" # Sparse Voxel Map parameters -voxel_size: 0.01 # Size of a voxel 
in meters +voxel_size: 0.05 # Size of a voxel in meters # 0.05 seems too low obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 419175211..bce86946f 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -551,7 +551,9 @@ def add( # TODO: weights could also be confidence, inv distance from camera, etc if world_xyz.nelement() > 0: + print("Adding this many points to the map:", world_xyz.shape) self.voxel_pcd.add(world_xyz, features=feats, rgb=rgb, weights=None) + print("Added points to the map:", self.voxel_pcd.num_points) if self._add_local_radius_points and len(self.observations) < 2: # Only do this at the first step, never after it. diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index 0aa0818d3..d60e1219b 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -269,6 +269,26 @@ def get_pointcloud(self) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """ return self._points, self._features, self._weights, self._rgb + @property + def points(self) -> Tensor: + return self._points + + @property + def features(self) -> Tensor: + return self._features + + @property + def weights(self) -> Tensor: + return self._weights + + @property + def rgb(self) -> Tensor: + return self._rgb + + @property + def num_points(self) -> int: + return len(self._points) + def clone(self): """ Deep copy of object. All internal tensors are cloned individually. From 04dd5939e895bfa58a385de9fe82a36dc4e82577 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 14:00:58 -0400 Subject: [PATCH 061/410] updates to voxel cfg --- src/stretch/utils/voxel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index d60e1219b..6e2c24ecb 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -202,6 +202,7 @@ def add( cluster_voxel_idx, cluster_consecutive_idx, _ = voxelize( all_points, voxel_size=self.voxel_size, start=self._mins, end=self._maxs ) + self._points, self._features, self._weights, self._rgb = reduce_pointcloud( cluster_consecutive_idx, pos=all_points, From dc10af78a40e6d6c4315b367a6d78239ed6130bb Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 14:34:59 -0400 Subject: [PATCH 062/410] updates to config --- src/stretch/config/default_planner.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index a8d840aeb..0d782374b 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -8,12 +8,12 @@ open_vocab_category_map_file: example_cat_map.json tts_engine: "gTTS" # Sparse Voxel Map parameters -voxel_size: 0.05 # Size of a voxel in meters +voxel_size: 0.04 # Size of a voxel in meters # 0.05 seems too low obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles -obs_min_density: 10 # This many points makes it an obstacle +obs_min_density: 15 # This many points makes it an obstacle # Padding pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles @@ -104,9 +104,9 @@ motion_planner: shortcut_iter: 100 # Parameters for frontier exploration using 
the motion planner. frontier: - dilate_frontier_size: 3 # Used to shrink the frontier back from the edges of the world + dilate_frontier_size: 4 # Used to shrink the frontier back from the edges of the world dilate_obstacle_size: 5 # Used when selecting goals and computing what the "frontier" is - default_expand_frontier_size: 15 # margin along the frontier where final robot position can be + default_expand_frontier_size: 10 # margin along the frontier where final robot position can be # Distance away you search for frontier points min_dist: 0.1 # Subsampling frontier space at this discretization From 97de0dd655062789bb6c4080b4064e9d33273936 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 15:11:44 -0400 Subject: [PATCH 063/410] Update camera pose and other information --- src/stretch/agent/robot_agent.py | 10 +++++----- src/stretch/mapping/voxel/voxel.py | 12 +++++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 4500e40c0..4b459679a 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1381,8 +1381,6 @@ def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False # Apply the transform to the loaded map xyz1 = xyz1 @ tform[:3, :3].T + tform[:3, 3] - debug = True - if debug: # for visualization from stretch.utils.point_cloud import show_point_cloud @@ -1391,10 +1389,12 @@ def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False _rgb = np.concatenate([rgb1.cpu().numpy(), rgb2.cpu().numpy() * 0.5], axis=0) / 255 show_point_cloud(_xyz, _rgb, orig=np.zeros(3)) - breakpoint() - # Add the loaded map to the current map - # self.voxel_map.add_pointcloud(xyz1, rgb1) + print("Reprocessing map with new pose transform...") + self.voxel_map.read_from_pickle( + filename, perception=self.semantic_sensor, transform_pose=tform + ) + self.voxel_map.show() def get_detections(self, **kwargs) -> List[Instance]: """Get the current detections.""" diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index bce86946f..b005a79e7 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -551,9 +551,9 @@ def add( # TODO: weights could also be confidence, inv distance from camera, etc if world_xyz.nelement() > 0: - print("Adding this many points to the map:", world_xyz.shape) + # print("Adding this many points to the map:", world_xyz.shape) self.voxel_pcd.add(world_xyz, features=feats, rgb=rgb, weights=None) - print("Added points to the map:", self.voxel_pcd.num_points) + # print("Added points to the map:", self.voxel_pcd.num_points) if self._add_local_radius_points and len(self.observations) < 2: # Only do this at the first step, never after it. @@ -664,7 +664,11 @@ def fix_data_type(self, tensor) -> torch.Tensor: raise NotImplementedError("unsupported data type for tensor:", tensor) def read_from_pickle( - self, filename: str, num_frames: int = -1, perception: Optional[OvmmPerception] = None + self, + filename: str, + num_frames: int = -1, + perception: Optional[OvmmPerception] = None, + transform_pose: Optional[torch.Tensor] = None, ) -> bool: """Read from a pickle file as above. 
Will clear all currently stored data first.""" self.reset_cache() @@ -726,6 +730,8 @@ def read_from_pickle( continue camera_pose = self.fix_data_type(camera_pose) + if transform_pose is not None: + camera_pose = camera_pose @ transform_pose if compressed: rgb = compression.from_jpg(rgb) depth = compression.from_jp2(depth) / 1000.0 From 9f064f788175c390b916d7967f25299dfdd9a0f2 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 15:21:55 -0400 Subject: [PATCH 064/410] update the code to make sure we can capture and load a voxel map --- src/stretch/agent/robot_agent.py | 1 + src/stretch/mapping/voxel/voxel.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 4b459679a..c28f12811 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1390,6 +1390,7 @@ def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False show_point_cloud(_xyz, _rgb, orig=np.zeros(3)) # Add the loaded map to the current map + self.voxel_map.show() print("Reprocessing map with new pose transform...") self.voxel_map.read_from_pickle( filename, perception=self.semantic_sensor, transform_pose=tform diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index b005a79e7..35667bf31 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -669,9 +669,19 @@ def read_from_pickle( num_frames: int = -1, perception: Optional[OvmmPerception] = None, transform_pose: Optional[torch.Tensor] = None, + reset: bool = False, ) -> bool: - """Read from a pickle file as above. Will clear all currently stored data first.""" - self.reset_cache() + """Read from a pickle file as above. Will clear all currently stored data first. + + Args: + filename(str): path to the pickle file + num_frames(int): number of frames to read from the file + perception(OvmmPerception): perception model to use for instance segmentation + transform_pose(torch.Tensor): transformation to apply to camera poses + reset(bool): whether to clear all currently stored data first + """ + if reset: + self.reset_cache() if isinstance(filename, str): filename = Path(filename) assert filename.exists(), f"No file found at {filename}" From 132924be9d5427ae9ea53a449e015d2cd026bbfc Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 15:42:40 -0400 Subject: [PATCH 065/410] update voxel loading code a bit --- src/stretch/agent/robot_agent.py | 37 +++++++++++++++++++----------- src/stretch/mapping/voxel/voxel.py | 7 ++++-- src/stretch/utils/point_cloud.py | 16 ++++++++++--- 3 files changed, 41 insertions(+), 19 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index c28f12811..7f0e3c9a7 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1354,7 +1354,13 @@ def wave(self, **kwargs) -> bool: self.robot.switch_to_navigation_mode() - def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False) -> None: + def load_map( + self, + filename: str, + color_weight: float = 0.5, + debug: bool = False, + ransac_distance_threshold: float = 0.1, + ) -> None: """Load a map from a PKL file. Creates a new voxel map and loads the data from the file into this map. Then uses RANSAC to figure out where the current map and the loaded map overlap, computes a transform, and applies this transform to the loaded map to align it with the current map. 
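Because ransac_transform also reports fitness, inlier RMSE, and an inlier count, a caller could refuse to merge maps when the alignment looks weak. A hedged sketch with purely illustrative thresholds that are not taken from this codebase:

from stretch.utils.point_cloud import ransac_transform

def align_or_fail(xyz_loaded, xyz_live, min_fitness=0.3, min_inliers=500, max_rmse=0.05):
    """Return the alignment transform, or raise if the match looks unreliable."""
    tform, fitness, inlier_rmse, num_inliers = ransac_transform(xyz_loaded, xyz_live)
    if fitness < min_fitness or num_inliers < min_inliers or inlier_rmse > max_rmse:
        raise RuntimeError(
            f"Map alignment rejected: fitness={fitness:.2f}, "
            f"rmse={inlier_rmse:.3f}, inliers={num_inliers}"
        )
    return tform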
Args: @@ -1369,28 +1375,31 @@ def load_map(self, filename: str, color_weight: float = 0.5, debug: bool = False xyz2, _, _, rgb2 = self.voxel_map.get_pointcloud() # tform = find_se3_transform(xyz1, xyz2, rgb1, rgb2) - tform, fitness, inlier_rmse, num_inliers = ransac_transform(xyz1, xyz2, visualize=debug) - if debug: - print("Aligning maps...") - print("RANSAC transform:") - print(tform) - print("Fitness:", fitness) - print("Inlier RMSE:", inlier_rmse) - print("Num inliers:", num_inliers) - - # Apply the transform to the loaded map - xyz1 = xyz1 @ tform[:3, :3].T + tform[:3, 3] - + tform, fitness, inlier_rmse, num_inliers = ransac_transform( + xyz1, xyz2, visualize=debug, distance_threshold=ransac_distance_threshold + ) + print("------------- Loaded map ---------------") + print("Aligning maps...") + print("RANSAC transform from old to new map:") + print(tform) + print("Fitness:", fitness) + print("Inlier RMSE:", inlier_rmse) + print("Num inliers:", num_inliers) + print("----------------------------------------") + + # Apply the transform to the loaded map and if debug: # for visualization from stretch.utils.point_cloud import show_point_cloud + # Apply the transform to the loaded map + xyz1 = xyz1 @ tform[:3, :3].T + tform[:3, 3] + _xyz = np.concatenate([xyz1.cpu().numpy(), xyz2.cpu().numpy()], axis=0) _rgb = np.concatenate([rgb1.cpu().numpy(), rgb2.cpu().numpy() * 0.5], axis=0) / 255 show_point_cloud(_xyz, _rgb, orig=np.zeros(3)) # Add the loaded map to the current map - self.voxel_map.show() print("Reprocessing map with new pose transform...") self.voxel_map.read_from_pickle( filename, perception=self.semantic_sensor, transform_pose=tform diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 35667bf31..6077f1343 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -375,6 +375,7 @@ def add( instance_scores: Optional[Tensor] = None, obs: Optional[Observations] = None, xyz_frame: str = "camera", + pose_correction: Optional[Tensor] = None, **info, ): """Add this to our history of observations. Also update the current running map. 
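The pose_correction argument added here is a homogeneous map-frame correction. Applying it to the back-projected points, as the hunk below does, is equivalent to left-multiplying each stored camera pose; a small sketch of the two forms, with illustrative variable names:

import numpy as np

T_correction = np.eye(4)         # old map frame -> current map frame
points_old = np.zeros((100, 3))  # (N, 3) points expressed in the old map frame
camera_pose_old = np.eye(4)      # 4x4 camera-to-old-map pose

# Option 1: correct the points directly (what the hunk below does).
points_new = points_old @ T_correction[:3, :3].T + T_correction[:3, 3]

# Option 2: correct the camera pose, then back-project with the corrected pose.
camera_pose_new = T_correction @ camera_pose_old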
@@ -468,6 +469,9 @@ def add( inv_intrinsics=torch.linalg.inv(camera_K[:3, :3]).unsqueeze(0), ) + if pose_correction is not None: + full_world_xyz = full_world_xyz @ pose_correction[:3, :3].T + pose_correction[:3, 3] + # add observations before we start changing things self.observations.append( Frame( @@ -740,8 +744,6 @@ def read_from_pickle( continue camera_pose = self.fix_data_type(camera_pose) - if transform_pose is not None: - camera_pose = camera_pose @ transform_pose if compressed: rgb = compression.from_jpg(rgb) depth = compression.from_jp2(depth) / 1000.0 @@ -783,6 +785,7 @@ def read_from_pickle( instance_classes=instance_classes, instance_scores=instance_scores, camera_K=K, + pose_correction=transform_pose, ) return True diff --git a/src/stretch/utils/point_cloud.py b/src/stretch/utils/point_cloud.py index 6e09aa6c6..aa70dac83 100644 --- a/src/stretch/utils/point_cloud.py +++ b/src/stretch/utils/point_cloud.py @@ -449,6 +449,7 @@ def ransac_transform(source_xyz, target_xyz, visualize=False, distance_threshold target = o3d.geometry.PointCloud() target.points = o3d.utility.Vector3dVector(target_xyz) + """ # Estimate normals source.estimate_normals( search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30) @@ -474,12 +475,21 @@ def ransac_transform(source_xyz, target_xyz, visualize=False, distance_threshold True, distance_threshold, o3d.pipelines.registration.TransformationEstimationPointToPoint(False), - 4, - [ + ransac_n=4, + checkers=[ o3d.pipelines.registration.CorrespondenceCheckerBasedOnEdgeLength(0.9), o3d.pipelines.registration.CorrespondenceCheckerBasedOnDistance(distance_threshold), ], - o3d.pipelines.registration.RANSACConvergenceCriteria(4000000, 500), + criteria=o3d.pipelines.registration.RANSACConvergenceCriteria(max_iteration=5000000, confidence=0.9999), + ) + """ + + result = o3d.pipelines.registration.registration_icp( + source, + target, + distance_threshold, + np.eye(4), + o3d.pipelines.registration.TransformationEstimationPointToPoint(), ) # Visualize if flag is set From a8b365cca5c1ecd6af84f64666b5ef233111df9a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 17:50:30 -0400 Subject: [PATCH 066/410] Add dinobot template --- src/stretch/perception/manipulation/__init__.py | 8 ++++++++ src/stretch/perception/manipulation/dinobot.py | 8 ++++++++ 2 files changed, 16 insertions(+) create mode 100644 src/stretch/perception/manipulation/__init__.py create mode 100644 src/stretch/perception/manipulation/dinobot.py diff --git a/src/stretch/perception/manipulation/__init__.py b/src/stretch/perception/manipulation/__init__.py new file mode 100644 index 000000000..2532abc5e --- /dev/null +++ b/src/stretch/perception/manipulation/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. diff --git a/src/stretch/perception/manipulation/dinobot.py b/src/stretch/perception/manipulation/dinobot.py new file mode 100644 index 000000000..2532abc5e --- /dev/null +++ b/src/stretch/perception/manipulation/dinobot.py @@ -0,0 +1,8 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. 
+# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. From 97e26d968b15936a025198a7985202dc4a466df4 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 17:50:46 -0400 Subject: [PATCH 067/410] move to a better location --- src/stretch/{perception => agent}/manipulation/__init__.py | 0 src/stretch/{perception => agent}/manipulation/dinobot.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/stretch/{perception => agent}/manipulation/__init__.py (100%) rename src/stretch/{perception => agent}/manipulation/dinobot.py (100%) diff --git a/src/stretch/perception/manipulation/__init__.py b/src/stretch/agent/manipulation/__init__.py similarity index 100% rename from src/stretch/perception/manipulation/__init__.py rename to src/stretch/agent/manipulation/__init__.py diff --git a/src/stretch/perception/manipulation/dinobot.py b/src/stretch/agent/manipulation/dinobot.py similarity index 100% rename from src/stretch/perception/manipulation/dinobot.py rename to src/stretch/agent/manipulation/dinobot.py From 62341228d7eb9a35a04703db00c02faebc30f3d7 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 9 Sep 2024 18:12:20 -0400 Subject: [PATCH 068/410] setting up the first pass at a dinobot config --- src/stretch/agent/manipulation/dinobot.py | 227 ++++++++++++++++++++++ src/stretch/agent/zmq_client.py | 31 +++ 2 files changed, 258 insertions(+) diff --git a/src/stretch/agent/manipulation/dinobot.py b/src/stretch/agent/manipulation/dinobot.py index 2532abc5e..5ecbeeaf1 100644 --- a/src/stretch/agent/manipulation/dinobot.py +++ b/src/stretch/agent/manipulation/dinobot.py @@ -6,3 +6,230 @@ # # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. + +import glob + +import click +import matplotlib.pyplot as plt +import numpy as np +import torch + +# Install this DINO repo to extract correspondences: https://github.com/ShirAmir/dino-vit-features +from correspondences import draw_correspondences, find_correspondences +from PIL import Image + +from stretch.agent import HomeRobotZmqClient + +# Hyperparameters for DINO correspondences extraction +# num_pairs = 8 +# load_size = 224 +# layer = 9 +# facet = "key" +# bin = True +# thresh = 0.05 +# model_type = "dino_vits8" +# stride = 4 + +# @markdown Choose number of points to output: +num_pairs = 10 # @param +# @markdown Choose loading size: +load_size = 224 # @param +# @markdown Choose layer of descriptor: +layer = 9 # @param +# @markdown Choose facet of descriptor: +facet = "key" # @param +# @markdown Choose if to use a binned descriptor: +bin = True # @param +# @markdown Choose fg / bg threshold: +thresh = 0.05 # @param +# @markdown Choose model type: +model_type = "dino_vits8" # @param +# @markdown Choose stride: +stride = 4 # @param + + +# Deployment hyperparameters +ERR_THRESHOLD = 50 # A generic error between the two sets of points + + +def load_demo(path_to_demo_folder): + # Load a demonstration from a folder containing a set of deltas. + # The deltas are stored as numpy arrays. 
+ demo_deltas = [] + for filename in glob.glob(path_to_demo_folder): + delta = np.load(filename) + demo_deltas.append(delta) + + # pull out the first frame in the videos for the bottleneck images + rgb_bn = Image.open("bottleneck_rgb.png") + depth_bn = Image.open("bottleneck_depth.png") + + return rgb_bn, depth_bn, demo_deltas + + +def show_correspondences(points1, points2, image1_pil, image2_pil): + fig_1, ax1 = plt.subplots() + ax1.axis("off") + ax1.imshow(image1_pil) + fig_2, ax2 = plt.subplots() + ax2.axis("off") + ax2.imshow(image2_pil) + + fig1, fig2 = draw_correspondences(points1, points2, image1_pil, image2_pil) + plt.show() + + +def find_transformation(X, Y): + """ + Given two sets of 3d points, find the rigid transformation that aligns them. + + Args: + X: np.array of shape (n, 3) representing the first set of 3d points + Y: np.array of shape (n, 3) representing the second set of 3d points + + Returns: + R: np.array of shape (3, 3) representing the rotation matrix + t: np.array of shape (3,) representing the translation vector + """ + + # Calculate the centroids of the two sets of points + cX = np.mean(X, axis=0) + cY = np.mean(Y, axis=0) + + # Subtract the centroids to obtain two centered sets of points + Xc = X - cX + Yc = Y - cY + + # Calculate the covariance matrix + C = np.dot(Xc.T, Yc) + + # Compute SVD + U, S, Vt = np.linalg.svd(C) + + # Determine the rotation matrix + R = np.dot(Vt.T, U.T) + + # Determine translation vector + t = cY - np.dot(R, cX) + return R, t + + +def extract_3d_coordinates(points, xyz): + """ + Given a set of 2d points and a 3d point cloud, extract the 3d coordinates of the points. + + Args: + points: np.array of shape (n, 2) representing the 2d points + xyz: np.array of shape (h, w, 3) representing the 3d point cloud + + Returns: + np.array of shape (n, 3) representing the 3d coordinates of the points + + """ + # Extract the depth values of the points from the 3d point cloud + depths = [] + for point in points: + x, y = point + depths.append(xyz[y, x, 2]) + + # Create a new array of shape (n, 3) with the 3d coordinates + points_3d = [] + for i, point in enumerate(points): + x, y = point + points_3d.append([x, y, depths[i]]) + + return np.array(points_3d) + + +def compute_error(points1: np.ndarray, points2: np.ndarray) -> float: + """Compute the error between two sets of points. + + Args: + points1: np.array of shape (n, 3) representing the first set of 3d points + points2: np.array of shape (n, 3) representing the second set of 3d points + + Returns: + float: The error between the two sets of points + """ + return np.linalg.norm(np.array(points1) - np.array(points2)) + + +def replay_demo(robot, demo_deltas): + # Replay a demonstration by moving the robot according to the deltas. + breakpoint() + for delta in demo_deltas: + robot.move(delta) + + +def dinobot_demo(robot, path_to_demo_folder): + # RECORD DEMO: + # Move the end-effector to the bottleneck pose and store the initial observation. + + # Record a demonstration. A demonstration is a set of deltas between the bottleneck pose and the current pose. + rgb_bn, depth_bn, demo_deltas = load_demo(path_to_demo_folder) + xyz_bn = robot.depth_to_xyz(depth_bn) + + # Reset the arm to the bottleneck pose relative to robot base + while robot.running: + error = 100000 + while error > ERR_THRESHOLD: + # Collect observations at the current pose. 
+ rgb_live, depth_live = robot.get_ee_rgbd() + xyz_live = robot.depth_to_xyz(depth_live) + + # Compute pixel correspondences between new observation and bottleneck observation. + with torch.no_grad(): + # This function from an external library takes image paths as input. Therefore, store the paths of the images, and load them. + points1, points2, image1_pil, image2_pil = find_correspondences( + rgb_bn, + rgb_live, + num_pairs, + load_size, + layer, + facet, + bin, + thresh, + model_type, + stride, + ) + + # Debug: visualize the correspondences + show_correspondences(points1, points2, image1_pil, image2_pil) + + # Given the pixel coordinates of the correspondences, and their depth values, + # project the points to 3D space. + points1 = extract_3d_coordinates(points1, xyz_bn) + points2 = extract_3d_coordinates(points2, xyz_live) + + # Find rigid translation and rotation that aligns the points by minimising error, using SVD. + R, t = find_transformation(points1, points2) + + # Move robot + robot.move(t, R) + error = compute_error(points1, points2) + + # Once error is small enough, execute the demonstration. + replay_demo(robot, demo_deltas) + break + + +@click.command() +@click.option("--robot_ip", default="", help="IP address of the robot") +@click.option( + "--local", + is_flag=True, + help="Set if we are executing on the robot and not on a remote computer", +) +@click.option( + "--path_to_demo_folder", default="", help="Path to the folder containing the demonstration" +) +def main(robot_ip, local, path_to_demo_folder): + # Initialize robot + robot = HomeRobotZmqClient( + robot_ip=robot_ip, + use_remote_computer=(not local), + enable_rerun_server=False, + ) + + # Start the demo + dinobot_demo(robot, path_to_demo_folder) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index f927e8618..5bbbcf98b 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -193,6 +193,32 @@ def __init__( def parameters(self) -> Parameters: return self._parameters + def get_ee_rgbd(self) -> Tuple[np.ndarray, np.ndarray]: + """Get the RGB and depth images from the end effector camera. + + Returns: + Tuple[np.ndarray, np.ndarray]: The RGB and depth images + """ + with self._servo_lock: + if self._servo is None: + return None, None + rgb = self._servo_obs["ee_rgb"] + depth = self._servo_obs["ee_depth"] + return rgb, depth + + def get_head_rgbd(self) -> Tuple[np.ndarray, np.ndarray]: + """Get the RGB and depth images from the head camera. 
+ + Returns: + Tuple[np.ndarray, np.ndarray]: The RGB and depth images + """ + with self._servo_lock: + if self._servo is None: + return None, None + rgb = self._servo["head_rgb"] + depth = self._servo["head_depth"] + return rgb, depth + def get_joint_state(self, timeout: float = 5.0) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """Get the current joint positions, velocities, and efforts""" t0 = timeit.default_timer() @@ -1048,6 +1074,11 @@ def blocking_spin_servo(self, verbose: bool = False): ) t0 = timeit.default_timer() + @property + def running(self) -> bool: + """Is the client running""" + return not self._finish + def blocking_spin_state(self, verbose: bool = False): """Listen for incoming observations and update internal state""" From 7bb5f2e68fc54754e2dc65871b0680bfe6517d0a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 11:24:38 -0400 Subject: [PATCH 069/410] update 10->11 --- src/stretch/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/version.py b/src/stretch/version.py index 8126e484a..ae3ecf3ff 100644 --- a/src/stretch/version.py +++ b/src/stretch/version.py @@ -12,7 +12,7 @@ # 2) we can import it in setup.py for the same reason # 3) we can import it into your module -__version__ = "0.0.10" +__version__ = "0.0.11" __stretchpy_protocol__ = "spp0" if __name__ == "__main__": From 76ef26a0fdcd5700a1b58d36d788fa1e41d9dbe7 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 11:52:20 -0400 Subject: [PATCH 070/410] update things to use YOLO instead --- src/stretch/app/query.py | 5 +++++ src/stretch/config/default_planner.yaml | 7 ++++--- src/stretch/perception/detection/yolo/yolo_perception.py | 6 +++++- src/stretch/perception/wrapper.py | 9 +++++++-- src/stretch/utils/voxel.py | 5 +++++ 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/stretch/app/query.py b/src/stretch/app/query.py index ed19623da..34f24f466 100644 --- a/src/stretch/app/query.py +++ b/src/stretch/app/query.py @@ -60,6 +60,7 @@ is_flag=True, help="Don't move the robot to the instance, if using real robot instead of offline data", ) +@click.option("--show-svm", is_flag=True, help="Show the SVM output") @click.option("-o", "--offline", is_flag=True, help="Run code offline on stored data.") def main( device_id: int = 0, @@ -82,6 +83,7 @@ def main( all_matches: bool = False, threshold: float = 0.5, offline: bool = False, + show_svm: bool = False, ): print("- Load parameters") @@ -134,6 +136,9 @@ def main( agent = RobotAgent(dummy_robot, parameters, semantic_sensor) agent.voxel_map.read_from_pickle(input_path, num_frames=frame) + if show_svm: + agent.voxel_map.show() + try: if len(text) == 0: # Read text from command line diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 7557062d4..f1ba1ac67 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -28,16 +28,17 @@ max_depth: 2.0 # Object detection parameters detection: - module: "detic" + # module: "detic" + module: "yolo" category_map_file: example_cat_map.json - use_detic_viz: True + use_detic_viz: False # Point cloud cleanup filters: smooth_kernel_size: 2 #smooth_kernel_size: 0 use_median_filter: True - median_filter_size: 2 + median_filter_size: 3 median_filter_max_error: 0.01 use_derivative_filter: False derivative_filter_threshold: 0.1 diff --git a/src/stretch/perception/detection/yolo/yolo_perception.py b/src/stretch/perception/detection/yolo/yolo_perception.py index 
382a02f88..4400a36d5 100644 --- a/src/stretch/perception/detection/yolo/yolo_perception.py +++ b/src/stretch/perception/detection/yolo/yolo_perception.py @@ -117,7 +117,11 @@ def predict( if pred[0].boxes is None or pred[0].masks is None: task_observations["semantic_frame"] = None - return None, None, task_observations + return ( + np.zeros(rgb.shape[0], rgb.shape[1]), + np.zeros(rgb.shape[0], rgb.shape[1]), + task_observations, + ) class_idcs = pred[0].boxes.cls.cpu().numpy() masks = pred[0].masks.data.cpu().numpy() diff --git a/src/stretch/perception/wrapper.py b/src/stretch/perception/wrapper.py index 7f2f0e219..7d5d12d30 100644 --- a/src/stretch/perception/wrapper.py +++ b/src/stretch/perception/wrapper.py @@ -17,6 +17,7 @@ import torch +import stretch.utils.logger as logger from stretch.core.interfaces import Observations from stretch.core.parameters import Parameters, get_parameters from stretch.perception.constants import RearrangeDETICCategories @@ -181,6 +182,7 @@ def predict( ) obs.semantic = semantic obs.instance = instance + breakpoint() if obs.task_observations is None: obs.task_observations = task_observations else: @@ -247,14 +249,17 @@ def create_semantic_sensor( ): """Create segmentation sensor and load config. Returns config from file, as well as a OvmmPerception object that can be used to label scenes.""" if verbose: - print("- Loading configuration") + print("[PERCEPTION] Loading configuration") if parameters is None: parameters = get_parameters(config_path) if category_map_file is None: category_map_file = get_full_config_path(parameters["detection"]["category_map_file"]) if verbose: - print("- Create and load vocabulary and perception model") + logger.alert( + "[PERCEPTION] Create and load vocabulary and perception model:", + parameters["detection"]["module"], + ) semantic_sensor = OvmmPerception( parameters=parameters, gpu_device_id=device_id, diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index 0aa0818d3..0eef99c47 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -202,6 +202,9 @@ def add( cluster_voxel_idx, cluster_consecutive_idx, _ = voxelize( all_points, voxel_size=self.voxel_size, start=self._mins, end=self._maxs ) + if self._points is not None: + print(self._points.shape, all_points.shape, cluster_consecutive_idx.shape) + print(all_points.shape) self._points, self._features, self._weights, self._rgb = reduce_pointcloud( cluster_consecutive_idx, pos=all_points, @@ -210,6 +213,8 @@ def add( rgbs=all_rgb, feature_reduce=self.feature_pool_method, ) + print(self._points.shape, all_points.shape, cluster_consecutive_idx.shape) + print("Done adding") return def get_idxs(self, points: Tensor) -> Tuple[Tensor, Tensor]: From 5e05dbc47e1ca1e1560cc3003d2f4159e90dcf5c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 11:57:06 -0400 Subject: [PATCH 071/410] updates to perception code --- src/stretch/perception/detection/yolo/yolo_perception.py | 2 +- src/stretch/perception/wrapper.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/stretch/perception/detection/yolo/yolo_perception.py b/src/stretch/perception/detection/yolo/yolo_perception.py index 4400a36d5..d2708e65c 100644 --- a/src/stretch/perception/detection/yolo/yolo_perception.py +++ b/src/stretch/perception/detection/yolo/yolo_perception.py @@ -119,7 +119,7 @@ def predict( task_observations["semantic_frame"] = None return ( np.zeros(rgb.shape[0], rgb.shape[1]), - np.zeros(rgb.shape[0], rgb.shape[1]), + -1 * np.ones(rgb.shape[0], 
rgb.shape[1]), task_observations, ) diff --git a/src/stretch/perception/wrapper.py b/src/stretch/perception/wrapper.py index 7d5d12d30..94c3fd756 100644 --- a/src/stretch/perception/wrapper.py +++ b/src/stretch/perception/wrapper.py @@ -182,7 +182,6 @@ def predict( ) obs.semantic = semantic obs.instance = instance - breakpoint() if obs.task_observations is None: obs.task_observations = task_observations else: From 751e67163d9dcaf06a1f5968d2b6f2a30470141b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 11:57:35 -0400 Subject: [PATCH 072/410] update to code --- src/stretch/perception/detection/yolo/yolo_perception.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/perception/detection/yolo/yolo_perception.py b/src/stretch/perception/detection/yolo/yolo_perception.py index d2708e65c..0ff1a7453 100644 --- a/src/stretch/perception/detection/yolo/yolo_perception.py +++ b/src/stretch/perception/detection/yolo/yolo_perception.py @@ -118,8 +118,8 @@ def predict( if pred[0].boxes is None or pred[0].masks is None: task_observations["semantic_frame"] = None return ( - np.zeros(rgb.shape[0], rgb.shape[1]), - -1 * np.ones(rgb.shape[0], rgb.shape[1]), + np.zeros((rgb.shape[0], rgb.shape[1])), + -1 * np.ones((rgb.shape[0], rgb.shape[1])), task_observations, ) From 43d64b3c7dac79641835d0c8e417e84727d4d00c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 12:09:18 -0400 Subject: [PATCH 073/410] Update to add min points per voxel --- src/stretch/agent/robot_agent.py | 1 + src/stretch/mapping/voxel/voxel.py | 11 ++++++++++- src/stretch/utils/voxel.py | 12 ++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index f2c903e31..f795519cd 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -421,6 +421,7 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): pan = -1 * i * np.pi / 4 print(f"[UPDATE] Head sweep {i} at {pan}, {tilt}") self.robot.head_to(pan, tilt, blocking=True) + time.sleep(0.1) obs = self.robot.get_observation() t1 = timeit.default_timer() diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 8c3a5c42c..e052b5152 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -99,6 +99,7 @@ class SparseVoxelMap(object): encoder (Optional[BaseImageTextEncoder]): An encoder for feature embeddings (optional). map_2d_device (str): The device for 2D mapping. use_instance_memory (bool): Whether to create object-centric instance memory. + min_points_per_voxel (int): The minimum number of points per voxel. 
""" DEFAULT_INSTANCE_MAP_KWARGS = dict( @@ -139,6 +140,7 @@ def __init__( use_derivative_filter: bool = False, derivative_filter_threshold: float = 0.5, prune_detected_objects: bool = False, + min_points_per_voxel: int = 10, ): """ Args: @@ -212,6 +214,7 @@ def __init__( self.voxel_kwargs = voxel_kwargs self.encoder = encoder self.map_2d_device = map_2d_device + self._min_points_per_voxel = min_points_per_voxel # Set the device we use for things here if device is not None: @@ -549,7 +552,13 @@ def add( # TODO: weights could also be confidence, inv distance from camera, etc if world_xyz.nelement() > 0: - self.voxel_pcd.add(world_xyz, features=feats, rgb=rgb, weights=None) + self.voxel_pcd.add( + world_xyz, + features=feats, + rgb=rgb, + weights=None, + min_weight_per_voxel=self._min_points_per_voxel, + ) if self._add_local_radius_points and len(self.observations) < 2: # Only do this at the first step, never after it. diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index 0eef99c47..8507b498c 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -137,6 +137,7 @@ def add( features: Optional[Tensor], rgb: Optional[Tensor], weights: Optional[Tensor] = None, + min_weight_per_voxel: float = 10.0, ): """Add a feature pointcloud to the voxel grid. @@ -212,9 +213,11 @@ def add( weights=all_weights, rgbs=all_rgb, feature_reduce=self.feature_pool_method, + min_weight_per_voxel=min_weight_per_voxel, ) print(self._points.shape, all_points.shape, cluster_consecutive_idx.shape) print("Done adding") + breakpoint() return def get_idxs(self, points: Tensor) -> Tuple[Tensor, Tensor]: @@ -386,6 +389,7 @@ def reduce_pointcloud( weights: Optional[Tensor] = None, rgbs: Optional[Tensor] = None, feature_reduce: str = "mean", + min_weight_per_voxel: float = 10.0, ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Pools values within each voxel @@ -412,12 +416,17 @@ def reduce_pointcloud( pos_cluster = scatter_weighted_mean(pos, weights, voxel_cluster, weights_cluster, dim=0) + valid_idx = weights_cluster >= min_weight_per_voxel + if rgbs is not None: rgb_cluster = scatter_weighted_mean(rgbs, weights, voxel_cluster, weights_cluster, dim=0) + rgb_cluster = rgb_cluster[valid_idx] else: rgb_cluster = None if features is None: + weights_cluster = weights_cluster[valid_idx] + pos_cluster = pos_cluster[valid_idx] return pos_cluster, None, weights_cluster, rgb_cluster if feature_reduce == "mean": @@ -431,6 +440,9 @@ def reduce_pointcloud( else: raise NotImplementedError(f"Unknown feature reduction method {feature_reduce}") + weights_cluster = weights_cluster[valid_idx] + pos_cluster = pos_cluster[valid_idx] + feature_cluster = feature_cluster[valid_idx] return pos_cluster, feature_cluster, weights_cluster, rgb_cluster From 7eea5651af5fe98879b8766af41fdb46e1287ca3 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 12:10:20 -0400 Subject: [PATCH 074/410] Remove some debugging code --- src/stretch/utils/voxel.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index 8507b498c..7d7464722 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -205,7 +205,6 @@ def add( ) if self._points is not None: print(self._points.shape, all_points.shape, cluster_consecutive_idx.shape) - print(all_points.shape) self._points, self._features, self._weights, self._rgb = reduce_pointcloud( cluster_consecutive_idx, pos=all_points, @@ -215,9 +214,6 @@ def add( feature_reduce=self.feature_pool_method, 
min_weight_per_voxel=min_weight_per_voxel, ) - print(self._points.shape, all_points.shape, cluster_consecutive_idx.shape) - print("Done adding") - breakpoint() return def get_idxs(self, points: Tensor) -> Tuple[Tensor, Tensor]: From e3ac4d633856119c5c7d6e3fc1053a369aed9185 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 12:26:39 -0400 Subject: [PATCH 075/410] voxel setup --- src/stretch/config/default_planner.yaml | 1 + src/stretch/mapping/voxel/voxel.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index f1ba1ac67..6ebdd5d25 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -14,6 +14,7 @@ obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles obs_min_density: 10 # This many points makes it an obstacle +min_points_per_voxel: 20 # Drop things below this density per voxel # Padding pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index e052b5152..27948ca10 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -1162,6 +1162,7 @@ def from_parameters( neg_obs_height=parameters["neg_obs_height"], min_depth=parameters["min_depth"], max_depth=parameters["max_depth"], + min_points_per_voxel=parameters["min_points_per_voxel"], pad_obstacles=parameters["pad_obstacles"], add_local_radius_points=parameters.get("add_local_radius_points", default=True), remove_visited_from_obstacles=parameters.get( From 6f7dde734aeeeaacf31c3019104289df040a8bb9 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 12:27:15 -0400 Subject: [PATCH 076/410] updates to point cloud setup --- src/stretch/utils/voxel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index 7d7464722..5896da351 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -203,8 +203,6 @@ def add( cluster_voxel_idx, cluster_consecutive_idx, _ = voxelize( all_points, voxel_size=self.voxel_size, start=self._mins, end=self._maxs ) - if self._points is not None: - print(self._points.shape, all_points.shape, cluster_consecutive_idx.shape) self._points, self._features, self._weights, self._rgb = reduce_pointcloud( cluster_consecutive_idx, pos=all_points, From 68c2c66fcc4e272a1da16313ef23d07ccdd0635a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 14:19:46 -0400 Subject: [PATCH 077/410] update --- src/test/mapping/test_svm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/test/mapping/test_svm.py b/src/test/mapping/test_svm.py index aa3ae7c4d..2cc8902d5 100644 --- a/src/test/mapping/test_svm.py +++ b/src/test/mapping/test_svm.py @@ -45,8 +45,6 @@ def _eval_svm(filename: str, start_pos: np.ndarray, possible: bool = False) -> N dummy_robot, parameters, semantic_sensor=None, - rpc_stub=None, - grasp_client=None, voxel_map=None, use_instance_memory=True, ) From 250e26e94f7fe8660d0b67afb0d0859644706ab1 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 14:24:06 -0400 Subject: [PATCH 078/410] update planner config --- src/test/mapping/planner.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/test/mapping/planner.yaml 
b/src/test/mapping/planner.yaml index 8b3e83109..76056e6a2 100644 --- a/src/test/mapping/planner.yaml +++ b/src/test/mapping/planner.yaml @@ -93,16 +93,18 @@ scene_graph: max_on_height: 0.2 # Navigation space - used for motion planning and computing goals. -step_size: 0.1 -rotation_step_size: 0.2 -dilate_frontier_size: 3 # Used to shrink the frontier back from the edges of the world -dilate_obstacle_size: 4 # Used when selecting goals and computing what the "frontier" is motion_planner: + step_size: 0.1 + rotation_step_size: 0.2 + dilate_frontier_size: 3 # Used to shrink the frontier back from the edges of the world + dilate_obstacle_size: 4 # Used when selecting goals and computing what the "frontier" is simplify_plans: True shortcut_plans: True shortcut_iter: 100 # Parameters for frontier exploration using the motion planner. frontier: + dilate_frontier_size: 4 # Used to shrink the frontier back from the edges of the world + dilate_obstacle_size: 5 # Used when selecting goals and computing what the "frontier" is default_expand_frontier_size: 10 # margin along the frontier where final robot position can be # Distance away you search for frontier points min_dist: 0.1 From 98f07afd9e0cf17bb47551afa4aef48ba2205521 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 15:13:42 -0400 Subject: [PATCH 079/410] tweak to tests --- src/stretch/config/default_planner.yaml | 1 + src/test/mapping/test_svm.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 0d782374b..122693d31 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -21,6 +21,7 @@ pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacle min_pad_obstacles: 0 # Do not pad LESS than this amount, for safety. 
local_radius: 0.8 # Area around the robot to mark as explored (kind of a hack) +add_local_every_step: True remove_visited_from_obstacles: False min_depth: 0.5 # max_depth: 2.5 diff --git a/src/test/mapping/test_svm.py b/src/test/mapping/test_svm.py index 2cc8902d5..747bf1e3b 100644 --- a/src/test/mapping/test_svm.py +++ b/src/test/mapping/test_svm.py @@ -143,5 +143,5 @@ def test_svm_large(): if __name__ == "__main__": debug = True - test_svm_small() - # test_svm_large() + # test_svm_small() + test_svm_large() From a667b4fcccf21e42b4535ff78e087109494ee810 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 22:38:56 -0400 Subject: [PATCH 080/410] debugging stuff --- src/stretch/config/default_planner.yaml | 13 ++++++------- src/stretch/mapping/voxel/voxel.py | 2 ++ src/stretch/utils/voxel.py | 1 + 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 6ec7de243..da6da912f 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -8,12 +8,11 @@ open_vocab_category_map_file: example_cat_map.json tts_engine: "gTTS" # Sparse Voxel Map parameters -voxel_size: 0.04 # Size of a voxel in meters -# 0.05 seems too low +voxel_size: 0.05 # Size of a voxel in meters obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles -obs_min_density: 15 # This many points makes it an obstacle +obs_min_density: 10 # This many points makes it an obstacle min_points_per_voxel: 20 # Drop things below this density per voxel # Padding @@ -22,7 +21,7 @@ pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacle min_pad_obstacles: 0 # Do not pad LESS than this amount, for safety. 
local_radius: 0.8 # Area around the robot to mark as explored (kind of a hack) -add_local_every_step: True +add_local_every_step: False remove_visited_from_obstacles: False min_depth: 0.5 # max_depth: 2.5 @@ -30,8 +29,8 @@ max_depth: 2.0 # Object detection parameters detection: - # module: "detic" - module: "yolo" + module: "detic" + # module: "yolo" category_map_file: example_cat_map.json use_detic_viz: False @@ -40,7 +39,7 @@ filters: smooth_kernel_size: 2 #smooth_kernel_size: 0 use_median_filter: True - median_filter_size: 3 + median_filter_size: 2 median_filter_max_error: 0.01 use_derivative_filter: False derivative_filter_threshold: 0.1 diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index c2af6f2ea..7ba507824 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -141,6 +141,7 @@ def __init__( use_derivative_filter: bool = False, derivative_filter_threshold: float = 0.5, prune_detected_objects: bool = False, + add_local_radius_every_step: bool = False, min_points_per_voxel: int = 10, ): """ @@ -1189,6 +1190,7 @@ def from_parameters( neg_obs_height=parameters["neg_obs_height"], min_depth=parameters["min_depth"], max_depth=parameters["max_depth"], + add_local_radius_every_step=parameters["add_local_every_step"], min_points_per_voxel=parameters["min_points_per_voxel"], pad_obstacles=parameters["pad_obstacles"], add_local_radius_points=parameters.get("add_local_radius_points", default=True), diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index 972fd61e8..d6aba493c 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -195,6 +195,7 @@ def add( torch.cat([self._features, features], dim=0) if (features is not None) else None ) all_rgb = torch.cat([self._rgb, rgb], dim=0) if (rgb is not None) else None + # Future optimization: # If there are no new voxels, then we could save a bit of compute time # by only recomputing the voxel/cluster for the new points From c4dc3dea3876e29cef049f7ad2c55e63022fac7e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 22:41:03 -0400 Subject: [PATCH 081/410] mproving mapping code --- src/stretch/mapping/voxel/voxel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 7ba507824..d6388f041 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -238,6 +238,7 @@ def __init__( # Add points with local_radius to the voxel map at (0,0,0) unless we receive lidar points self._add_local_radius_points = add_local_radius_points + self._add_local_radius_every_step = add_local_radius_every_step self._remove_visited_from_obstacles = remove_visited_from_obstacles self.local_radius = local_radius @@ -567,7 +568,9 @@ def add( min_weight_per_voxel=self._min_points_per_voxel, ) - if self._add_local_radius_points and len(self.observations) < 2: + if self._add_local_radius_points and ( + len(self.observations) < 2 or self._add_local_radius_every_step + ): # Only do this at the first step, never after it. # TODO: just get this from camera_pose? 
# Add local radius points to the map around base From fcb102ca6bfc5bf4b36f70cf387b0ed29c5f39a4 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 22:47:39 -0400 Subject: [PATCH 082/410] updates --- src/stretch/config/default_planner.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index da6da912f..349b35b02 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -12,7 +12,7 @@ voxel_size: 0.05 # Size of a voxel in meters obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles -obs_min_density: 10 # This many points makes it an obstacle +obs_min_density: 30 # This many points makes it an obstacle min_points_per_voxel: 20 # Drop things below this density per voxel # Padding @@ -24,7 +24,6 @@ local_radius: 0.8 # Area around the robot to mark as explored (kind of a hack) add_local_every_step: False remove_visited_from_obstacles: False min_depth: 0.5 -# max_depth: 2.5 max_depth: 2.0 # Object detection parameters From 892f5ffedebff24d2e189b9715f83917e8e2bb78 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 22:49:59 -0400 Subject: [PATCH 083/410] update --- src/stretch/config/default_planner.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 349b35b02..735a4aaad 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -8,7 +8,7 @@ open_vocab_category_map_file: example_cat_map.json tts_engine: "gTTS" # Sparse Voxel Map parameters -voxel_size: 0.05 # Size of a voxel in meters +voxel_size: 0.04 # Size of a voxel in meters obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles From 5b99b354c4e24010d799b6b94cab8c08b4cbe806 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 23:13:08 -0400 Subject: [PATCH 084/410] various fixes --- src/stretch/config/default_planner.yaml | 8 ++++---- src/stretch/utils/torch_cluster/torch_cluster_helpers.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 735a4aaad..a665a09dd 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -8,11 +8,11 @@ open_vocab_category_map_file: example_cat_map.json tts_engine: "gTTS" # Sparse Voxel Map parameters -voxel_size: 0.04 # Size of a voxel in meters +voxel_size: 0.05 # Size of a voxel in meters obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) -neg_obs_height: -0.05 # Things less than this height ARE obstacles -obs_min_density: 30 # This many points makes it an obstacle +neg_obs_height: -5.00 # Things less than this height ARE obstacles +obs_min_density: 100 # This many points makes it an obstacle min_points_per_voxel: 20 # Drop things below this density per voxel # Padding @@ -38,7 +38,7 @@ filters: smooth_kernel_size: 2 #smooth_kernel_size: 0 use_median_filter: True - median_filter_size: 2 + median_filter_size: 4 
median_filter_max_error: 0.01 use_derivative_filter: False derivative_filter_threshold: 0.1 diff --git a/src/stretch/utils/torch_cluster/torch_cluster_helpers.py b/src/stretch/utils/torch_cluster/torch_cluster_helpers.py index bab634dfd..f0b028cfa 100644 --- a/src/stretch/utils/torch_cluster/torch_cluster_helpers.py +++ b/src/stretch/utils/torch_cluster/torch_cluster_helpers.py @@ -69,7 +69,7 @@ def grid_cluster( num_voxels = torch.max(num_voxels, torch.ones_like(num_voxels)) # Compute voxel indices for each point - voxel_indices = ((pos - start) / size).long() + voxel_indices = ((pos - start) / size).round().long() # Clamp indices to ensure they're within bounds voxel_indices = torch.clamp(voxel_indices, min=torch.zeros_like(num_voxels), max=num_voxels - 1) From a40d20eeeb708493eb52504e4714c8cc72d62e67 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 23:17:35 -0400 Subject: [PATCH 085/410] update robot navigation code --- src/stretch/config/default_planner.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index a665a09dd..a5ab05ea4 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -11,9 +11,9 @@ tts_engine: "gTTS" voxel_size: 0.05 # Size of a voxel in meters obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) -neg_obs_height: -5.00 # Things less than this height ARE obstacles +neg_obs_height: -0.10 # Things less than this height ARE obstacles obs_min_density: 100 # This many points makes it an obstacle -min_points_per_voxel: 20 # Drop things below this density per voxel +min_points_per_voxel: 100 # Drop things below this density per voxel # Padding pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles From d7fd3e5560971fccd23827589fc8bf049777f880 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 23:39:12 -0400 Subject: [PATCH 086/410] update action code --- src/stretch/agent/zmq_client.py | 14 ++++++++++++-- src/stretch/config/default_planner.yaml | 4 ++-- src/stretch/visualization/rerun.py | 14 +++++++++++--- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index ef6977361..82c4e1875 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -312,13 +312,14 @@ def head_to( head_pan = np.clip(head_pan, -np.pi, 0) head_tilt = np.clip(head_tilt, -np.pi / 2, 0) next_action = {"head_to": [float(head_pan), float(head_tilt)], "manip_blocking": blocking} - self.send_action(next_action, timeout=timeout) + sent = self.send_action(next_action, timeout=timeout) if blocking: + step = sent["step"] whole_body_q = np.zeros(self._robot_model.dof, dtype=np.float32) whole_body_q[HelloStretchIdx.HEAD_PAN] = float(head_pan) whole_body_q[HelloStretchIdx.HEAD_TILT] = float(head_tilt) - self._wait_for_head(whole_body_q) + self._wait_for_head(whole_body_q, block_id=step) def arm_to( self, @@ -587,6 +588,7 @@ def _wait_for_head( timeout: float = 10.0, min_wait_time: float = 0.5, resend_action: Optional[dict] = None, + block_id: int = -1, verbose: bool = False, ) -> None: """Wait for the head to move to a particular configuration.""" @@ -598,8 +600,16 @@ def _wait_for_head( # Head must be stationary for at least min_wait_time while not self._finish: joint_positions, joint_velocities, _ = 
self.get_joint_state() + if joint_positions is None: continue + + if self._last_step < block_id: + # TODO: remove debug info + # print("Waiting for step", block_id, "to be processed; currently on:", self._last_step) + time.sleep(0.05) + continue + pan_err = np.abs( joint_positions[HelloStretchIdx.HEAD_PAN] - q[HelloStretchIdx.HEAD_PAN] ) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index a5ab05ea4..b50497a00 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -8,11 +8,11 @@ open_vocab_category_map_file: example_cat_map.json tts_engine: "gTTS" # Sparse Voxel Map parameters -voxel_size: 0.05 # Size of a voxel in meters +voxel_size: 0.04 # Size of a voxel in meters obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.10 # Things less than this height ARE obstacles -obs_min_density: 100 # This many points makes it an obstacle +obs_min_density: 150 # This many points makes it an obstacle min_points_per_voxel: 100 # Drop things below this density per voxel # Padding diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 8d4d5c5c2..a243ed7c8 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -211,7 +211,13 @@ def log_robot_state(self, obs): for k in HelloStretchIdx.name_to_idx: rr.log(f"robot_state/joint_pose/{k}", rr.Scalar(state[HelloStretchIdx.name_to_idx[k]])) - def update_voxel_map(self, space: SparseVoxelMapNavigationSpace, debug: bool = False): + def update_voxel_map( + self, + space: SparseVoxelMapNavigationSpace, + debug: bool = False, + explored_radius=0.04, + obstacle_radius=0.05, + ): """Log voxel map and send it to Rerun visualizer Args: space (SparseVoxelMapNavigationSpace): Voxel map object @@ -249,14 +255,16 @@ def update_voxel_map(self, space: SparseVoxelMapNavigationSpace, debug: bool = F rr.log( "world/obstacles", rr.Points3D( - positions=obs_points, radii=np.ones(points.shape[0]) * 0.025, colors=[255, 0, 0] + positions=obs_points, + radii=np.ones(points.shape[0]) * obstacle_radius, + colors=[255, 0, 0], ), ) rr.log( "world/explored", rr.Points3D( positions=explored_points, - radii=np.ones(points.shape[0]) * 0.01, + radii=np.ones(points.shape[0]) * explored_radius, colors=[255, 255, 255], ), ) From 34c9f44087803f07bf6953cbdc31b645b375cffa Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 10 Sep 2024 23:49:16 -0400 Subject: [PATCH 087/410] increase safety margin --- src/stretch/config/default_planner.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index b50497a00..ff0e3986b 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -16,7 +16,7 @@ obs_min_density: 150 # This many points makes it an obstacle min_points_per_voxel: 100 # Drop things below this density per voxel # Padding -pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles +pad_obstacles: 3 # Add this many units (voxel_size) to the area around obstacles # pad_obstacles: 1 # Add this many units (voxel_size) to the area around obstacles min_pad_obstacles: 0 # Do not pad LESS than this amount, for safety. 
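pad_obstacles above is counted in grid cells (multiples of voxel_size), so raising it from 2 to 3 widens the safety margin by one voxel on every side of each obstacle. A minimal sketch of how such padding can be applied by dilating the 2D obstacle mask, in the spirit of the binary_dilation calls used elsewhere in this series (illustrative only, not the repository's exact implementation):

import torch
import torch.nn.functional as F

def pad_obstacle_mask(obstacles: torch.Tensor, pad_cells: int) -> torch.Tensor:
    # obstacles: (H, W) boolean obstacle mask; pad_cells: number of grid cells to dilate by.
    if pad_cells <= 0:
        return obstacles
    k = 2 * pad_cells + 1
    kernel = torch.ones(1, 1, k, k, dtype=torch.float32)
    dilated = F.conv2d(obstacles.float()[None, None], kernel, padding=pad_cells)
    return dilated[0, 0] > 0
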
From 29db482c13259fc82d4b1ecf8030c7976c150eac Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 00:00:01 -0400 Subject: [PATCH 088/410] updates --- src/stretch/config/default_planner.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index ff0e3986b..98753064e 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -12,7 +12,7 @@ voxel_size: 0.04 # Size of a voxel in meters obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.10 # Things less than this height ARE obstacles -obs_min_density: 150 # This many points makes it an obstacle +obs_min_density: 100 # This many points makes it an obstacle min_points_per_voxel: 100 # Drop things below this density per voxel # Padding @@ -36,7 +36,7 @@ detection: # Point cloud cleanup filters: smooth_kernel_size: 2 - #smooth_kernel_size: 0 + # smooth_kernel_size: 0 use_median_filter: True median_filter_size: 4 median_filter_max_error: 0.01 From 720660b4f6e05042790e4521bc0d585bec346909 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 09:34:50 -0400 Subject: [PATCH 089/410] fixing an issue in scatter3d() which was creating spurious obstacles --- src/stretch/config/default_planner.yaml | 8 +++---- src/stretch/mapping/voxel/voxel.py | 11 +++++----- src/stretch/utils/voxel.py | 29 ++++++++++++------------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 98753064e..15665eebb 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -12,11 +12,11 @@ voxel_size: 0.04 # Size of a voxel in meters obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.10 # Things less than this height ARE obstacles -obs_min_density: 100 # This many points makes it an obstacle -min_points_per_voxel: 100 # Drop things below this density per voxel +obs_min_density: 50 # This many points makes it an obstacle +min_points_per_voxel: 50 # Drop things below this density per voxel # Padding -pad_obstacles: 3 # Add this many units (voxel_size) to the area around obstacles +pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles # pad_obstacles: 1 # Add this many units (voxel_size) to the area around obstacles min_pad_obstacles: 0 # Do not pad LESS than this amount, for safety. 
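For context on the two density thresholds relaxed in the hunk above (an interpretation based on the voxel code touched in this series; the snippet is a simplified sketch, not the actual implementation): min_points_per_voxel discards sparse voxels when the point cloud is reduced, and obs_min_density is then applied to the accumulated per-column point count when the 2D obstacle map is derived.

import torch

def obstacle_columns(voxel_counts: torch.Tensor,
                     min_points_per_voxel: int,
                     obs_min_density: int) -> torch.Tensor:
    # voxel_counts: (X, Y, Z) number of points accumulated in each voxel of the grid.
    kept = voxel_counts * (voxel_counts >= min_points_per_voxel)  # drop low-density voxels entirely
    return kept.sum(dim=-1) >= obs_min_density  # a column with enough surviving points is an obstacle
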
@@ -38,7 +38,7 @@ filters: smooth_kernel_size: 2 # smooth_kernel_size: 0 use_median_filter: True - median_filter_size: 4 + median_filter_size: 2 median_filter_max_error: 0.01 use_derivative_filter: False derivative_filter_threshold: 0.1 diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index d6388f041..1cd803770 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -110,6 +110,7 @@ class SparseVoxelMap(object): ) debug_valid_depth: bool = False debug_instance_memory_processing_time: bool = False + use_negative_obstacles: bool = False def __init__( self, @@ -837,8 +838,6 @@ def get_2d_map(self, debug: bool = False) -> Tuple[np.ndarray, np.ndarray]: device = xyz.device xyz = ((xyz / self.grid_resolution) + self.grid_origin).long() - # xyz[xyz[:, -1] < 0, -1] = 0 - negative_obstacles = xyz[:, -1] < self.neg_obs_height # Crop to robot height min_height = int(self.obs_min_height / self.grid_resolution) @@ -848,12 +847,15 @@ def get_2d_map(self, debug: bool = False) -> Tuple[np.ndarray, np.ndarray]: # Mask out obstacles only above a certain height obs_mask = xyz[:, -1] < max_height - obs_mask = obs_mask | negative_obstacles + if self.use_negative_obstacles: + neg_height = int(self.neg_obs_height / self.grid_resolution) + negative_obstacles = xyz[:, -1] < neg_height + obs_mask = obs_mask | negative_obstacles xyz = xyz[obs_mask, :] counts = counts[obs_mask][:, None] # voxels[x_coords, y_coords, z_coords] = 1 - voxels = scatter3d(xyz, counts, grid_size) + voxels = scatter3d(xyz, counts, grid_size).squeeze() # Compute the obstacle voxel grid based on what we've seen obstacle_voxels = voxels[:, :, min_height:] @@ -871,7 +873,6 @@ def get_2d_map(self, debug: bool = False) -> Tuple[np.ndarray, np.ndarray]: )[0, 0].bool() # Explored area = only floor mass - # floor_voxels = voxels[:, :, :min_height] explored_soft = torch.sum(voxels, dim=-1) # Add explored radius around the robot, up to min depth diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index d6aba493c..82cbca90e 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -477,22 +477,21 @@ def scatter3d(voxel_indices: Tensor, weights: Tensor, grid_dimensions: List[int] assert len(grid_dimensions) == 3, "this is designed to work only in 3d" assert voxel_indices.shape[-1] == 3, "3d points expected for indices" - # Calculate a unique index for each voxel in the 3D grid - unique_voxel_indices = ( - voxel_indices[:, 0] * (grid_dimensions[1] * grid_dimensions[2]) - + voxel_indices[:, 1] * grid_dimensions[2] - + voxel_indices[:, 2] - ) + N, F = weights.shape + X, Y, Z = grid_dimensions - # Use scatter to accumulate weights into voxels - voxel_weights = scatter( - weights, - unique_voxel_indices, - dim=0, - reduce="mean", - dim_size=grid_dimensions[0] * grid_dimensions[1] * grid_dimensions[2], - ) - return voxel_weights.reshape(*grid_dimensions) + # Compute voxel indices for each point + # voxel_indices = (points / voxel_size).long().clamp(min=0, max=torch.tensor(grid_size) - 1) + voxel_indices = voxel_indices.clamp( + min=torch.zeros(3), max=torch.tensor(grid_dimensions) - 1 + ).long() + + # Create empty voxel grid + voxel_grid = torch.zeros(*grid_dimensions, F, device=weights.device) + + # Scatter features into voxel grid + voxel_grid[voxel_indices[:, 0], voxel_indices[:, 1], voxel_indices[:, 2]] = weights + return voxel_grid def drop_smallest_weight_points( From 86d3253f992d20285ba336130e916beaeeabc7a8 Mon Sep 17 00:00:00 2001 From: Chris 
Paxton Date: Wed, 11 Sep 2024 11:16:11 -0400 Subject: [PATCH 090/410] improve handling for detecting objects --- src/stretch/agent/robot_agent.py | 2 +- src/stretch/perception/encoders/base_encoder.py | 4 ++++ src/stretch/perception/encoders/clip_encoder.py | 4 ++++ src/stretch/perception/encoders/siglip_encoder.py | 4 ++++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 6fcfcb9b2..622908e6d 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -237,7 +237,7 @@ def get_instance_from_text( emb = instance.get_image_embedding( aggregation_method=aggregation_method, normalize=normalize ) - activation = torch.cosine_similarity(emb, encoded_text, dim=-1) + activation = self.encoder.compute_score(emb, encoded_text) if activation.item() > best_activation: best_activation = activation.item() best_instance = instance diff --git a/src/stretch/perception/encoders/base_encoder.py b/src/stretch/perception/encoders/base_encoder.py index 8229910ba..e84608fed 100644 --- a/src/stretch/perception/encoders/base_encoder.py +++ b/src/stretch/perception/encoders/base_encoder.py @@ -31,3 +31,7 @@ def encode_image(self, image: Union[ndarray, Tensor]): @abc.abstractmethod def encode_text(self, text: str): raise NotImplementedError + + @abc.abstractmethod + def compute_score(self, image: Tensor, text: Tensor): + raise NotImplementedError diff --git a/src/stretch/perception/encoders/clip_encoder.py b/src/stretch/perception/encoders/clip_encoder.py index b4c60f952..0d3b1c1f3 100644 --- a/src/stretch/perception/encoders/clip_encoder.py +++ b/src/stretch/perception/encoders/clip_encoder.py @@ -49,6 +49,10 @@ def encode_text(self, text: str) -> torch.Tensor: text_features = self.model.encode_text(text) return text_features.float() + def compute_score(self, image: torch.Tensor, text: torch.Tensor) -> torch.Tensor: + """Compute similarity score between image and text""" + return (100.0 * image @ text.T).squeeze() + class NormalizedClipEncoder(ClipEncoder): """Simple wrapper for encoding different things as text. 
Normalizes the results.""" diff --git a/src/stretch/perception/encoders/siglip_encoder.py b/src/stretch/perception/encoders/siglip_encoder.py index 88eb426dc..30f26c22d 100644 --- a/src/stretch/perception/encoders/siglip_encoder.py +++ b/src/stretch/perception/encoders/siglip_encoder.py @@ -55,3 +55,7 @@ def encode_text(self, text: str) -> torch.Tensor: with torch.no_grad(): text_features = self.model.get_text_features(**inputs) return text_features.float() + + def compute_score(self, image: torch.Tensor, text: torch.Tensor) -> torch.Tensor: + """Compute similarity score between image and text""" + return torch.nn.functional.cosine_similarity(image, text, dim=-1) From 122fc8cc52e8b3951b1a4d779a7da9a163bfc744 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 11:43:36 -0400 Subject: [PATCH 091/410] update code --- src/stretch/agent/robot_agent.py | 5 +++-- src/stretch/app/query.py | 11 +++++++---- src/stretch/mapping/instance/core.py | 1 + src/stretch/mapping/instance/instance_map.py | 8 -------- src/stretch/perception/encoders/siglip_encoder.py | 2 +- 5 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 622908e6d..9b73ccc30 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -43,6 +43,7 @@ class RobotAgent: _retry_on_fail: bool = False debug_update_timing: bool = False update_rerun_every_time: bool = True + normalize_embeddings: bool = False def __init__( self, @@ -68,7 +69,6 @@ def __init__( self.show_instances_detected = show_instances_detected self.semantic_sensor = semantic_sensor - self.normalize_embeddings = True self.pos_err_threshold = parameters["trajectory_pos_err_threshold"] self.rot_err_threshold = parameters["trajectory_rot_err_threshold"] self.current_state = "WAITING" @@ -232,6 +232,7 @@ def get_instance_from_text( encoded_text = self.encode_text(text_query).to(self.voxel_map.device) best_instance = None best_activation = -1.0 + print("--- Searching for instance ---") for instance in self.voxel_map.get_instances(): ins = instance.get_instance_id() emb = instance.get_image_embedding( @@ -287,7 +288,7 @@ def get_instances_from_text( emb = instance.get_image_embedding( aggregation_method=aggregation_method, normalize=normalize ) - activation = torch.cosine_similarity(emb, encoded_text, dim=-1) + activation = self.encoder.compute_score(emb, encoded_text) # Add the instance to the list of matches if the cosine similarity is above the threshold if activation.item() > threshold: activations.append(activation.item()) diff --git a/src/stretch/app/query.py b/src/stretch/app/query.py index 34f24f466..7a8d6f050 100644 --- a/src/stretch/app/query.py +++ b/src/stretch/app/query.py @@ -54,7 +54,7 @@ help="Find all objects with a similarity to the query above some threshold", ) # This threshold seems to work ok for Siglip - will not work for e.g. 
CLIP -@click.option("--threshold", default=0.05, help="Threshold for similarity when using --all-matches") +@click.option("--threshold", default=0.5, help="Threshold for similarity when using --all-matches") @click.option( "--stationary", is_flag=True, @@ -173,7 +173,7 @@ def main( else: # Get the best instance using agent's API if all_matches: - instances = agent.get_instances_from_text(text, threshold=threshold) + scores, instances = agent.get_instances_from_text(text, threshold=threshold) else: _, instance = agent.get_instance_from_text(text) instances = [instance] @@ -182,8 +182,11 @@ def main( logger.error("No matches found for query") return - # Show the best view of the detected instance - instance.show_best_view() + # Loop over all instances and show them to the user + for instance in instances: + breakpoint() + # Show the best view of the detected instance + instance.show_best_view() if real_robot and not stationary: # Confirm before moving diff --git a/src/stretch/mapping/instance/core.py b/src/stretch/mapping/instance/core.py index a72c3420a..8495cd296 100644 --- a/src/stretch/mapping/instance/core.py +++ b/src/stretch/mapping/instance/core.py @@ -175,6 +175,7 @@ def get_image_embedding( ) if normalize: emb = emb / emb.norm(dim=-1, keepdim=True) + return emb def get_best_view(self, metric: str = "area") -> InstanceView: diff --git a/src/stretch/mapping/instance/instance_map.py b/src/stretch/mapping/instance/instance_map.py index 63608aa43..86b38da17 100644 --- a/src/stretch/mapping/instance/instance_map.py +++ b/src/stretch/mapping/instance/instance_map.py @@ -755,14 +755,6 @@ def process_instances_for_env( percent_points = n_mask / (instance_mask.shape[0] * instance_mask.shape[1]) - # Create InstanceView if the view is large enough - # print(f"n_mask: {n_mask}/{self.min_pixels_for_instance_view}, n_points: {n_points}>1, percent_points: {percent_points}>{self.min_percent_for_instance_view}") - # import matplotlib - # matplotlib.use("TkAgg") - # import matplotlib.pyplot as plt - # plt.imshow(cropped_image / 255) - # plt.show() - added = False if ( n_mask >= self.min_pixels_for_instance_view diff --git a/src/stretch/perception/encoders/siglip_encoder.py b/src/stretch/perception/encoders/siglip_encoder.py index 30f26c22d..b9245bd9f 100644 --- a/src/stretch/perception/encoders/siglip_encoder.py +++ b/src/stretch/perception/encoders/siglip_encoder.py @@ -58,4 +58,4 @@ def encode_text(self, text: str) -> torch.Tensor: def compute_score(self, image: torch.Tensor, text: torch.Tensor) -> torch.Tensor: """Compute similarity score between image and text""" - return torch.nn.functional.cosine_similarity(image, text, dim=-1) + return torch.sigmoid((image @ text.T).sum(dim=-1)) From f0d5ff7ba2ab9d0d6215bbef074fa70c5084bcaf Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 12:53:57 -0400 Subject: [PATCH 092/410] update --- src/stretch/app/query.py | 1 - src/stretch/config/default_planner.yaml | 4 ++++ src/stretch/mapping/instance/instance_map.py | 10 ++++------ src/stretch/mapping/voxel/voxel.py | 3 +++ 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/stretch/app/query.py b/src/stretch/app/query.py index 7a8d6f050..de40ac7e4 100644 --- a/src/stretch/app/query.py +++ b/src/stretch/app/query.py @@ -184,7 +184,6 @@ def main( # Loop over all instances and show them to the user for instance in instances: - breakpoint() # Show the best view of the detected instance instance.show_best_view() diff --git a/src/stretch/config/default_planner.yaml 
b/src/stretch/config/default_planner.yaml index 15665eebb..baee56895 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -80,6 +80,10 @@ instance_memory: max_instance_height: 1.8 min_pixels_for_instance_view: 100 min_percent_for_instance_view: 0.1 + # Should we remove the background from the instance views? + # What doe this mean? If you have a view of a bottle on a table, should we remove the table? + # It will have an effect on performance. + mask_cropped_instances: False # Should we remove the background from the instance views? matching: # Feature matching threshold for if something is considered a particular class # Set this value by experimting with: diff --git a/src/stretch/mapping/instance/instance_map.py b/src/stretch/mapping/instance/instance_map.py index 86b38da17..ddb704fdf 100644 --- a/src/stretch/mapping/instance/instance_map.py +++ b/src/stretch/mapping/instance/instance_map.py @@ -724,17 +724,15 @@ def process_instances_for_env( h, w = masked_image.shape[1:] cropped_image = self.get_cropped_image(image, bbox) instance_mask = self.get_cropped_image(instance_mask.unsqueeze(0), bbox) + if self.mask_cropped_instances: + cropped_image = cropped_image * instance_mask # get embedding if self.encoder is not None: - # option 1: encoder the original crop + # Compute semantic image features (e.g. SigLIP or CLIP) embedding = self.encoder.encode_image(cropped_image).to(cropped_image.device) - # option 2: encode crop with applied mask - # embedding = self.encoder.encode_image(cropped_image * instance_mask).to( - # cropped_image.device - # ) - + # Get a separate set of visual, not semantic, features if hasattr(self.encoder, "get_visual_feat"): visual_feat = self.encoder.get_visual_feat(cropped_image).to( cropped_image.device diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 1cd803770..78febdbd7 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -1225,6 +1225,9 @@ def from_parameters( "min_percent_for_instance_view": parameters.get( "instance_memory/min_percent_for_instance_view", 0.2 ), + "mask_cropped_instances": parameters.get( + "instance_memory/mask_cropped_instances", False + ), "open_vocab_cat_map_file": parameters.get("open_vocab_category_map_file", None), "use_visual_feat": parameters.get("use_visual_feat", False), }, From b861a837bba581d095306d31448217c105b9bc55 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 13:20:35 -0400 Subject: [PATCH 093/410] updates to different models and parameters --- src/stretch/agent/robot_agent.py | 11 +++++-- .../perception/encoders/siglip_encoder.py | 32 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 9b73ccc30..8074b459c 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -256,7 +256,7 @@ def get_instances_from_text( aggregation_method: str = "mean", normalize: bool = False, verbose: bool = True, - threshold: float = 0.5, + threshold: float = 0.01, ) -> List[Tuple[float, Instance]]: """Get all instances that match the text query. 
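A hypothetical usage sketch of the text-query API whose default threshold is being changed here, based on the call sites visible in this series (app/query.py); the query string and threshold are example values, and agent is the RobotAgent built elsewhere in that app:

scores, instances = agent.get_instances_from_text("red coffee mug", threshold=0.01)
for score, instance in sorted(zip(scores, instances), key=lambda p: p[0], reverse=True):
    print(f"Instance {instance.get_instance_id()}: score {score:.3f}")
    instance.show_best_view()
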
@@ -288,7 +288,14 @@ def get_instances_from_text( emb = instance.get_image_embedding( aggregation_method=aggregation_method, normalize=normalize ) - activation = self.encoder.compute_score(emb, encoded_text) + + # TODO: this is hacky - should probably just not support other encoders this way + if hasattr(self.encoder, "classify"): + prob = self.encoder.classify(instance.get_best_view().get_image(), text_query) + else: + activation = self.encoder.compute_score(emb, encoded_text) + activation = prob + # Add the instance to the list of matches if the cosine similarity is above the threshold if activation.item() > threshold: activations.append(activation.item()) diff --git a/src/stretch/perception/encoders/siglip_encoder.py b/src/stretch/perception/encoders/siglip_encoder.py index b9245bd9f..737b194f2 100644 --- a/src/stretch/perception/encoders/siglip_encoder.py +++ b/src/stretch/perception/encoders/siglip_encoder.py @@ -56,6 +56,38 @@ def encode_text(self, text: str) -> torch.Tensor: text_features = self.model.get_text_features(**inputs) return text_features.float() + def classify(self, image: Union[np.ndarray, torch.Tensor], text: str) -> torch.Tensor: + """Classify image and text""" + + # Convert image to PIL + if isinstance(image, torch.Tensor): + image = image.cpu().numpy() + image = image.astype(np.uint8) + pil_image = Image.fromarray(image) + + # Process image and text + inputs = self.processor( + images=pil_image, text=text, return_tensors="pt", padding="max_length" + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + + # Evaluate model + with torch.no_grad(): + outputs = self.model(**inputs) + + logits = outputs.logits_per_image + probs = torch.sigmoid(logits) + return probs + + def encode_batch_text(self, texts: list[str]) -> torch.Tensor: + """Return feature vector for text""" + # inputs = self.processor(text, return_tensors="pt") + inputs = self.tokenizer(texts, padding="max_length", return_tensors="pt") + inputs = {k: v.to(self.device) for k, v in inputs.items()} + with torch.no_grad(): + text_features = self.model.get_text_features(**inputs) + return text_features.float() + def compute_score(self, image: torch.Tensor, text: torch.Tensor) -> torch.Tensor: """Compute similarity score between image and text""" return torch.sigmoid((image @ text.T).sum(dim=-1)) From 206dfa66ef7a2176a746151908ce4998301c5ec6 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 13:25:17 -0400 Subject: [PATCH 094/410] various tweaks to object detection --- src/stretch/agent/robot_agent.py | 18 ++++++++++-------- src/stretch/app/query.py | 2 +- .../perception/encoders/siglip_encoder.py | 10 ++++++++-- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 8074b459c..8f721e847 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -256,7 +256,7 @@ def get_instances_from_text( aggregation_method: str = "mean", normalize: bool = False, verbose: bool = True, - threshold: float = 0.01, + threshold: float = 0.05, ) -> List[Tuple[float, Instance]]: """Get all instances that match the text query. 
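The threshold churn across these commits reflects how differently the two candidate scoring schemes are scaled; a small illustrative comparison with random embeddings (embedding size assumed, values meaningful only for their range):

import torch

image_emb = torch.nn.functional.normalize(torch.randn(1, 768), dim=-1)
text_emb = torch.nn.functional.normalize(torch.randn(1, 768), dim=-1)

cosine = torch.cosine_similarity(image_emb, text_emb, dim=-1)  # roughly in [-1, 1]
paired = torch.sigmoid((image_emb @ text_emb.T).sum(dim=-1))   # squashed into (0, 1), scale-dependent
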
@@ -290,20 +290,22 @@ def get_instances_from_text( ) # TODO: this is hacky - should probably just not support other encoders this way - if hasattr(self.encoder, "classify"): - prob = self.encoder.classify(instance.get_best_view().get_image(), text_query) - else: - activation = self.encoder.compute_score(emb, encoded_text) - activation = prob + # if hasattr(self.encoder, "classify"): + # prob = self.encoder.classify(instance.get_best_view().get_image(), text_query) + # else: + activation = self.encoder.compute_score(emb, encoded_text) + # activation = prob # Add the instance to the list of matches if the cosine similarity is above the threshold if activation.item() > threshold: activations.append(activation.item()) matches.append(instance) if verbose: - print(f" - Instance {ins} has activation {activation.item()}") + print(f" - Instance {ins} has activation {activation.item()} > {threshold}") elif verbose: - print(f" - Skipped instance {ins} with activation {activation.item()}") + print( + f" - Skipped instance {ins} with activation {activation.item()} < {threshold}" + ) return activations, matches def get_navigation_space(self) -> ConfigurationSpace: diff --git a/src/stretch/app/query.py b/src/stretch/app/query.py index de40ac7e4..d5f1a9f36 100644 --- a/src/stretch/app/query.py +++ b/src/stretch/app/query.py @@ -54,7 +54,7 @@ help="Find all objects with a similarity to the query above some threshold", ) # This threshold seems to work ok for Siglip - will not work for e.g. CLIP -@click.option("--threshold", default=0.5, help="Threshold for similarity when using --all-matches") +@click.option("--threshold", default=0.05, help="Threshold for similarity when using --all-matches") @click.option( "--stationary", is_flag=True, diff --git a/src/stretch/perception/encoders/siglip_encoder.py b/src/stretch/perception/encoders/siglip_encoder.py index 737b194f2..683bda49f 100644 --- a/src/stretch/perception/encoders/siglip_encoder.py +++ b/src/stretch/perception/encoders/siglip_encoder.py @@ -27,10 +27,11 @@ class SiglipEncoder(BaseImageTextEncoder): Generally, these features are much better than OpenAI CLIP for open-vocabulary object detection. 
""" - def __init__(self, device: Optional[str] = None, **kwargs) -> None: + def __init__(self, normalize: bool = True, device: Optional[str] = None, **kwargs) -> None: if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" self.device = device + self.normalize = normalize self.processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224") self.tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224") self.model = AutoModel.from_pretrained("google/siglip-base-patch16-224").to(self.device) @@ -45,6 +46,8 @@ def encode_image(self, image: Union[torch.tensor, np.ndarray]) -> torch.Tensor: inputs = {k: v.to(self.device) for k, v in inputs.items()} with torch.no_grad(): image_features = self.model.get_image_features(**inputs) + if self.normalize: + image_features /= image_features.norm(dim=-1, keepdim=True) return image_features.float() def encode_text(self, text: str) -> torch.Tensor: @@ -54,6 +57,8 @@ def encode_text(self, text: str) -> torch.Tensor: inputs = {k: v.to(self.device) for k, v in inputs.items()} with torch.no_grad(): text_features = self.model.get_text_features(**inputs) + if self.normalize: + text_features /= text_features.norm(dim=-1, keepdim=True) return text_features.float() def classify(self, image: Union[np.ndarray, torch.Tensor], text: str) -> torch.Tensor: @@ -90,4 +95,5 @@ def encode_batch_text(self, texts: list[str]) -> torch.Tensor: def compute_score(self, image: torch.Tensor, text: torch.Tensor) -> torch.Tensor: """Compute similarity score between image and text""" - return torch.sigmoid((image @ text.T).sum(dim=-1)) + # return torch.sigmoid((image @ text.T).sum(dim=-1)) + return torch.cosine_similarity(image, text, dim=-1) From 698a8e1c51b85253f83248e3eb65178bca1586e5 Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Wed, 11 Sep 2024 15:00:19 -0400 Subject: [PATCH 095/410] robot experiment --- src/stretch/agent/zmq_client.py | 2 +- src/stretch/config/dynav_config.yaml | 7 +- src/stretch/dynav/llm_localizer.py | 2 + src/stretch/dynav/mapping_utils/a_star.py | 9 +- src/stretch/dynav/mapping_utils/voxel.py | 3 +- src/stretch/dynav/mapping_utils/voxel_map.py | 12 +- src/stretch/dynav/ok_robot_hw/robot.py | 5 +- .../dynav/ok_robot_hw/utils/grasper_utils.py | 21 +- src/stretch/dynav/robot_agent_manip_mdp.py | 8 +- src/stretch/dynav/run_ok_nav.py | 98 ++++---- src/stretch/dynav/voxel_map_localizer.py | 155 +++++++++++- src/stretch/dynav/voxel_map_server.py | 232 +++++++++++------- 12 files changed, 381 insertions(+), 173 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index cac8be384..cfdb49e05 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -409,7 +409,7 @@ def arm_to( blocking: bool = False, timeout: float = 10.0, verbose: bool = False, - min_time: float = 2.0, + min_time: float = 2.5, **config, ) -> bool: """Move the arm to a particular joint configuration. 
diff --git a/src/stretch/config/dynav_config.yaml b/src/stretch/config/dynav_config.yaml index 5c6914fba..e5955a1a7 100644 --- a/src/stretch/config/dynav_config.yaml +++ b/src/stretch/config/dynav_config.yaml @@ -6,10 +6,9 @@ open_vocab_cat_map_file: projects/real_world_ovmm/configs/example_cat_map.json # Sparse Voxel Map parameters voxel_size: 0.1 -# 0.05 seems too low -obs_min_height: 0.1 # Ignore things less than this high when planning motions +obs_min_height: 0.2 # Ignore things less than this high when planning motions obs_max_height: 1.5 # Ignore things over this height (eg ceilings) -obs_min_density: 5 # This many points makes it an obstacle +obs_min_density: 15 # This many points makes it an obstacle exp_min_density: 1 # Padding @@ -27,7 +26,7 @@ filters: use_median_filter: True median_filter_size: 2 median_filter_max_error: 0.01 - use_derivative_filter: False + use_derivative_filter: True derivative_filter_threshold: 0.1 # use_voxel_filter: True diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py index 6ba142412..cf9c15e2b 100644 --- a/src/stretch/dynav/llm_localizer.py +++ b/src/stretch/dynav/llm_localizer.py @@ -172,6 +172,8 @@ def compute_coord(self, text, image_info, threshold = 0.25): def owl_locater(self, A, encoded_image, timestamps_lst): for i in timestamps_lst: + if i not in encoded_image: + continue image_info = encoded_image[i][-1] res = self.compute_coord(A, image_info, threshold=0.15) if res is not None: diff --git a/src/stretch/dynav/mapping_utils/a_star.py b/src/stretch/dynav/mapping_utils/a_star.py index ee70bd817..4fa4317cd 100644 --- a/src/stretch/dynav/mapping_utils/a_star.py +++ b/src/stretch/dynav/mapping_utils/a_star.py @@ -51,12 +51,19 @@ def compute_theta(self, cur_x, cur_y, end_x, end_y): def reset(self): # print('loading the up to date navigable map') - print('Wait') + # print('Wait') obs, exp = self.space.voxel_map.get_2d_map() # print('up to date navigable map loaded') self._navigable = ~obs & exp self.start_time = time.time() + def verify_path(self, path) -> bool: + self.reset() + for i in range(max(10, len(path))): + if self.point_is_occupied(*self.to_pt(path[i][:2])): + return False + return True + def point_is_occupied(self, x: int, y: int) -> bool: return not bool(self._navigable[x][y]) diff --git a/src/stretch/dynav/mapping_utils/voxel.py b/src/stretch/dynav/mapping_utils/voxel.py index b9098f43e..2d97b3cc8 100644 --- a/src/stretch/dynav/mapping_utils/voxel.py +++ b/src/stretch/dynav/mapping_utils/voxel.py @@ -54,7 +54,7 @@ VALID_FRAMES = ["camera", "world"] -DEFAULT_GRID_SIZE = [145, 145] +DEFAULT_GRID_SIZE = [200, 200] logger = logging.getLogger(__name__) @@ -647,6 +647,7 @@ def get_2d_map( # Crop to robot height min_height = int(self.obs_min_height / self.grid_resolution) max_height = int(self.obs_max_height / self.grid_resolution) + # print('min_height', min_height, 'max_height', max_height) grid_size = self.grid_size + [max_height] voxels = torch.zeros(grid_size, device=device) diff --git a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py index 1ffe278b2..dff9f70fe 100644 --- a/src/stretch/dynav/mapping_utils/voxel_map.py +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -349,10 +349,10 @@ def sample_target_point( reachable[reachable_xs, reachable_ys] = True obstacles, explored = self.voxel_map.get_2d_map() - if self.dilate_obstacles_kernel is not None: - obstacles = binary_dilation( - obstacles.float().unsqueeze(0).unsqueeze(0), self.dilate_obstacles_kernel - 
)[0, 0].bool() + # if self.dilate_obstacles_kernel is not None: + # obstacles = binary_dilation( + # obstacles.float().unsqueeze(0).unsqueeze(0), self.dilate_obstacles_kernel + # )[0, 0].bool() reachable = reachable & ~obstacles target_x, target_y = planner.to_pt(point) @@ -386,9 +386,9 @@ def sample_target_point( # print('Target is valid:', target_is_valid) if not target_is_valid: continue - if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) < 0.4: + if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.35: continue - elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: + elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.45: # print('OBSTACLE AVOIDANCE') # print(selected_target[0].int(), selected_target[1].int()) i = (point[0] - selected_target[0]) // abs(point[0] - selected_target[0]) diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index 13ca87be8..d3aa1e1a8 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -144,7 +144,7 @@ def pickup(self, width): curr_gripper_pose = self.robot.get_gripper_position() # print('Robot means to move gripper to', next_gripper_pos * self.STRETCH_GRIPPER_MAX) # print('Robot actually moves gripper to', curr_gripper_pose) - if next_gripper_pos == -1 or (curr_gripper_pose > max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.2) + 0.1): + if next_gripper_pos == -1: break if next_gripper_pos > 0: @@ -205,7 +205,8 @@ def move_to_joints(self, joints, gripper, mode=0): # Moving only the lift first if mode == 1: target1 = state - target1[1] = target_state[1] + target1[0] = target_state[0] + target1[1] = min(1.1, target_state[1] + 0.2) self.robot.arm_to(target1, blocking = True, head = np.array([self.pan, self.tilt])) # print('pan tilt after', self.robot.get_pan_tilt()) diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index 968cc9a72..3bc571037 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -14,7 +14,7 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): image_publisher = ImagePublisher(camera, socket) # Centering the object - head_tilt_angles = [0.1, 0, -0.1] + head_tilt_angles = [0, -0.1, 0.1] tilt_retries, side_retries = 1, 0 retry_flag = True head_tilt = INIT_HEAD_TILT @@ -33,17 +33,17 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): head_tilt=head_tilt) time.sleep(1) - elif (retry_flag !=0 and side_retries == 2): + elif (retry_flag !=0 and side_retries == 3): print("Tried in all angles but couldn't succed") time.sleep(1) return None, None, None, None - elif (side_retries == 2 and tilt_retries == 2): + elif (side_retries == 2 and tilt_retries == 3): hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) side_retries = 3 elif retry_flag == 2: - if (tilt_retries == 2): + if (tilt_retries == 3): if (side_retries == 0): hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) side_retries = 1 @@ -134,7 +134,8 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # Lifting the arm to high position as part of pregrasping position print('pan, tilt before', robot.robot.get_pan_tilt()) - robot.move_to_position(lift_pos = 1.05, gripper_pos = gripper_width, head_pan = None, head_tilt = None) + robot.move_to_position(gripper_pos = gripper_width) + 
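    # The pregrasp is staged as the two separate calls around this comment: the gripper
    # is driven to the predicted width first, then the lift is raised, so the fingers
    # are already open before the arm travels to the high pregrasp position.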
robot.move_to_position(lift_pos = 1.05, head_pan = None, head_tilt = None) print('pan, tilt after', robot.robot.get_pan_tilt()) # Rotation for aligning Robot gripper frame to Model gripper frame @@ -189,8 +190,11 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # The distance between gripper and point is covered gradullay to allow for velocity control when it approaches the object # Lower velocity helps is not topping the light objects diff = abs(curr_diff - ref_diff) - if diff > 0.06: - dist = diff - 0.06 + if diff > 0.08: + dist = diff - 0.08 + state = robot.robot.get_six_joints() + state[1] += 0.02 + robot.robot.arm_to(state, blocking = True) robot.move_to_pose( [0, 0, dist], [0, 0, 0], @@ -200,7 +204,6 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height while diff > 0.01: dist = min(0.03, diff) - print("Move to Secondary intermediate point with sleep 2s") robot.move_to_pose( [0, 0, dist], [0, 0, 0], @@ -218,4 +221,4 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height robot.move_to_position(arm_pos = 0.01) robot.move_to_position(wrist_pitch = 0.0) robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.9), wrist_yaw = 2.5) - robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.6)) + robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.55)) diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 6e6ad3d7b..c3805c127 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -88,7 +88,7 @@ def __init__( def look_around(self): print("*" * 10, "Look around to check", "*" * 10) - for pan in [0.5, -0.5, -1.5]: + for pan in [0.4, -0.4, -1.2]: for tilt in [-0.6]: start_time = time.time() self.robot.head_to(pan, tilt, blocking = True) @@ -292,15 +292,13 @@ def manipulate(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NO obj = text, socket = self.image_sender.manip_socket, hello_robot = self.manip_wrapper) - - print('Predicted width:', width) if rotation is None: return False - if width < 0.045 and self.re == 3: + if width < 0.05 and self.re == 3: gripper_width = 0.45 - if width < 0.075 and self.re == 3: + elif width < 0.075 and self.re == 3: gripper_width = 0.6 else: gripper_width = 1 diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 38d19e9e3..ae936cb89 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -69,7 +69,7 @@ def main( parameters["exploration_steps"] = explore_iter object_to_find, location_to_place = None, None robot.move_to_nav_posture() - robot.set_velocity(v = 15., w = 8.) + robot.set_velocity(v = 30., w = 15.) print("- Start robot agent with data collection") demo = RobotAgentMDP( @@ -108,54 +108,58 @@ def main( print('Exploration failed! 
Quitting!') continue else: - robot.move_to_nav_posture() - robot.switch_to_navigation_mode() - text = input('Enter object name: ') - point = demo.navigate(text) - if point is None: - print('Navigation Failure!') - continue - robot.switch_to_navigation_mode() - xyt = robot.get_base_pose() - xyt[2] = xyt[2] + np.pi / 2 - robot.navigate_to(xyt, blocking = True) - cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) - - if input('You want to run manipulation: y/n') == 'n': - continue - camera_xyz = robot.get_head_pose()[:3, 3] - theta = compute_tilt(camera_xyz, point) - demo.manipulate(text, theta) - # robot.switch_to_navigation_mode() - # xyt = robot.get_base_pose() - # xyt[2] = xyt[2] - np.pi / 2 - robot.look_front() - # robot.navigate_to(xyt, blocking = True) + text = None + point = None + if input('You want to run manipulation: y/n') != 'n': + robot.move_to_nav_posture() + robot.switch_to_navigation_mode() + text = input('Enter object name: ') + point = demo.navigate(text) + if point is None: + print('Navigation Failure!') + robot.switch_to_navigation_mode() + xyt = robot.get_base_pose() + xyt[2] = xyt[2] + np.pi / 2 + robot.navigate_to(xyt, blocking = True) + cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) + + if input('You want to run manipulation: y/n') != 'n': + robot.switch_to_manipulation_mode() + if text is None: + text = input('Enter object name: ') + camera_xyz = robot.get_head_pose()[:3, 3] + if point is not None: + theta = compute_tilt(camera_xyz, point) + else: + theta = -0.6 + demo.manipulate(text, theta) + robot.look_front() - robot.switch_to_navigation_mode() - if input('You want to run placing: y/n') == 'n': - continue - text = input('Enter receptacle name: ') - point = demo.navigate(text) - if point is None: - print('Navigation Failure') - continue - robot.switch_to_navigation_mode() - xyt = robot.get_base_pose() - xyt[2] = xyt[2] + np.pi / 2 - robot.navigate_to(xyt, blocking = True) - cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) + text = None + point = None + if input('You want to run placing: y/n') != 'n': + robot.switch_to_navigation_mode() + text = input('Enter receptacle name: ') + point = demo.navigate(text) + if point is None: + print('Navigation Failure') + robot.switch_to_navigation_mode() + xyt = robot.get_base_pose() + xyt[2] = xyt[2] + np.pi / 2 + robot.navigate_to(xyt, blocking = True) + cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) - if input('You want to run placing: y/n') == 'n': - continue - camera_xyz = robot.get_head_pose()[:3, 3] - theta = compute_tilt(camera_xyz, point) - demo.place(text, theta) - # robot.switch_to_navigation_mode() - # xyt = robot.get_base_pose() - # xyt[2] = xyt[2] - np.pi / 2 - robot.move_to_nav_posture() - # robot.navigate_to(xyt, blocking = True) + if input('You want to run placing: y/n') != 'n': + robot.switch_to_manipulation_mode() + if text is None: + text = input('Enter receptacle name: ') + camera_xyz = robot.get_head_pose()[:3, 3] + if point is not None: + theta = compute_tilt(camera_xyz, point) + else: + theta = -0.6 + demo.place(text, theta) + robot.move_to_nav_posture() demo.save() diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index 373aab0c8..d43868fbd 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -22,6 +22,8 @@ import math from PIL import Image +from sklearn.cluster import DBSCAN + def 
get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] @@ -161,14 +163,16 @@ def find_obs_id_for_A(self, A): alignments = self.find_alignment_over_model(A).cpu() return obs_counts[alignments.argmax(dim = -1)].detach().cpu() - def compute_coord(self, text, obs_id, threshold = 0.2): - if obs_id <= 0: + def compute_coord(self, text, obs_id, threshold = 0.25): + # print(obs_id, len(self.voxel_map_wrapper.observations)) + if obs_id <= 0 or obs_id > len(self.voxel_map_wrapper.observations): return None rgb = self.voxel_map_wrapper.observations[obs_id - 1].rgb pose = self.voxel_map_wrapper.observations[obs_id - 1].camera_pose depth = self.voxel_map_wrapper.observations[obs_id - 1].depth K = self.voxel_map_wrapper.observations[obs_id - 1].camera_K xyzs = get_xyz(depth, pose, K)[0] + # rgb[depth >= 2.5] = 0 rgb = rgb.permute(2, 0, 1).to(torch.uint8) inputs = self.exist_processor(text=[['a photo of a ' + text]], images=rgb, return_tensors="pt") for input in inputs: @@ -181,6 +185,12 @@ def compute_coord(self, text, obs_id, threshold = 0.2): results = self.exist_processor.image_processor.post_process_object_detection( outputs, threshold=threshold, target_sizes=target_sizes )[0] + # if len(results['scores']) == 0: + # return None + # tl_x, tl_y, br_x, br_y = results['boxes'][torch.argmax(results['scores'])] + # w, h = depth.shape + # tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) + # return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values for idx, (score, bbox) in enumerate(sorted(zip(results['scores'], results['boxes']), key=lambda x: x[0], reverse=True)): tl_x, tl_y, br_x, br_y = bbox @@ -191,9 +201,61 @@ def compute_coord(self, text, obs_id, threshold = 0.2): return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values return None + def verify_point(self, A, point, distance_threshold = 0.1, similarity_threshold = 0.14): + if isinstance(point, np.ndarray): + point = torch.from_numpy(point) + points, _, _, _ = self.voxel_pcd.get_pointcloud() + distances = torch.linalg.norm(point - points.detach().cpu(), dim = -1) + if torch.min(distances) > distance_threshold: + print('Points are so far from other points!') + return False + alignments = self.find_alignment_over_model(A).detach().cpu()[0] + if torch.max(alignments[distances <= distance_threshold]) < similarity_threshold: + print('Points close the the point are not similar to the text!') + return torch.max(alignments[distances < distance_threshold]) >= similarity_threshold + def localize_A(self, A, debug = True, return_debug = False): + # centroids, extends, similarity_max_list, points, obs_max_list, debug_text = self.find_clusters_for_A(A, return_obs_counts = True, debug = debug) + # if len(centroids) == 0: + # if not debug: + # return None + # else: + # return None, debug_text + # target_point = None + # obs = None + # similarity = None + # point = None + # for idx, (centroid, obs, similarity, point) in enumerate(sorted(zip(centroids, obs_max_list, similarity_max_list, points), key=lambda x: x[2], reverse=True)): + # res = self.compute_coord(A, obs) + # if res is not None: + # target_point = res + # debug_text += '#### - Obejct is detected in observations where instance' + str(idx + 1) + ' comes from. 
**😃** Directly navigate to it.\n' + # break + # if self.siglip: + # cosine_similarity_check = similarity > 0.14 + # else: + # cosine_similarity_check = similarity > 0.3 + # if cosine_similarity_check: + # target_point = centroid + + # debug_text += '#### - Instance ' + str(idx + 1) + ' has high cosine similarity (' + str(round(similarity, 3)) + '). **😃** Directly navigate to it.\n' + + # break + + # debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' + + # if target_point is None: + # debug_text += '#### - All instances are not the target! Maybe target object has not been observed yet. **😭**\n' + # if not debug: + # return target_point + # elif not return_debug: + # return target_point, debug_text + # else: + # return target_point, debug_text, obs, point points, _, _, _ = self.voxel_pcd.get_pointcloud() alignments = self.find_alignment_over_model(A).cpu() + # alignments = alignments[0, points[:, -1] >= 0.1] + # points = points[points[:, -1] >= 0.1] point = points[alignments.argmax(dim = -1)].detach().cpu().squeeze() obs_counts = self.voxel_pcd._obs_counts image_id = obs_counts[alignments.argmax(dim = -1)].detach().cpu() @@ -224,3 +286,92 @@ def localize_A(self, A, debug = True, return_debug = False): return target_point, debug_text else: return target_point, debug_text, image_id, point + + def find_clusters_for_A(self, A, return_obs_counts = False, debug = False, turning_point = None): + + debug_text = '' + + points, features, _, _ = self.voxel_pcd.get_pointcloud() + alignments = self.find_alignment_over_model(A).cpu().reshape(-1).detach().numpy() + if turning_point is None: + if self.siglip: + turning_point = min(0.14, alignments[np.argsort(alignments)[-20]]) + else: + turning_point = min(0.3, alignments[np.argsort(alignments)[-20]]) + mask = alignments >= turning_point + alignments = alignments[mask] + points = points[mask] + if len(points) == 0: + + debug_text += '### - No instance found! Maybe target object has not been observed yet. 
**😭**\n' + + output = [[], [], [], []] + if return_obs_counts: + output.append([]) + if debug: + output.append(debug_text) + + return output + else: + if return_obs_counts: + obs_ids = self.voxel_pcd._obs_counts.detach().cpu().numpy()[mask] + centroids, extends, similarity_max_list, points, obs_max_list = find_clusters(points.detach().cpu().numpy(), alignments, obs = obs_ids) + output = [centroids, extends, similarity_max_list, points, obs_max_list] + else: + centroids, extends, similarity_max_list, points = find_clusters(points.detach().cpu().numpy(), alignments, obs = None) + output = [centroids, extends, similarity_max_list, points] + + debug_text += '### - Found ' + str(len(centroids)) + ' instances that might be target object.\n' + if debug: + output.append(debug_text) + + return output + + +def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs = None): + # Calculate the number of top values directly + top_positions = vertices + # top_values = probability_over_all_points[top_indices].flatten() + + # Apply DBSCAN clustering + dbscan = DBSCAN(eps=0.25, min_samples=5) + clusters = dbscan.fit(top_positions) + labels = clusters.labels_ + + # Initialize empty lists to store centroids and extends of each cluster + centroids = [] + extends = [] + similarity_list = [] + points = [] + obs_max_list = [] + + for cluster_id in set(labels): + if cluster_id == -1: # Ignore noise + continue + + members = top_positions[labels == cluster_id] + centroid = np.mean(members, axis=0) + + similarity_values = similarity[labels == cluster_id] + simiarity = np.max(similarity_values) + + if obs is not None: + obs_values = obs[labels == cluster_id] + obs_max = np.max(obs_values) + + sx = np.max(members[:, 0]) - np.min(members[:, 0]) + sy = np.max(members[:, 1]) - np.min(members[:, 1]) + sz = np.max(members[:, 2]) - np.min(members[:, 2]) + + # Append centroid and extends to the lists + centroids.append(centroid) + extends.append((sx, sy, sz)) + similarity_list.append(simiarity) + points.append(members) + if obs is not None: + obs_max_list.append(obs_max) + + if obs is not None: + return centroids, extends, similarity_list, points, obs_max_list + else: + return centroids, extends, similarity_list, points \ No newline at end of file diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index 49adcdcc6..1ebb17e59 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -48,6 +48,7 @@ from PIL import Image from stretch.dynav.communication_util import load_socket, send_array, recv_array, send_rgb_img, recv_rgb_img, send_depth_img, recv_depth_img, send_everything, recv_everything +# from stretch.utils.morphology import get_edges def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) @@ -106,7 +107,7 @@ def get_xyz(depth, pose, intrinsics): class ImageProcessor: def __init__(self, - vision_method = 'mask*lip', + vision_method = 'mask&*lip', siglip = True, device = 'cuda', min_depth = 0.25, @@ -144,6 +145,8 @@ def __init__(self, self.create_vision_model() self.voxel_map_lock = threading.Lock() # Create a lock for synchronizing access to `self.voxel_map_localizer` + + self.traj = None if open_communication: self.img_socket = load_socket(img_port) @@ -232,45 +235,95 @@ def process_text(self, text, start_pose): debug_text = '' mode = 'navigation' obs = None - # Do visual grounding - if text is not None and text != '': - with self.voxel_map_lock: - localized_point, debug_text, obs, pointcloud = 
self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) - if localized_point is not None: - rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) - # Do Frontier based exploration - if text is None or text == '' or localized_point is None: - debug_text += '## Navigation fails, so robot starts exploring environments.\n' - localized_point = self.sample_frontier(start_pose, text) - mode = 'exploration' - rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) - print('\n', localized_point, '\n') - - if localized_point is None: - return [] + localized_point = None + waypoints = None - if len(localized_point) == 2: - localized_point = np.array([localized_point[0], localized_point[1], 0]) - - # if torch.linalg.norm((torch.tensor(localized_point) - torch.tensor(start_pose))[:2]) < 0.7: - # return [[np.nan, np.nan, np.nan], localized_point] - - point = self.sample_navigation(start_pose, localized_point) + if text is not None and text != '' and self.traj is not None: + print('saved traj', self.traj) + traj_target_point = self.traj[-1] + if self.voxel_map_localizer.verify_point(text, traj_target_point): + localized_point = traj_target_point + debug_text += '## Last visual grounding results looks fine so directly use it.\n' + if self.planner.verify_path(self.traj[:-2]): + waypoints = self.traj[:-2] + debug_text += '## Last path planning results looks fine so directly use it.\n' + + if waypoints is None: + # Do visual grounding + if text is not None and text != '' and localized_point is None: + with self.voxel_map_lock: + localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) + if localized_point is not None: + rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) + # Do Frontier based exploration + if text is None or text == '' or localized_point is None: + debug_text += '## Navigation fails, so robot starts exploring environments.\n' + localized_point = self.sample_frontier(start_pose, text) + mode = 'exploration' + rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) + print('\n', localized_point, '\n') + + if localized_point is None: + return [] + + if len(localized_point) == 2: + localized_point = np.array([localized_point[0], localized_point[1], 0]) + + point = self.sample_navigation(start_pose, localized_point) + + if self.rerun: + buf = BytesIO() + plt.savefig(buf, format='png') + buf.seek(0) + img = Image.open(buf) + img = np.array(img) + buf.close() + rr.log('2d_map', rr.Image(img), static = self.static) + else: + if text != '': + plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') + else: + plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') + plt.clf() + + if obs is not None and mode == 'navigation': + rgb = self.voxel_map.observations[obs - 1].rgb + if not self.rerun: + if isinstance(rgb, torch.Tensor): + rgb = np.array(rgb) + cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) + else: + rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) + waypoints = None - if self.rerun: - buf = BytesIO() - plt.savefig(buf, format='png') - buf.seek(0) - img = Image.open(buf) - img = np.array(img) - buf.close() - rr.log('2d_map', rr.Image(img), 
static = self.static) - else: - if text != '': - plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') + if point is None: + print('Unable to find any target point, some exception might happen') + else: + print('Target point is', point) + res = self.planner.plan(start_pose, point) + if res.success: + waypoints = [pt.state for pt in res.trajectory] + else: + waypoints = None + print('[FAILURE]', res.reason) + # If we are navigating to some object of interst, send (x, y, z) of + # the object so that we can make sure the robot looks at the object after navigation + traj = [] + if waypoints is not None: + finished = len(waypoints) <= 10 and mode == 'navigation' + if finished: + self.traj = None else: - plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') - plt.clf() + self.traj = waypoints[8:] + [[np.nan, np.nan, np.nan], localized_point] + if not finished: + waypoints = waypoints[:8] + traj = self.planner.clean_path_for_xy(waypoints) + if finished: + traj.append([np.nan, np.nan, np.nan]) + if isinstance(localized_point, torch.Tensor): + localized_point = localized_point.tolist() + traj.append(localized_point) + print('Planned trajectory:', traj) if self.rerun: if text is not None and text != '': @@ -279,40 +332,6 @@ def process_text(self, text, start_pose): debug_text = '### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n' debug_text = '# Robot\'s monologue: \n' + debug_text rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) - - if obs is not None and mode == 'navigation': - rgb = self.voxel_map.observations[obs - 1].rgb - if not self.rerun: - if isinstance(rgb, torch.Tensor): - rgb = np.array(rgb) - cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) - else: - rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) - traj = [] - waypoints = None - - if point is None: - print('Unable to find any target point, some exception might happen') - else: - print('Target point is', point) - res = self.planner.plan(start_pose, point) - if res.success: - waypoints = [pt.state for pt in res.trajectory] - # If we are navigating to some object of interst, send (x, y, z) of - # the object so that we can make sure the robot looks at the object after navigation - finished = len(waypoints) <= 10 and mode == 'navigation' - if not finished: - waypoints = waypoints[:8] - traj = self.planner.clean_path_for_xy(waypoints) - # traj = traj[1:] - if finished: - traj.append([np.nan, np.nan, np.nan]) - if isinstance(localized_point, torch.Tensor): - localized_point = localized_point.tolist() - traj.append(localized_point) - print('Planned trajectory:', traj) - else: - print('[FAILURE]', res.reason) if traj is not None: origins = [] @@ -455,10 +474,10 @@ def sample_navigation(self, start, point, mode = 'navigation'): start_pt = self.planner.to_pt(start) plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') point_pt = self.planner.to_pt(point) - plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'g') + plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'r') if goal is not None: goal_pt = self.planner.to_pt(goal) - plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'r') + plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'g') return goal def sample_frontier(self, start_pose = [0, 0, 0], text = None): @@ -549,7 +568,7 @@ def extract_mask_clip_features(self, x, image_shape): def run_mask_clip(self, 
rgb, mask, world_xyz): with torch.no_grad(): - results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.15, verbose=False) + results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.05, verbose=False) xyxy_tensor = results[0].boxes.xyxy if len(xyxy_tensor) == 0: return @@ -581,7 +600,7 @@ def run_mask_clip(self, rgb, mask, world_xyz): def run_owl_sam_clip(self, rgb, mask, world_xyz): with torch.no_grad(): - results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.15, verbose=False) + results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.05, verbose=False) xyxy_tensor = results[0].boxes.xyxy if len(xyxy_tensor) == 0: return @@ -686,15 +705,17 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): scipy.ndimage.median_filter(depth, size=5) ) median_filter_error = (depth - median_depth).abs() + # edges = get_edges(depth, threshold=0.5) valid_depth = torch.logical_and(depth < self.max_depth, depth > self.min_depth) valid_depth = ( valid_depth & (median_filter_error < 0.01).bool() + # & ~edges ) with self.voxel_map_lock: if self.vision_method == 'mask&*lip': - min_samples_clear = 5 + min_samples_clear = 3 else: min_samples_clear = 10 self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose), min_samples_clear = min_samples_clear) @@ -729,6 +750,8 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): # print('image processing takes ' + str(end_time - start_time) + ' seconds.') def read_from_pickle(self, pickle_file_name, num_frames: int = -1): + print('Reading from ', pickle_file_name) + rr.init('Debug', spawn = True) if isinstance(pickle_file_name, str): pickle_file_name = Path(pickle_file_name) assert pickle_file_name.exists(), f"No file found at {pickle_file_name}" @@ -763,9 +786,11 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): xyz = self.voxel_map.fix_data_type(xyz) rgb = self.voxel_map.fix_data_type(rgb) depth = self.voxel_map.fix_data_type(depth) + intrinsics = self.voxel_map.fix_data_type(K) if feats is not None: feats = self.voxel_map.fix_data_type(feats) base_pose = self.voxel_map.fix_data_type(base_pose) + self.voxel_map.voxel_pcd.clear_points(depth, intrinsics, camera_pose) self.voxel_map.add( camera_pose=camera_pose, xyz=xyz, @@ -775,6 +800,24 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): base_pose=base_pose, camera_K=K, ) + # points = self.voxel_map.voxel_pcd._points.detach().cpu() + # rgb = self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255. 
+ # rr.log("Obstalce_map/pointcloud", rr.Points3D(points[points[:, 2] > 0.1], \ + # colors=rgb[points[:, 2] > 0.1], radii=0.03)) + # plt.clf() + # obstacles, _ = self.voxel_map.get_2d_map() + # plt.imshow(obstacles) + # grid_coord = self.voxel_map.xy_to_grid_coords(camera_pose[:2, 3]) + # plt.scatter(grid_coord[0], grid_coord[1], color = 'r') + # buf = BytesIO() + # plt.savefig(buf, format='png') + # buf.seek(0) + # img = Image.open(buf) + # img = np.array(img) + # buf.close() + # rr.log('2d_map', rr.Image(img)) + # rr.log('robot_point', rr.Points3D(camera_pose[:3, 3], colors=[1, 0, 0], radii=0.2)) + self.obs_count += 1 self.voxel_map_localizer.voxel_pcd._points = data["combined_xyz"] self.voxel_map_localizer.voxel_pcd._features = data["combined_feats"] @@ -799,18 +842,17 @@ def write_to_pickle(self): data["rgb"] = [] data["depth"] = [] data["feats"] = [] - - for idx, frame in enumerate(self.voxel_map.observations): + for frame in self.voxel_map.observations: # add it to pickle # TODO: switch to using just Obs struct? + data["camera_poses"].append(frame.camera_pose) + data["base_poses"].append(frame.base_pose) + data["camera_K"].append(frame.camera_K) data["xyz"].append(frame.xyz) data["world_xyz"].append(frame.full_world_xyz) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) data["feats"].append(frame.feats) - data["base_poses"].append(frame.base_pose) - data["xyz"].append(None) - data["world_xyz"].append(None) - data["feats"].append(None) - data["base_poses"].append(None) for k, v in frame.info.items(): if k not in data: data[k] = [] @@ -828,18 +870,18 @@ def write_to_pickle(self): print('write all data to', filename) # @hydra.main(version_base="1.2", config_path=".", config_name="config.yaml") -def main(cfg): - torch.manual_seed(1) - imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) - # imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) - # if not cfg.pickle_file_name is None: - # imageProcessor.read_from_pickle(cfg.pickle_file_name) - # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) - # if cfg.open_communication: - # while True: - # imageProcessor.recv_text() - imageProcessor.read_from_pickle('debug/debug_2024-09-07_18-15-08.pkl', -1) - imageProcessor.write_to_pickle() +# def main(cfg): +# torch.manual_seed(1) +# imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) +# # imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) +# # if not cfg.pickle_file_name is None: +# # imageProcessor.read_from_pickle(cfg.pickle_file_name) +# # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) +# # if cfg.open_communication: +# # while True: +# # imageProcessor.recv_text() +# imageProcessor.read_from_pickle('debug/debug_2024-09-09_07-50-41.pkl', -1) +# imageProcessor.write_to_pickle() if __name__ == "__main__": main(None) \ No newline at end of file From 4b207c2ca9790343fe1b03cde45f6b292e418116 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 15:33:06 -0400 Subject: [PATCH 096/410] update code --- src/stretch/app/grasp_object.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stretch/app/grasp_object.py b/src/stretch/app/grasp_object.py index 74ecac408..490534e08 100644 --- a/src/stretch/app/grasp_object.py +++ b/src/stretch/app/grasp_object.py @@ -32,6 +32,7 @@ def get_task(robot, demo, target_object): 
demo, ) grasp_object.configure( + target_object=target_object, show_object_to_grasp=True, servo_to_grasp=True, show_servo_gui=True, From 0a03ea5694b299b32612536f8835e4760dbfc41a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 16:00:12 -0400 Subject: [PATCH 097/410] closed loop grasping class --- src/stretch/agent/base/managed_operation.py | 4 +- src/stretch/agent/operations/__init__.py | 1 + .../agent/operations/grasp_closed_loop.py | 546 ++++++++++++++++++ src/stretch/agent/operations/update.py | 87 +-- src/stretch/agent/robot_agent.py | 18 +- src/stretch/app/grasp_object.py | 1 + 6 files changed, 616 insertions(+), 41 deletions(-) create mode 100644 src/stretch/agent/operations/grasp_closed_loop.py diff --git a/src/stretch/agent/base/managed_operation.py b/src/stretch/agent/base/managed_operation.py index dd650ae8b..be68bd477 100644 --- a/src/stretch/agent/base/managed_operation.py +++ b/src/stretch/agent/base/managed_operation.py @@ -50,9 +50,9 @@ def name(self) -> str: """ return self._name - def update(self): + def update(self, **kwargs): print(colored("================ Updating the world model ==================", "blue")) - self.agent.update() + self.agent.update(**kwargs) def attempt(self, message: str): print(colored(f"Trying {self.name}:", "blue"), message) diff --git a/src/stretch/agent/operations/__init__.py b/src/stretch/agent/operations/__init__.py index e6b4f5f0f..71fb89fc2 100644 --- a/src/stretch/agent/operations/__init__.py +++ b/src/stretch/agent/operations/__init__.py @@ -16,6 +16,7 @@ WaveOperation, WithdrawOperation, ) +from .grasp_closed_loop import ClosedLoopGraspObjectOperation from .grasp_object import GraspObjectOperation from .navigate import NavigateToObjectOperation from .place_object import PlaceObjectOperation diff --git a/src/stretch/agent/operations/grasp_closed_loop.py b/src/stretch/agent/operations/grasp_closed_loop.py new file mode 100644 index 000000000..0b98f38da --- /dev/null +++ b/src/stretch/agent/operations/grasp_closed_loop.py @@ -0,0 +1,546 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
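# The module below adds ClosedLoopGraspObjectOperation, a grasp routine that keeps the
# end-effector camera in the loop: it segments the wrist view for the target class,
# filters the mask over time, compares the mask centroid with the gripper center found
# via aruco markers, and steps base_x, lift, arm, and wrist pitch until the object is
# centered and close enough to close the gripper, with an open-loop grasp as the
# fallback when tracking is lost.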
+ +import time +import timeit +from typing import Optional, Tuple + +import cv2 +import numpy as np + +import stretch.motion.constants as constants +from stretch.agent.base import ManagedOperation +from stretch.core.interfaces import Observations +from stretch.mapping.instance import Instance +from stretch.motion.kinematics import HelloStretchIdx +from stretch.utils.filters import MaskTemporalFilter +from stretch.utils.geometry import point_global_to_base +from stretch.utils.gripper import GripperArucoDetector +from stretch.utils.point_cloud import show_point_cloud + + +class ClosedLoopGraspObjectOperation(ManagedOperation): + """Move the robot to grasp, using the end effector camera.""" + + use_pitch_from_vertical: bool = True + lift_distance: float = 0.2 + servo_to_grasp: bool = False + _success: bool = False + + # Task information + target_object: Optional[str] = None + + # Debugging UI elements + show_object_to_grasp: bool = False + show_servo_gui: bool = True + show_point_cloud: bool = False + + # Thresholds for centering on object + align_x_threshold: int = 10 + align_y_threshold: int = 7 + grasp_loose: bool = False + reset_observation: bool = False + + # Visual servoing config + track_image_center: bool = False + gripper_aruco_detector: GripperArucoDetector = None + min_points_to_approach: int = 100 + detected_center_offset_y: int = -40 + median_distance_when_grasping: float = 0.175 + percentage_of_image_when_grasping: float = 0.2 + open_loop_z_offset: float = -0.1 + open_loop_x_offset: float = -0.1 + max_failed_attempts: int = 10 + + # Movement parameters + lift_arm_ratio: float = 0.08 + base_x_step: float = 0.10 + wrist_pitch_step: float = 0.075 + + # Timing issues + expected_network_delay = 0.4 + open_loop: bool = False + + # Observation memory + observations = MaskTemporalFilter( + observation_history_window_size_secs=5.0, observation_history_window_size_n=3 + ) + + def configure( + self, + target_object: str, + show_object_to_grasp: bool = False, + servo_to_grasp: bool = False, + show_servo_gui: bool = True, + show_point_cloud: bool = False, + reset_observation: bool = False, + grasp_loose: bool = False, + ): + """Configure the operation with the given keyword arguments. + + Args: + show_object_to_grasp (bool, optional): Show the object to grasp. Defaults to False. + servo_to_grasp (bool, optional): Use visual servoing to grasp. Defaults to False. + show_servo_gui (bool, optional): Show the servo GUI. Defaults to True. + show_point_cloud (bool, optional): Show the point cloud. Defaults to False. + reset_observation (bool, optional): Reset the observation. Defaults to False. + grasp_loose (bool, optional): Grasp loosely. Useful for grasping some objects like cups. Defaults to False. + """ + self.target_object = target_object + self.show_object_to_grasp = show_object_to_grasp + self.servo_to_grasp = servo_to_grasp + self.show_servo_gui = show_servo_gui + self.show_point_cloud = show_point_cloud + self.reset_observation = reset_observation + self.grasp_loose = grasp_loose + + def _debug_show_point_cloud(self, servo: Observations, current_xyz: np.ndarray) -> None: + """Show the point cloud for debugging purposes. 
+ + Args: + servo (Observations): Servo observation + current_xyz (np.ndarray): Current xyz location + """ + # TODO: remove this, overrides existing servo state + # servo = self.robot.get_servo_observation() + world_xyz = servo.get_ee_xyz_in_world_frame() + world_xyz_head = servo.get_xyz_in_world_frame() + all_xyz = np.concatenate([world_xyz_head.reshape(-1, 3), world_xyz.reshape(-1, 3)], axis=0) + all_rgb = np.concatenate([servo.rgb.reshape(-1, 3), servo.ee_rgb.reshape(-1, 3)], axis=0) + show_point_cloud(all_xyz, all_rgb / 255, orig=current_xyz) + + def can_start(self): + """Grasping can start if we have a target object picked out, and are moving to its instance, and if the robot is ready to begin manipulation.""" + if self.target_object is None: + self.error("No target object set.") + return False + return self.agent.current_object is not None and self.robot.in_manipulation_mode() + + def get_class_mask(self, servo: Observations) -> np.ndarray: + """Get the mask for the class of the object we are trying to grasp. Multiple options might be acceptable. + + Args: + servo (Observations): Servo observation + + Returns: + np.ndarray: Mask for the class of the object we are trying to grasp + """ + mask = np.zeros_like(servo.semantic).astype(bool) # type: ignore + for iid in np.unique(servo.semantic): + name = self.agent.semantic_sensor.get_class_name_for_id(iid) + if name is not None and self.target_object in name: + mask = np.bitwise_or(mask, servo.semantic == iid) + return mask + + def set_target_object_class(self, target_object: str): + """Set the target object class. + + Args: + target_object (str): Target object class + """ + self.target_object = target_object + + def get_target_mask( + self, + servo: Observations, + instance: Instance, + center: Tuple[int, int], + prev_mask: Optional[np.ndarray] = None, + ) -> Optional[np.ndarray]: + """Get target mask to move to. If we do not provide the mask from the previous step, we will simply find the mask with the most points of the correct class. Otherwise, we will try to find the mask that most overlaps with the previous mask. There are two options here: one where we simply find the mask with the most points, and another where we try to find the mask that most overlaps with the previous mask. This is in case we are losing track of particular objects and getting classes mixed up. + + Args: + servo (Observations): Servo observation + instance (Instance): Instance we are trying to grasp + prev_mask (Optional[np.ndarray], optional): Mask from the previous step. Defaults to None. + + Returns: + Optional[np.ndarray]: Target mask to move to + """ + # Find the best masks + class_mask = self.get_class_mask(servo) + instance_mask = servo.instance + if servo.ee_xyz is None: + servo.compute_ee_xyz() + + target_mask = None + target_mask_pts = float("-inf") + maximum_overlap_mask = None + maximum_overlap_pts = float("-inf") + center_x, center_y = center + for iid in np.unique(instance_mask): + current_instance_mask = instance_mask == iid + + # If we are centered on the mask and it's the right class, just go for it + if class_mask[center_y, center_x] > 0 and current_instance_mask[center_y, center_x] > 0: + # This is the correct one - it's centered and the right class. Just go there. + print("!!! 
CENTERED ON THE RIGHT OBJECT !!!") + return current_instance_mask + + # Option 2 - try to find the map that most overlapped with what we were just trying to grasp + # This is in case we are losing track of particular objects and getting classes mixed up + if prev_mask is not None: + # Find the mask with the most points + mask = np.bitwise_and(current_instance_mask, prev_mask) + mask = np.bitwise_and(mask, class_mask) + num_pts = sum(mask.flatten()) + + if num_pts > maximum_overlap_pts: + maximum_overlap_pts = num_pts + maximum_overlap_mask = mask + + # Simply find the mask with the most points + mask = np.bitwise_and(current_instance_mask, class_mask) + num_pts = sum(mask.flatten()) + if num_pts > target_mask_pts: + target_mask = mask + target_mask_pts = num_pts + + if maximum_overlap_pts > self.min_points_to_approach: + return maximum_overlap_mask + if target_mask is not None: + return target_mask + else: + return prev_mask + + def _grasp(self) -> bool: + """Helper function to close gripper around object.""" + self.cheer("Grasping object!") + self.robot.close_gripper(loose=self.grasp_loose, blocking=True) + time.sleep(0.5) + + # Get a joint state for the object + joint_state = self.robot.get_joint_positions() + + # Lifted joint state + lifted_joint_state = joint_state.copy() + lifted_joint_state[HelloStretchIdx.LIFT] += 0.2 + self.robot.arm_to(lifted_joint_state, head=constants.look_at_ee, blocking=True) + return True + + def visual_servo_to_object( + self, instance: Instance, max_duration: float = 120.0, max_not_moving_count: int = 10 + ) -> bool: + """Use visual servoing to grasp the object.""" + + self.intro(f"Visual servoing to grasp object {instance.global_id} {instance.category_id=}.") + if self.show_servo_gui: + self.warn("If you want to stop the visual servoing with the GUI up, press 'q'.") + + t0 = timeit.default_timer() + aligned_once = False + pregrasp_done = False + prev_target_mask = None + success = False + prev_lift = float("Inf") + + # Track the fingertips using aruco markers + if self.gripper_aruco_detector is None: + self.gripper_aruco_detector = GripperArucoDetector() + + # Track the last object location and the number of times we've failed to grasp + current_xyz = None + failed_counter = 0 + not_moving_count = 0 + q_last = np.array([0.0 for _ in range(11)]) # 11 DOF, HelloStretchIdx + + # Main loop - run unless we time out, blocking. 
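        # Each pass below: grab a servo observation, run semantic segmentation for the
        # target class, push the mask into the temporal filter, compare the mask
        # centroid with the gripper center from the aruco detector, then either step
        # the gripper forward along its pitch axis (aligned) or adjust base_x and
        # wrist_pitch to re-center the object (not aligned).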
+ while timeit.default_timer() - t0 < max_duration: + + # Get servo observation + servo = self.robot.get_servo_observation() + joint_state = self.robot.get_joint_positions() + world_xyz = servo.get_ee_xyz_in_world_frame() + + if not self.open_loop: + # Now compute what to do + base_x = joint_state[HelloStretchIdx.BASE_X] + wrist_pitch = joint_state[HelloStretchIdx.WRIST_PITCH] + arm = joint_state[HelloStretchIdx.ARM] + lift = joint_state[HelloStretchIdx.LIFT] + + # Compute the center of the image that we will be tracking + if self.track_image_center: + center_x, center_y = servo.ee_rgb.shape[1] // 2, servo.ee_rgb.shape[0] // 2 + else: + center = self.gripper_aruco_detector.detect_center(servo.ee_rgb) + if center is not None: + center_y, center_x = np.round(center).astype(int) + center_y += self.detected_center_offset_y + else: + center_x, center_y = servo.ee_rgb.shape[1] // 2, servo.ee_rgb.shape[0] // 2 + + # add offset to center + center_x -= 10 # move closer to top + + # Run semantic segmentation on it + servo = self.agent.semantic_sensor.predict(servo, ee=True) + latest_mask = self.get_target_mask( + servo, instance, prev_mask=prev_target_mask, center=(center_x, center_y) + ) + + # dilate mask + kernel = np.ones((3, 3), np.uint8) + mask_np = latest_mask.astype(np.uint8) + dilated_mask = cv2.dilate(mask_np, kernel, iterations=1) + latest_mask = dilated_mask.astype(bool) + + # push to history + self.observations.push_mask_to_observation_history( + observation=latest_mask, + timestamp=time.time(), + mask_size_threshold=self.min_points_to_approach, + acquire_lock=True, + ) + + target_mask = self.observations.get_latest_observation() + if target_mask is None: + target_mask = np.zeros([servo.ee_rgb.shape[0], servo.ee_rgb.shape[1]], dtype=bool) + + # Get depth + center_depth = servo.ee_depth[center_y, center_x] / 1000 + + # Compute the center of the mask in image coords + mask_center = self.observations.get_latest_centroid() + if mask_center is None: + # if not aligned_once: + # self.error( + # "Lost track before even seeing object with EE camera. Just try open loop." + # ) + # if self.show_servo_gui: + # cv2.destroyAllWindows() + # return False + if failed_counter < self.max_failed_attempts: + mask_center = np.array([center_y, center_x]) + else: + # If we are aligned, but we lost the object, just try to grasp it + self.error(f"Lost track. Trying to grasp at {current_xyz}.") + if current_xyz is not None: + current_xyz[0] += self.open_loop_x_offset + current_xyz[2] += self.open_loop_z_offset + if self.show_servo_gui: + cv2.destroyAllWindows() + return self.grasp_open_loop(current_xyz) + else: + failed_counter = 0 + mask_center = mask_center.astype(int) + assert ( + world_xyz.shape[0] == servo.semantic.shape[0] + and world_xyz.shape[1] == servo.semantic.shape[1] + ), "World xyz shape does not match semantic shape." 
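                    # Look up the 3D point under the mask centroid in the wrist
                    # camera point cloud; this running estimate is what the
                    # open-loop fallback grasps toward if tracking is lost later.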
+ current_xyz = world_xyz[int(mask_center[0]), int(mask_center[1])] + if self.show_point_cloud: + self._debug_show_point_cloud(servo, current_xyz) + + # Optionally display which object we are servoing to + if self.show_servo_gui: + servo_ee_rgb = cv2.cvtColor(servo.ee_rgb, cv2.COLOR_RGB2BGR) + mask = target_mask.astype(np.uint8) * 255 + mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR) + mask[:, :, 0] = 0 + servo_ee_rgb = cv2.addWeighted(servo_ee_rgb, 0.5, mask, 0.5, 0, servo_ee_rgb) + # Draw the center of the image + servo_ee_rgb = cv2.circle(servo_ee_rgb, (center_x, center_y), 5, (255, 0, 0), -1) + # Draw the center of the mask + servo_ee_rgb = cv2.circle( + servo_ee_rgb, (int(mask_center[1]), int(mask_center[0])), 5, (0, 255, 0), -1 + ) + cv2.imshow("servo_ee_rgb", servo_ee_rgb) + cv2.waitKey(1) + res = cv2.waitKey(1) & 0xFF # 0xFF is a mask to get the last 8 bits + if res == ord("q"): + break + + if not pregrasp_done and current_xyz is not None: + self.pregrasp_open_loop(current_xyz, distance_from_object=0.075) + pregrasp_done = True + else: + # check not moving threshold + if not_moving_count > max_not_moving_count: + success = self._grasp() + break + # If we have a target mask, compute the median depth of the object + # Otherwise we will just try to grasp if we are close enough - assume we lost track! + if target_mask is not None: + object_depth = servo.ee_depth[target_mask] + median_object_depth = np.median(servo.ee_depth[target_mask]) / 1000 + else: + print("detected classes:", np.unique(servo.ee_semantic)) + if center_depth < self.median_distance_when_grasping: + success = self._grasp() + continue + + dx, dy = mask_center[1] - center_x, mask_center[0] - center_y + + # Is the center of the image part of the target mask or not? + center_in_mask = target_mask[int(center_y), int(center_x)] > 0 + # TODO: add deadband bubble around this? 
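                # The control step further down is a simple proportional law: when the
                # centroid offsets (dx, dy) fall inside the align thresholds, the
                # gripper advances by lift_arm_ratio, split between arm and lift using
                # cos/sin of the wrist pitch; otherwise base_x and wrist_pitch are
                # nudged in proportion to the normalized pixel error.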
+ + # Since we were able to detect it, copy over the target mask + prev_target_mask = target_mask + + print() + print("----- STEP VISUAL SERVOING -----") + print("Observed this many target mask points:", np.sum(target_mask.flatten())) + print("failed =", failed_counter, "/", self.max_failed_attempts) + print("cur x =", base_x) + print(" lift =", lift) + print(" arm =", arm) + print("pitch =", wrist_pitch) + print(f"base_x={base_x}, wrist_pitch={wrist_pitch}, dx={dx}, dy={dy}") + print(f"Median distance to object is {median_object_depth}.") + print(f"Center distance to object is {center_depth}.") + print("Center in mask?", center_in_mask) + print("Current XYZ:", current_xyz) + + aligned = ( + np.abs(dx) < self.align_x_threshold and np.abs(dy) < self.align_y_threshold + ) + + # Fix lift to only go down + lift = min(lift, prev_lift) + + if aligned: + # First, check to see if we are close enough to grasp + if center_depth < self.median_distance_when_grasping: + success = self._grasp() + break + # If we are aligned, step the whole thing closer by some amount + # This is based on the pitch - basically + aligned_once = True + arm_component = np.cos(wrist_pitch) * self.lift_arm_ratio + lift_component = np.sin(wrist_pitch) * self.lift_arm_ratio + arm += arm_component + lift += lift_component + else: + # Add these to do some really hacky proportionate control + px = max(0.25, np.abs(2 * dx / target_mask.shape[1])) + py = max(0.25, np.abs(2 * dy / target_mask.shape[0])) + + # Move the base and modify the wrist pitch + # TODO: remove debug code + # print(f"dx={dx}, dy={dy}, px={px}, py={py}") + if dx > self.align_x_threshold: + # Move in x - this means translate the base + base_x += -self.base_x_step * px + elif dx < -1 * self.align_x_threshold: + base_x += self.base_x_step * px + if dy > self.align_y_threshold: + # Move in y - this means translate the base + wrist_pitch += -self.wrist_pitch_step * py + elif dy < -1 * self.align_y_threshold: + wrist_pitch += self.wrist_pitch_step * py + + # Force to reacquire the target mask if we moved the camera too much + prev_target_mask = None + + # safety checks + q = [ + base_x, + 0.0, + 0.0, + lift, + arm, + 0.0, + 0.0, + wrist_pitch, + 0.0, + 0.0, + 0.0, + ] # 11 DOF: see HelloStretchIdx + + q = np.array(q) + + ee_pos, ee_quat = self.robot_model.manip_fk(q) + + while ee_pos[2] < 0.03: + lift += 0.01 + q[HelloStretchIdx.LIFT] = lift + ee_pos, ee_quat = self.robot_model.manip_fk(q) + + print("tgt x =", base_x) + print(" lift =", lift) + print(" arm =", arm) + print("pitch =", wrist_pitch) + + self.robot.arm_to( + [base_x, lift, arm, 0, wrist_pitch, 0], + head=constants.look_at_ee, + blocking=False, + ) + prev_lift = lift + time.sleep(self.expected_network_delay) + + # check not moving + if np.linalg.norm(q - q_last) < 0.05: # TODO: tune + not_moving_count += 1 + else: + not_moving_count = 0 + + q_last = q + + if self.show_servo_gui: + cv2.destroyAllWindows() + return success + + def run(self) -> None: + self.intro("Grasping the object.") + self._success = False + if self.show_object_to_grasp: + self.show_instance(self.agent.current_object) + + assert self.target_object is not None, "Target object must be set before running." 
+ + # Now we should be able to see the object if we orient gripper properly + # Get the end effector pose + obs = self.robot.get_observation() + joint_state = self.robot.get_joint_positions() + model = self.robot.get_robot_model() + + if joint_state[HelloStretchIdx.GRIPPER] < 0.0: + self.robot.open_gripper(blocking=True) + + # Get the current base pose of the robot + xyt = self.robot.get_base_pose() + + # Note that these are in the robot's current coordinate frame; they're not global coordinates, so this is ok to use to compute motions. + object_xyz = self.agent.current_object.get_center() + relative_object_xyz = point_global_to_base(object_xyz, xyt) + + # Compute the angles necessary + if self.use_pitch_from_vertical: + ee_pos, ee_rot = model.manip_fk(joint_state) + dy = np.abs(ee_pos[1] - relative_object_xyz[1]) + dz = np.abs(ee_pos[2] - relative_object_xyz[2]) + pitch_from_vertical = np.arctan2(dy, dz) + else: + pitch_from_vertical = 0.0 + + # Compute final pregrasp joint state goal and send the robot there + joint_state[HelloStretchIdx.WRIST_PITCH] = -np.pi / 2 + pitch_from_vertical + self.robot.arm_to(joint_state, head=constants.look_at_ee, blocking=True) + + if self.servo_to_grasp: + # If we try to servo, then do this + self._success = self.visual_servo_to_object(self.agent.current_object) + + if not self._success: + self.grasp_open_loop(object_xyz) + + # clear observations + if self.reset_observation: + self.observations.clear_history() + self.agent.reset_object_plans() + self.agent.voxel_map.instances.pop_global_instance( + env_id=0, global_instance_id=self.agent.current_object.global_id + ) + + def was_successful(self) -> bool: + """Return true if successful""" + return self._success diff --git a/src/stretch/agent/operations/update.py b/src/stretch/agent/operations/update.py index 9264282fe..ae0a0a491 100644 --- a/src/stretch/agent/operations/update.py +++ b/src/stretch/agent/operations/update.py @@ -10,6 +10,7 @@ # This file contains the UpdateOperation class, which is responsible for updating the world model # and finding objects in the environment. It is a subclass of ManagedOperation. import time +from typing import Optional import numpy as np @@ -21,6 +22,9 @@ class UpdateOperation(ManagedOperation): show_instances_detected: bool = False show_map_so_far: bool = False clear_voxel_map: bool = False + move_head: Optional[bool] = None + target_object: str = "cup" + match: str = "name" def set_target_object_class(self, object_class: str): self.warn(f"Overwriting target object class from {self.object_class} to {object_class}.") @@ -29,6 +33,21 @@ def set_target_object_class(self, object_class: str): def can_start(self): return True + def configure( + self, + move_head=False, + show_instances_detected=False, + show_map_so_far=False, + clear_voxel_map=False, + target_object: str = "cup", + ): + """Configure the operation with the given parameters.""" + self.move_head = move_head + self.show_instances_detected = show_instances_detected + self.show_map_so_far = show_map_so_far + self.clear_voxel_map = clear_voxel_map + self.target_object = target_object + def run(self): self.intro("Updating the world model.") if self.clear_voxel_map: @@ -40,7 +59,7 @@ def run(self): self.robot.arm_to([0.0, 0.4, 0.05, 0, -np.pi / 4, 0], blocking=True) xyt = self.robot.get_base_pose() # Now update the world - self.update() + self.update(move_head=self.move_head) # Delete observations near us, since they contain the arm!! 
self.agent.voxel_map.delete_obstacles(point=xyt[:2], radius=0.7) @@ -79,46 +98,46 @@ def run(self): object_options = [] dist_to_object = float("inf") + # Find the object we want to manipulate + if self.match_method == "class": + instances = self.agent.voxel_map.instances.get_instances_by_class(self.target_object) + scores = np.ones(len(instances)) + elif self.match_method == "name": + scores, instances = self.agent.get_instances_from_text(self.target_object) + # self.agent.voxel_map.show(orig=np.zeros(3), xyt=start, footprint=self.robot_model.get_footprint(), planner_visuals=True) + else: + raise ValueError(f"Unknown match type {self.match_method}") + + if len(instances) == 0: + self.warn(f"Could not find any instances of {self.target_object}.") + return + print("Check explored instances for reachable receptacles:") - for i, instance in enumerate(instances): + for i, (score, instance) in enumerate(zip(scores, instances)): name = self.agent.semantic_sensor.get_class_name_for_id(instance.category_id) print(f" - Found instance {i} with name {name} and global id {instance.global_id}.") if self.show_instances_detected: self.show_instance(instance) - # Find a box - if "box" in name or "tray" in name: - receptacle_options.append(instance) - - # Check to see if we can motion plan to box or not - plan = self.plan_to_instance_for_manipulation(instance, start=start) - if plan.success: - print(f" - Found a reachable box at {instance.get_best_view().get_pose()}.") - self.agent.current_receptacle = instance - else: - self.warn(f" - Found a receptacle but could not reach it.") - elif self.agent.target_object in name: - relations = scene_graph.get_matching_relations(instance.global_id, "floor", "on") - if len(relations) == 0: - # This may or may not be what we want, but it certainly is not on the floor - continue - - object_options.append(instance) - dist = np.linalg.norm( - instance.point_cloud.mean(axis=0).cpu().numpy()[:2] - start[:2] - ) - plan = self.plan_to_instance_for_manipulation(instance, start=start) - if plan.success: - print(f" - Found a reachable object at {instance.get_best_view().get_pose()}.") - if dist < dist_to_object: - print( - f" - This object is closer than the previous one: {dist} < {dist_to_object}." - ) - self.agent.current_object = instance - dist_to_object = dist - else: - self.warn(f" - Found an object of class {name} but could not reach it.") + relations = scene_graph.get_matching_relations(instance.global_id, "floor", "on") + if len(relations) == 0: + # This may or may not be what we want, but it certainly is not on the floor + continue + + object_options.append(instance) + dist = np.linalg.norm(instance.point_cloud.mean(axis=0).cpu().numpy()[:2] - start[:2]) + plan = self.plan_to_instance_for_manipulation(instance, start=start) + if plan.success: + print(f" - Found a reachable object at {instance.get_best_view().get_pose()}.") + if dist < dist_to_object: + print( + f" - This object is closer than the previous one: {dist} < {dist_to_object}." 
+ ) + self.agent.current_object = instance + dist_to_object = dist + else: + self.warn(f" - Found an object of class {name} but could not reach it.") def was_successful(self): """We're just taking an image so this is always going to be a success""" diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 8f721e847..7b6a26f05 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -410,13 +410,21 @@ def show_map(self): instances=self.semantic_sensor is not None, ) - def update(self, visualize_map: bool = False, debug_instances: bool = False): + def update( + self, + visualize_map: bool = False, + debug_instances: bool = False, + move_head: Optional[bool] = None, + ): """Step the data collector. Get a single observation of the world. Remove bad points, such as those from too far or too near the camera. Update the 3d world representation.""" obs = None t0 = timeit.default_timer() steps = 0 - if self._sweep_head_on_update: + print("Move head:", move_head) + move_head = (move_head is None and self._sweep_head_on_update) or move_head is True + print("Move head:", move_head) + if move_head: num_steps = 5 else: num_steps = 1 @@ -430,7 +438,7 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): tilt = -1 * np.pi / 4 for i in range(num_steps): - if self._sweep_head_on_update: + if move_head: pan = -1 * i * np.pi / 4 print(f"[UPDATE] Head sweep {i} at {pan}, {tilt}") self.robot.head_to(pan, tilt, blocking=True) @@ -449,10 +457,10 @@ def update(self, visualize_map: bool = False, debug_instances: bool = False): self.voxel_map.add_obs(obs) t3 = timeit.default_timer() - if not self._sweep_head_on_update: + if not move_head: break - if self._sweep_head_on_update: + if move_head: self.robot.head_to(0, tilt, blocking=True) x, y, theta = self.robot.get_base_pose() self.voxel_map.delete_obstacles( diff --git a/src/stretch/app/grasp_object.py b/src/stretch/app/grasp_object.py index 490534e08..003afc49a 100644 --- a/src/stretch/app/grasp_object.py +++ b/src/stretch/app/grasp_object.py @@ -27,6 +27,7 @@ def get_task(robot, demo, target_object): try: task = Task() update = UpdateOperation("update_scene", demo, retry_on_failure=True) + update.configure(move_head=False, target_object=target_object) grasp_object = GraspObjectOperation( "grasp_the_object", demo, From fd328ee83fbb3fb37afeecfe419a9571adc7da7b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 16:00:34 -0400 Subject: [PATCH 098/410] Update all object information --- src/stretch/agent/operations/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/update.py b/src/stretch/agent/operations/update.py index ae0a0a491..63e950a0f 100644 --- a/src/stretch/agent/operations/update.py +++ b/src/stretch/agent/operations/update.py @@ -24,7 +24,7 @@ class UpdateOperation(ManagedOperation): clear_voxel_map: bool = False move_head: Optional[bool] = None target_object: str = "cup" - match: str = "name" + match_method: str = "name" def set_target_object_class(self, object_class: str): self.warn(f"Overwriting target object class from {self.object_class} to {object_class}.") From f0774db89e44ef5ed512338f334036a994ea0f49 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 16:11:40 -0400 Subject: [PATCH 099/410] update code --- src/stretch/agent/operations/update.py | 5 +++++ src/stretch/agent/robot_agent.py | 6 +++++- src/stretch/perception/encoders/siglip_encoder.py | 2 +- 3 files changed, 11 
insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/operations/update.py b/src/stretch/agent/operations/update.py index 63e950a0f..e18a54f5e 100644 --- a/src/stretch/agent/operations/update.py +++ b/src/stretch/agent/operations/update.py @@ -110,6 +110,11 @@ def run(self): if len(instances) == 0: self.warn(f"Could not find any instances of {self.target_object}.") + import matplotlib.pyplot as plt + + for instance in self.agent.get_instances(): + plt.imshow(instance.get_best_view().get_image()) + plt.show() return print("Check explored instances for reachable receptacles:") diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 7b6a26f05..5ece7df8c 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -250,6 +250,10 @@ def get_instance_from_text( print(f" - Instance {ins} has activation {activation.item()}") return best_activation, best_instance + def get_instances(self) -> List[Instance]: + """Return all instances in the voxel map.""" + return self.voxel_map.get_instances() + def get_instances_from_text( self, text_query: str, @@ -292,9 +296,9 @@ def get_instances_from_text( # TODO: this is hacky - should probably just not support other encoders this way # if hasattr(self.encoder, "classify"): # prob = self.encoder.classify(instance.get_best_view().get_image(), text_query) + # activation = prob # else: activation = self.encoder.compute_score(emb, encoded_text) - # activation = prob # Add the instance to the list of matches if the cosine similarity is above the threshold if activation.item() > threshold: diff --git a/src/stretch/perception/encoders/siglip_encoder.py b/src/stretch/perception/encoders/siglip_encoder.py index 683bda49f..fc8cdb313 100644 --- a/src/stretch/perception/encoders/siglip_encoder.py +++ b/src/stretch/perception/encoders/siglip_encoder.py @@ -27,7 +27,7 @@ class SiglipEncoder(BaseImageTextEncoder): Generally, these features are much better than OpenAI CLIP for open-vocabulary object detection. 
""" - def __init__(self, normalize: bool = True, device: Optional[str] = None, **kwargs) -> None: + def __init__(self, normalize: bool = False, device: Optional[str] = None, **kwargs) -> None: if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" self.device = device From 1c6fc5e51071e9518de8aa8f7bfaa8f961fa9a36 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 16:13:19 -0400 Subject: [PATCH 100/410] update --- src/stretch/agent/operations/update.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stretch/agent/operations/update.py b/src/stretch/agent/operations/update.py index e18a54f5e..2443a396a 100644 --- a/src/stretch/agent/operations/update.py +++ b/src/stretch/agent/operations/update.py @@ -47,6 +47,7 @@ def configure( self.show_map_so_far = show_map_so_far self.clear_voxel_map = clear_voxel_map self.target_object = target_object + print("Target object is set to", self.target_object) def run(self): self.intro("Updating the world model.") From a009d5ead03bc850f79c0b50e1e769c0dc54b825 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 16:27:24 -0400 Subject: [PATCH 101/410] Updating grasping code --- src/stretch/agent/operations/update.py | 6 ++++++ src/stretch/app/grasp_object.py | 21 ++++--------------- .../perception/encoders/siglip_encoder.py | 2 +- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/src/stretch/agent/operations/update.py b/src/stretch/agent/operations/update.py index 2443a396a..c1a43ca38 100644 --- a/src/stretch/agent/operations/update.py +++ b/src/stretch/agent/operations/update.py @@ -47,7 +47,13 @@ def configure( self.show_map_so_far = show_map_so_far self.clear_voxel_map = clear_voxel_map self.target_object = target_object + print("---- CONFIGURING UPDATE OPERATION ----") + print("Move head is set to", self.move_head) + print("Show instances detected is set to", self.show_instances_detected) + print("Show map so far is set to", self.show_map_so_far) + print("Clear voxel map is set to", self.clear_voxel_map) print("Target object is set to", self.target_object) + print("--------------------------------------") def run(self): self.intro("Updating the world model.") diff --git a/src/stretch/app/grasp_object.py b/src/stretch/app/grasp_object.py index 003afc49a..28d6914db 100644 --- a/src/stretch/app/grasp_object.py +++ b/src/stretch/app/grasp_object.py @@ -9,10 +9,7 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. 
-import time - import click -import numpy as np from stretch.agent.operations import GraspObjectOperation, UpdateOperation from stretch.agent.robot_agent import RobotAgent @@ -24,6 +21,7 @@ def get_task(robot, demo, target_object): """Create a very simple task just to test visual servoing to grasp.""" + print("[GRASP OBJECT APP] Target object is set to", target_object) try: task = Task() update = UpdateOperation("update_scene", demo, retry_on_failure=True) @@ -97,20 +95,9 @@ def main( demo = RobotAgent(robot, parameters, semantic_sensor, grasp_client=grasp_client) demo.start(visualize_map_at_start=show_intermediate_maps) - targets = ["cup", "hand_towel", "screwdriver"] - - # for _ in range(repeat_count): - for target_object in targets: - if reset: - robot.move_to_nav_posture() - robot.navigate_to([0.0, 0.0, 0.0], blocking=True, timeout=30.0) - # robot.arm_to([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], blocking=True) - robot.arm_to([0.0, 0.78, 0.05, 0, -3 * np.pi / 8, 0], blocking=True) - time.sleep(3.0) - - task = get_task(robot, demo, target_object) - task.run() - robot.open_gripper() + task = get_task(robot, demo, target_object) + task.run() + robot.open_gripper() if reset: robot.move_to_nav_posture() diff --git a/src/stretch/perception/encoders/siglip_encoder.py b/src/stretch/perception/encoders/siglip_encoder.py index fc8cdb313..683bda49f 100644 --- a/src/stretch/perception/encoders/siglip_encoder.py +++ b/src/stretch/perception/encoders/siglip_encoder.py @@ -27,7 +27,7 @@ class SiglipEncoder(BaseImageTextEncoder): Generally, these features are much better than OpenAI CLIP for open-vocabulary object detection. """ - def __init__(self, normalize: bool = False, device: Optional[str] = None, **kwargs) -> None: + def __init__(self, normalize: bool = True, device: Optional[str] = None, **kwargs) -> None: if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" self.device = device From 04e5d4568966609f7bc20b9e645a32cb56772628 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 16:31:46 -0400 Subject: [PATCH 102/410] Updates to configuration --- src/stretch/agent/operations/update.py | 6 ------ src/stretch/app/grasp_object.py | 8 +++++++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/stretch/agent/operations/update.py b/src/stretch/agent/operations/update.py index c1a43ca38..f873ee12c 100644 --- a/src/stretch/agent/operations/update.py +++ b/src/stretch/agent/operations/update.py @@ -117,12 +117,6 @@ def run(self): if len(instances) == 0: self.warn(f"Could not find any instances of {self.target_object}.") - import matplotlib.pyplot as plt - - for instance in self.agent.get_instances(): - plt.imshow(instance.get_best_view().get_image()) - plt.show() - return print("Check explored instances for reachable receptacles:") for i, (score, instance) in enumerate(zip(scores, instances)): diff --git a/src/stretch/app/grasp_object.py b/src/stretch/app/grasp_object.py index 28d6914db..3ade650c9 100644 --- a/src/stretch/app/grasp_object.py +++ b/src/stretch/app/grasp_object.py @@ -25,7 +25,13 @@ def get_task(robot, demo, target_object): try: task = Task() update = UpdateOperation("update_scene", demo, retry_on_failure=True) - update.configure(move_head=False, target_object=target_object) + update.configure( + move_head=False, + target_object=target_object, + show_map_so_far=False, + clear_voxel_map=True, + show_instances_detected=False, + ) grasp_object = GraspObjectOperation( "grasp_the_object", demo, From 0fe2637d17742da757516f1bf0a2b290d494478d Mon Sep 17 
00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 16:34:33 -0400 Subject: [PATCH 103/410] update --- src/stretch/agent/operations/grasp_object.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 6bb9d8c84..5bb082056 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -363,6 +363,7 @@ def visual_servo_to_object( else: # check not moving threshold if not_moving_count > max_not_moving_count: + self.info("Not moving; try to grasp.") success = self._grasp() break # If we have a target mask, compute the median depth of the object @@ -409,6 +410,7 @@ def visual_servo_to_object( if aligned: # First, check to see if we are close enough to grasp if center_depth < self.median_distance_when_grasping: + self.info("Aligned and close enough to grasp.") success = self._grasp() break # If we are aligned, step the whole thing closer by some amount From 1596ba1b27e5c3a55e6fffa4bd48fe75aae87c9f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 11 Sep 2024 16:37:22 -0400 Subject: [PATCH 104/410] Fix early grasping; was becasue the robot was too far away --- src/stretch/agent/operations/grasp_object.py | 5 ++++- src/stretch/agent/robot_agent.py | 2 -- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 5bb082056..eae6cee2b 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -304,7 +304,7 @@ def visual_servo_to_object( target_mask = np.zeros([servo.ee_rgb.shape[0], servo.ee_rgb.shape[1]], dtype=bool) # Get depth - center_depth = servo.ee_depth[center_y, center_x] / 1000 + center_depth = servo.ee_depth[center_y, center_x] # Compute the center of the mask in image coords mask_center = self.observations.get_latest_centroid() @@ -410,6 +410,9 @@ def visual_servo_to_object( if aligned: # First, check to see if we are close enough to grasp if center_depth < self.median_distance_when_grasping: + print( + f"Center depth of {center_depth} is close enough to grasp; less than {self.median_distance_when_grasping}." + ) self.info("Aligned and close enough to grasp.") success = self._grasp() break diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 5ece7df8c..a122597b7 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -425,9 +425,7 @@ def update( t0 = timeit.default_timer() steps = 0 - print("Move head:", move_head) move_head = (move_head is None and self._sweep_head_on_update) or move_head is True - print("Move head:", move_head) if move_head: num_steps = 5 else: From 6cc9a0e8315cafe8fdb5c128fa145f9691ce70f0 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 12 Sep 2024 00:28:21 -0700 Subject: [PATCH 105/410] Init dinobot proto --- src/stretch/agent/manipulation/dinobot.py | 11 ++- .../agent/manipulation/dinobot_prototype.py | 93 +++++++++++++++++++ 2 files changed, 101 insertions(+), 3 deletions(-) create mode 100644 src/stretch/agent/manipulation/dinobot_prototype.py diff --git a/src/stretch/agent/manipulation/dinobot.py b/src/stretch/agent/manipulation/dinobot.py index 5ecbeeaf1..74459e337 100644 --- a/src/stretch/agent/manipulation/dinobot.py +++ b/src/stretch/agent/manipulation/dinobot.py @@ -8,17 +8,19 @@ # license information maybe found below, if so. 
import glob +import sys import click import matplotlib.pyplot as plt import numpy as np import torch - +sys.path.append('/home/hello-robot/repos/dino-vit-features') # Install this DINO repo to extract correspondences: https://github.com/ShirAmir/dino-vit-features from correspondences import draw_correspondences, find_correspondences from PIL import Image from stretch.agent import HomeRobotZmqClient +import sys # Hyperparameters for DINO correspondences extraction # num_pairs = 8 @@ -61,8 +63,8 @@ def load_demo(path_to_demo_folder): demo_deltas.append(delta) # pull out the first frame in the videos for the bottleneck images - rgb_bn = Image.open("bottleneck_rgb.png") - depth_bn = Image.open("bottleneck_depth.png") + rgb_bn = Image.open("ee_rgb_0.jpg") + depth_bn = Image.open("ee_rgb_1.jpg") return rgb_bn, depth_bn, demo_deltas @@ -233,3 +235,6 @@ def main(robot_ip, local, path_to_demo_folder): # Start the demo dinobot_demo(robot, path_to_demo_folder) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py new file mode 100644 index 000000000..441532e84 --- /dev/null +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -0,0 +1,93 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import cv2 +import sys +sys.path.append('/home/hello-robot/repos/dino-vit-features') +from correspondences import find_correspondences, visualize_correspondences +from extractor import ViTExtractor +import torch +import time +import numpy as np +from stretch.agent import RobotClient + +# image_path1 = "/home/hello-robot/ee_rgb_1.png" #@param +# image_path2 = "/home/hello-robot/ee_rgb_0.png" #@param +# image1 = cv2.imread(image_path1) +# image2 = cv2.imread(image_path2) +# image1 = cv2.cvtColor(image1, cv2.COLOR_BGR2RGB) +# image2 = cv2.cvtColor(image2, cv2.COLOR_BGR2RGB) + +#@markdown Choose number of points to output: +num_pairs = 10 #@param +#@markdown Choose loading size: +load_size = 224 #@param +#@markdown Choose layer of descriptor: +layer = 9 #@param +#@markdown Choose facet of descriptor: +facet = 'key' #@param +#@markdown Choose if to use a binned descriptor: +bin=True #@param +#@markdown Choose fg / bg threshold: +thresh=0.05 #@param +#@markdown Choose model type: +model_type='dino_vits8' #@param +#@markdown Choose stride: +stride=4 #@param + +class Dinobot: + def __init__(self, model_type: str = 'dino_vits8', stride: int = 4): + self.robot = RobotClient(robot_ip="10.0.0.14") + self.bottleneck_image = None + self.device = 'cuda' if torch.cuda.is_available() else 'cpu' + self.feature_extractor = ViTExtractor(model_type = model_type, + stride = stride, + device = self.device) + + def get_correspondences(self, image1, image2, + num_pairs=10, load_size=224, layer=9, + facet='key', bin=True, thresh=0.05): + points1, points2, image1_pil, image2_pil = find_correspondences(self.feature_extractor, + image1, image2, + num_pairs, + load_size, + layer, + facet, + bin, + thresh) + return points1, points2 + + def run(self): + print("Running Dinobot") + while True: + obs = self.robot.get_observation() + servo = self.robot.get_servo_observation() + ee_rgb = cv2.cvtColor(servo.ee_rgb, cv2.COLOR_BGR2RGB) 
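find_correspondences above comes from the external dino-vit-features repository; at its core it matches ViT patch descriptors between the bottleneck image and the live image (the fg/bg threshold and binned-descriptor options above control its filtering). A rough, self-contained illustration of the mutual nearest-neighbour idea behind that kind of matching; this is a sketch only, not the repository's implementation:

import numpy as np

def mutual_nearest_neighbours(desc1: np.ndarray, desc2: np.ndarray):
    # desc1: (N1, D) and desc2: (N2, D) patch descriptors for two images.
    a = desc1 / np.linalg.norm(desc1, axis=1, keepdims=True)
    b = desc2 / np.linalg.norm(desc2, axis=1, keepdims=True)
    sim = a @ b.T                # cosine similarity matrix, shape (N1, N2)
    nn12 = sim.argmax(axis=1)    # best match in image 2 for each patch of image 1
    nn21 = sim.argmax(axis=0)    # best match in image 1 for each patch of image 2
    # Keep only pairs that choose each other.
    return [(i, j) for i, j in enumerate(nn12) if nn21[j] == i]

# Toy usage with random descriptors:
rng = np.random.default_rng(0)
pairs = mutual_nearest_neighbours(rng.normal(size=(50, 384)), rng.normal(size=(60, 384)))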
+ ee_depth = servo.ee_depth + + if not isinstance(self.bottleneck_image,type(None)): + start = time.perf_counter() + with torch.no_grad(): + points1, points2 = self.get_correspondences(self.bottleneck_image, servo.ee_rgb) + if len(points1) == len(points2): + im1, im2 = visualize_correspondences(points1, points2, self.bottleneck_image, servo.ee_rgb) + cv2.imshow("Correspondences", cv2.cvtColor(np.hstack([im1,im2]), cv2.COLOR_BGR2RGB)) + else: + print("No correspondences found") + inf_ts = (time.perf_counter() - start) * 1000 + print(f"\n current: {inf_ts} ms") + if cv2.waitKey(1) & 0xFF == ord("q"): + cv2.destroyAllWindows() + self.robot.stop() + + +if __name__ == "__main__": + dinobot = Dinobot() + dinobot.bottleneck_image = dinobot.robot.get_servo_observation().ee_rgb + dinobot.run() \ No newline at end of file From 88cee15298dfff1a214764a8c5d832e5058c3e37 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 12 Sep 2024 00:47:47 -0700 Subject: [PATCH 106/410] Add mask to dinobot --- .../agent/manipulation/dinobot_prototype.py | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 441532e84..dccf600d5 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -12,10 +12,12 @@ sys.path.append('/home/hello-robot/repos/dino-vit-features') from correspondences import find_correspondences, visualize_correspondences from extractor import ViTExtractor +import matplotlib.pyplot as plt import torch import time import numpy as np from stretch.agent import RobotClient +from stretch.perception.detection.detic import DeticPerception # image_path1 = "/home/hello-robot/ee_rgb_1.png" #@param # image_path2 = "/home/hello-robot/ee_rgb_0.png" #@param @@ -41,6 +43,10 @@ #@markdown Choose stride: stride=4 #@param + +track_object_id = 41 # detic object id for cup + + class Dinobot: def __init__(self, model_type: str = 'dino_vits8', stride: int = 4): self.robot = RobotClient(robot_ip="10.0.0.14") @@ -63,31 +69,46 @@ def get_correspondences(self, image1, image2, thresh) return points1, points2 - def run(self): + def run(self, visualize=False): print("Running Dinobot") while True: obs = self.robot.get_observation() servo = self.robot.get_servo_observation() ee_rgb = cv2.cvtColor(servo.ee_rgb, cv2.COLOR_BGR2RGB) ee_depth = servo.ee_depth - if not isinstance(self.bottleneck_image,type(None)): start = time.perf_counter() with torch.no_grad(): points1, points2 = self.get_correspondences(self.bottleneck_image, servo.ee_rgb) + inf_ts = (time.perf_counter() - start) * 1000 + print(f"\n current: {inf_ts} ms") + if visualize: if len(points1) == len(points2): im1, im2 = visualize_correspondences(points1, points2, self.bottleneck_image, servo.ee_rgb) - cv2.imshow("Correspondences", cv2.cvtColor(np.hstack([im1,im2]), cv2.COLOR_BGR2RGB)) + fig, axes = plt.subplots(1, 2, figsize=(10, 5)) + axes[0].imshow(im1) + axes[0].set_title('Bottleneck Image') + axes[0].axis('off') + axes[1].imshow(im2) + axes[1].set_title('Live Image') + axes[1].axis('off') + plt.show() else: print("No correspondences found") - inf_ts = (time.perf_counter() - start) * 1000 - print(f"\n current: {inf_ts} ms") - if cv2.waitKey(1) & 0xFF == ord("q"): - cv2.destroyAllWindows() - self.robot.stop() + + def update_bottleneck_image(self, image): + self.bottleneck_image = image if __name__ == "__main__": dinobot = Dinobot() - dinobot.bottleneck_image = 
dinobot.robot.get_servo_observation().ee_rgb - dinobot.run() \ No newline at end of file + detic = DeticPerception() + + bottleneck_image = dinobot.robot.get_servo_observation().ee_rgb + semantic, instance, task_observations = detic.predict(bottleneck_image) + if track_object_id in task_observations["instance_classes"]: + object_mask = semantic == track_object_id + bottleneck_image[~object_mask] = [0, 0, 0] + + dinobot.update_bottleneck_image(bottleneck_image) + dinobot.run(visualize=True) \ No newline at end of file From 4b55a762cc026a23efb9ea38d5ed877ba54749f3 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 12 Sep 2024 01:55:35 -0700 Subject: [PATCH 107/410] dinobot transformation compute working --- .../agent/manipulation/dinobot_prototype.py | 188 +++++++++++------- 1 file changed, 115 insertions(+), 73 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index dccf600d5..2dff7748f 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -7,108 +7,150 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. -import cv2 import sys -sys.path.append('/home/hello-robot/repos/dino-vit-features') -from correspondences import find_correspondences, visualize_correspondences -from extractor import ViTExtractor -import matplotlib.pyplot as plt -import torch + +sys.path.append("/home/hello-robot/repos/dino-vit-features") import time + +import matplotlib.pyplot as plt import numpy as np +import torch +from correspondences import find_correspondences, visualize_correspondences +from extractor import ViTExtractor + from stretch.agent import RobotClient +from stretch.agent.manipulation.dinobot import ( + compute_error, + extract_3d_coordinates, + find_transformation, +) from stretch.perception.detection.detic import DeticPerception -# image_path1 = "/home/hello-robot/ee_rgb_1.png" #@param -# image_path2 = "/home/hello-robot/ee_rgb_0.png" #@param -# image1 = cv2.imread(image_path1) -# image2 = cv2.imread(image_path2) -# image1 = cv2.cvtColor(image1, cv2.COLOR_BGR2RGB) -# image2 = cv2.cvtColor(image2, cv2.COLOR_BGR2RGB) - -#@markdown Choose number of points to output: -num_pairs = 10 #@param -#@markdown Choose loading size: -load_size = 224 #@param -#@markdown Choose layer of descriptor: -layer = 9 #@param -#@markdown Choose facet of descriptor: -facet = 'key' #@param -#@markdown Choose if to use a binned descriptor: -bin=True #@param -#@markdown Choose fg / bg threshold: -thresh=0.05 #@param -#@markdown Choose model type: -model_type='dino_vits8' #@param -#@markdown Choose stride: -stride=4 #@param - - -track_object_id = 41 # detic object id for cup - class Dinobot: - def __init__(self, model_type: str = 'dino_vits8', stride: int = 4): - self.robot = RobotClient(robot_ip="10.0.0.14") - self.bottleneck_image = None - self.device = 'cuda' if torch.cuda.is_available() else 'cpu' - self.feature_extractor = ViTExtractor(model_type = model_type, - stride = stride, - device = self.device) - - def get_correspondences(self, image1, image2, - num_pairs=10, load_size=224, layer=9, - facet='key', bin=True, thresh=0.05): - points1, points2, image1_pil, image2_pil = find_correspondences(self.feature_extractor, - image1, image2, - num_pairs, - load_size, - layer, - facet, - bin, - thresh) + def __init__(self, model_type: str = "dino_vits8", stride: int = 4): + 
self.bottleneck_image_rgb = None + self.bottleneck_image_depth = None + self.bottleneck_image_camera_K = None + + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.feature_extractor = ViTExtractor( + model_type=model_type, stride=stride, device=self.device + ) + + def get_correspondences( + self, + image1, + image2, + num_pairs=10, + load_size=224, + layer=9, + facet="key", + bin=True, + thresh=0.05, + ): + points1, points2, image1_pil, image2_pil = find_correspondences( + self.feature_extractor, image1, image2, num_pairs, load_size, layer, facet, bin, thresh + ) return points1, points2 - def run(self, visualize=False): + def run(self, robot: RobotClient, visualize=False): print("Running Dinobot") while True: - obs = self.robot.get_observation() - servo = self.robot.get_servo_observation() - ee_rgb = cv2.cvtColor(servo.ee_rgb, cv2.COLOR_BGR2RGB) + servo = robot.get_servo_observation() ee_depth = servo.ee_depth - if not isinstance(self.bottleneck_image,type(None)): + if not isinstance(self.bottleneck_image_rgb, type(None)): start = time.perf_counter() with torch.no_grad(): - points1, points2 = self.get_correspondences(self.bottleneck_image, servo.ee_rgb) + points1, points2 = self.get_correspondences( + self.bottleneck_image_rgb, servo.ee_rgb + ) + self.move_to_bottleneck(None, points1, points2, ee_depth) inf_ts = (time.perf_counter() - start) * 1000 print(f"\n current: {inf_ts} ms") if visualize: if len(points1) == len(points2): - im1, im2 = visualize_correspondences(points1, points2, self.bottleneck_image, servo.ee_rgb) + im1, im2 = visualize_correspondences( + points1, points2, self.bottleneck_image_rgb, servo.ee_rgb + ) fig, axes = plt.subplots(1, 2, figsize=(10, 5)) axes[0].imshow(im1) - axes[0].set_title('Bottleneck Image') - axes[0].axis('off') + axes[0].set_title("Bottleneck Image") + axes[0].axis("off") axes[1].imshow(im2) - axes[1].set_title('Live Image') - axes[1].axis('off') + axes[1].set_title("Live Image") + axes[1].axis("off") plt.show() else: print("No correspondences found") - - def update_bottleneck_image(self, image): - self.bottleneck_image = image - + else: + print("No bottleneck image found") + + def update_bottleneck_image( + self, image: np.ndarray, depth: np.ndarray, camera_K: np.ndarray, mask: np.ndarray + ): + image[~mask] = [0, 0, 0] + self.bottleneck_image_rgb = image.copy() + self.bottleneck_image_depth = depth.copy() + self.bottleneck_image_camera_K = camera_K.copy() + + def move_to_bottleneck(self, robot: RobotClient, bottleneck_points, live_points, live_depth): + # Given the pixel coordinates of the correspondences, and their depth values, + # project the points to 3D space. + bottleneck_xyz = depth_to_xyz(self.bottleneck_image_depth, self.bottleneck_image_camera_K) + points1 = extract_3d_coordinates(bottleneck_points, bottleneck_xyz) + live_xyz = depth_to_xyz(live_depth, self.bottleneck_image_camera_K) + points2 = extract_3d_coordinates(live_points, live_xyz) + # Find rigid translation and rotation that aligns the points by minimising error, using SVD. 
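For reference, a minimal sketch of the SVD-based rigid alignment the comment above refers to (the standard Kabsch / orthogonal-Procrustes recipe; the actual find_transformation imported from dinobot.py may differ). It assumes two matched (N, 3) point arrays and returns R, t mapping the first onto the second:

import numpy as np

def rigid_transform_sketch(X: np.ndarray, Y: np.ndarray):
    # Minimise || (R @ X.T).T + t - Y || over rotations R and translations t.
    cX, cY = X.mean(axis=0), Y.mean(axis=0)
    H = (X - cX).T @ (Y - cY)
    U, _, Vt = np.linalg.svd(H)
    R = Vt.T @ U.T
    if np.linalg.det(R) < 0:     # guard against reflections
        Vt[-1, :] *= -1
        R = Vt.T @ U.T
    t = cY - R @ cX
    return R, t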
+ R, t = find_transformation(points1, points2) + print(f"Robot needs to R: {R}, T: {t}") + # TODO: Move robot + error = compute_error(points1, points2) + return error + + +def depth_to_xyz(depth_image, camera_matrix): + # Convert depth image to point cloud + h, w = depth_image.shape + i, j = np.indices((h, w)) + z = depth_image + x = (j - camera_matrix[0, 2]) * z / camera_matrix[0, 0] + y = (i - camera_matrix[1, 2]) * z / camera_matrix[1, 1] + xyz = np.stack((x, y, z), axis=-1) + return xyz + + +def depth_to_xyz(depth, camera_K): + """get depth from numpy using simple pinhole camera model""" + h, w = depth.shape + indices = np.indices((h, w), dtype=np.float32).transpose(1, 2, 0) + z = depth + px, py = camera_K[0, 2], camera_K[1, 2] + fx, fy = camera_K[0, 0], camera_K[1, 1] + # pixel indices start at top-left corner. for these equations, it starts at bottom-left + x = (indices[:, :, 1] - px) * (z / fx) + y = (indices[:, :, 0] - py) * (z / fy) + # Should now be height x width x 3, after this: + xyz = np.stack([x, y, z], axis=-1) + return xyz + if __name__ == "__main__": + robot = RobotClient(robot_ip="10.0.0.14") dinobot = Dinobot() detic = DeticPerception() + track_object_id = 41 # detic object id for cup - bottleneck_image = dinobot.robot.get_servo_observation().ee_rgb - semantic, instance, task_observations = detic.predict(bottleneck_image) + # First frame is the bottleneck image + bottleneck_image_rgb = robot.get_servo_observation().ee_rgb + bottleneck_image_depth = robot.get_servo_observation().ee_depth + bottleneck_image_camera_K = robot.get_servo_observation().ee_camera_K + semantic, instance, task_observations = detic.predict(bottleneck_image_rgb) if track_object_id in task_observations["instance_classes"]: object_mask = semantic == track_object_id - bottleneck_image[~object_mask] = [0, 0, 0] - - dinobot.update_bottleneck_image(bottleneck_image) - dinobot.run(visualize=True) \ No newline at end of file + dinobot.update_bottleneck_image( + bottleneck_image_rgb, bottleneck_image_depth, bottleneck_image_camera_K, object_mask + ) + dinobot.run(robot, visualize=True) + else: + print(f"Object ID: {track_object_id} not found in the image") From aa571e300b8ec2d6964cd603523c97ad84319244 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 12 Sep 2024 02:29:42 -0700 Subject: [PATCH 108/410] Demo instance class --- .../agent/manipulation/dinobot_prototype.py | 133 ++++++++++++------ 1 file changed, 90 insertions(+), 43 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 2dff7748f..de5619d2e 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -11,6 +11,7 @@ sys.path.append("/home/hello-robot/repos/dino-vit-features") import time +from typing import List, Tuple import matplotlib.pyplot as plt import numpy as np @@ -27,12 +28,49 @@ from stretch.perception.detection.detic import DeticPerception +class Demo: + """ + A demonstration from bottleneck pose + """ + + def __init__( + self, image: np.ndarray, depth: np.ndarray, camera_K: np.ndarray, mask: np.ndarray + ): + """ + Initialize the Demo class + Args: + image (np.ndarray): The bottleneck image + depth (np.ndarray): The bottleneck depth image + camera_K (np.ndarray): The camera intrinsics + mask (np.ndarray): The object mask + """ + image[~mask] = [0, 0, 0] + self.bottleneck_image_rgb = image.copy() + self.bottleneck_image_depth = depth.copy() + self.bottleneck_image_camera_K = 
camera_K.copy() + self.object_mask = mask.copy() + self.trajectories = {} + + @staticmethod + def load_demo(self, path_to_demo_folder=None): + # Load a demonstration from a folder containing data + # TODO + raise NotImplementedError + + class Dinobot: - def __init__(self, model_type: str = "dino_vits8", stride: int = 4): - self.bottleneck_image_rgb = None - self.bottleneck_image_depth = None - self.bottleneck_image_camera_K = None + """ + Dinobot is a class that uses DINO correspondences to move the robot to the bottleneck pose. + And replay a demonstration. + """ + def __init__(self, model_type: str = "dino_vits8", stride: int = 4): + """ + Initialize the Dinobot class + Args: + model_type (str): The model type to use for feature extraction + stride (int): The stride to use for feature extraction + """ self.device = "cuda" if torch.cuda.is_available() else "cpu" self.feature_extractor = ViTExtractor( model_type=model_type, stride=stride, device=self.device @@ -40,38 +78,48 @@ def __init__(self, model_type: str = "dino_vits8", stride: int = 4): def get_correspondences( self, - image1, - image2, - num_pairs=10, - load_size=224, - layer=9, - facet="key", - bin=True, - thresh=0.05, - ): + image1: np.ndarray, + image2: np.ndarray, + num_pairs: int = 10, + load_size: int = 224, + layer: int = 9, + facet: str = "key", + bin: bool = True, + thresh: float = 0.05, + ) -> Tuple[List, List]: + """ + Get correspondences key points between two images + """ points1, points2, image1_pil, image2_pil = find_correspondences( self.feature_extractor, image1, image2, num_pairs, load_size, layer, facet, bin, thresh ) return points1, points2 - def run(self, robot: RobotClient, visualize=False): + def run(self, robot: RobotClient, demo: Demo, visualize: bool = False): + """ + Run the Dinobot algorithm + Args: + robot (RobotClient): The robot client to use + demo (Demo): The demonstration to replay + visualize (bool): Whether to visualize the correspondences + """ print("Running Dinobot") while True: servo = robot.get_servo_observation() ee_depth = servo.ee_depth - if not isinstance(self.bottleneck_image_rgb, type(None)): + if not isinstance(demo.bottleneck_image_rgb, type(None)): start = time.perf_counter() with torch.no_grad(): points1, points2 = self.get_correspondences( - self.bottleneck_image_rgb, servo.ee_rgb + demo.bottleneck_image_rgb, servo.ee_rgb ) - self.move_to_bottleneck(None, points1, points2, ee_depth) + self.move_to_bottleneck(None, points1, points2, ee_depth, demo) inf_ts = (time.perf_counter() - start) * 1000 print(f"\n current: {inf_ts} ms") if visualize: if len(points1) == len(points2): im1, im2 = visualize_correspondences( - points1, points2, self.bottleneck_image_rgb, servo.ee_rgb + points1, points2, demo.bottleneck_image_rgb, servo.ee_rgb ) fig, axes = plt.subplots(1, 2, figsize=(10, 5)) axes[0].imshow(im1) @@ -86,20 +134,30 @@ def run(self, robot: RobotClient, visualize=False): else: print("No bottleneck image found") - def update_bottleneck_image( - self, image: np.ndarray, depth: np.ndarray, camera_K: np.ndarray, mask: np.ndarray - ): - image[~mask] = [0, 0, 0] - self.bottleneck_image_rgb = image.copy() - self.bottleneck_image_depth = depth.copy() - self.bottleneck_image_camera_K = camera_K.copy() - - def move_to_bottleneck(self, robot: RobotClient, bottleneck_points, live_points, live_depth): + def move_to_bottleneck( + self, + robot: RobotClient, + bottleneck_points: List, + live_points: List, + live_depth: np.ndarray, + demo: Demo, + ) -> float: + """ + Move the robot to the bottleneck 
pose + Args: + robot (RobotClient): The robot client to use + bottleneck_points (List): The bottleneck points + live_points (List): The live points + live_depth (np.ndarray): The depth image + demo (Demo): The demonstration to replay + Returns: + float: The error between the bottleneck and live points + """ # Given the pixel coordinates of the correspondences, and their depth values, # project the points to 3D space. - bottleneck_xyz = depth_to_xyz(self.bottleneck_image_depth, self.bottleneck_image_camera_K) + bottleneck_xyz = depth_to_xyz(demo.bottleneck_image_depth, demo.bottleneck_image_camera_K) points1 = extract_3d_coordinates(bottleneck_points, bottleneck_xyz) - live_xyz = depth_to_xyz(live_depth, self.bottleneck_image_camera_K) + live_xyz = depth_to_xyz(live_depth, demo.bottleneck_image_camera_K) points2 = extract_3d_coordinates(live_points, live_xyz) # Find rigid translation and rotation that aligns the points by minimising error, using SVD. R, t = find_transformation(points1, points2) @@ -109,18 +167,7 @@ def move_to_bottleneck(self, robot: RobotClient, bottleneck_points, live_points, return error -def depth_to_xyz(depth_image, camera_matrix): - # Convert depth image to point cloud - h, w = depth_image.shape - i, j = np.indices((h, w)) - z = depth_image - x = (j - camera_matrix[0, 2]) * z / camera_matrix[0, 0] - y = (i - camera_matrix[1, 2]) * z / camera_matrix[1, 1] - xyz = np.stack((x, y, z), axis=-1) - return xyz - - -def depth_to_xyz(depth, camera_K): +def depth_to_xyz(depth: np.ndarray, camera_K: np.ndarray) -> np.ndarray: """get depth from numpy using simple pinhole camera model""" h, w = depth.shape indices = np.indices((h, w), dtype=np.float32).transpose(1, 2, 0) @@ -148,9 +195,9 @@ def depth_to_xyz(depth, camera_K): semantic, instance, task_observations = detic.predict(bottleneck_image_rgb) if track_object_id in task_observations["instance_classes"]: object_mask = semantic == track_object_id - dinobot.update_bottleneck_image( + demo = Demo( bottleneck_image_rgb, bottleneck_image_depth, bottleneck_image_camera_K, object_mask ) - dinobot.run(robot, visualize=True) + dinobot.run(robot, demo, visualize=True) else: print(f"Object ID: {track_object_id} not found in the image") From 45b90a204bbafc53f8e1f13f77e6dfc8cd36d6da Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 11:55:29 -0400 Subject: [PATCH 109/410] Tweaking setup --- src/stretch/app/pickup_all.py | 15 +-------------- src/stretch/config/default_planner.yaml | 2 +- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index 01f1ff5f7..82e777c88 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -12,7 +12,6 @@ import click from stretch.agent.robot_agent import RobotAgent -from stretch.agent.task.pickup import PickupTask from stretch.agent.zmq_client import HomeRobotZmqClient from stretch.core import get_parameters from stretch.perception import create_semantic_sensor @@ -79,23 +78,11 @@ def main( # Agents wrap the robot high level planning interface for now agent = RobotAgent(robot, parameters, semantic_sensor, grasp_client=grasp_client) agent.start(visualize_map_at_start=show_intermediate_maps) - if reset: - agent.move_closed_loop([0, 0, 0], max_time=60.0) - - # After the robot has started... 
- try: - pickup_task = PickupTask(agent, target_object=target_object, destination=destination) - task = pickup_task.get_task(add_rotate=force_rotate, mode=mode) - except Exception as e: - print(f"Error creating task: {e}") - robot.stop() - raise e - - task.run() if reset: # Send the robot home at the end! agent.go_home() + agent.move_closed_loop([0, 0, 0], max_time=60.0) # At the end, disable everything robot.stop() diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index baee56895..a48593292 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -115,7 +115,7 @@ motion_planner: # Distance away you search for frontier points min_dist: 0.1 # Subsampling frontier space at this discretization - step_dist: 0.1 + step_dist: 0.2 goals: manipulation_radius: 0.55 From edbb4fe1fdb388416622a6d3ec47fed4059ddf40 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 12:12:42 -0400 Subject: [PATCH 110/410] update agent --- src/stretch/agent/robot_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index a122597b7..71e937389 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1036,7 +1036,6 @@ def choose_best_goal_instance(self, goal: str, debug: bool = False) -> Instance: def plan_to_frontier( self, start: np.ndarray, - rate: int = 10, manual_wait: bool = False, random_goals: bool = False, try_to_plan_iter: int = 10, From a79a0a88f507e0aa45b2b719b5902847f5d8b013 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 12:21:26 -0400 Subject: [PATCH 111/410] try to plan iter --- src/stretch/agent/robot_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 71e937389..fad07c3f1 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1048,6 +1048,7 @@ def plan_to_frontier( if not start_is_valid: if verbose: print("Start not valid. 
back up a bit.") + self.robot.navigate_to([-0.1, 0, 0], relative=True) return PlanResult(False, reason="invalid start state") # sample a goal @@ -1079,7 +1080,6 @@ def plan_to_frontier( def run_exploration( self, - rate: int = 10, manual_wait: bool = False, explore_iter: int = 3, try_to_plan_iter: int = 10, @@ -1118,7 +1118,7 @@ def run_exploration( print(" Start:", start) self.print_found_classes(task_goal) res = self.plan_to_frontier( - start=start, rate=rate, random_goals=random_goals, try_to_plan_iter=try_to_plan_iter + start=start, random_goals=random_goals, try_to_plan_iter=try_to_plan_iter ) # if it succeeds, execute a trajectory to this position From 23367e6313c369fc5a3fba0026fc6682a27387a4 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 12:22:47 -0400 Subject: [PATCH 112/410] finish removing "rate" since its not necessary --- src/stretch/app/mapping.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/stretch/app/mapping.py b/src/stretch/app/mapping.py index 54df9bb93..20b25912d 100644 --- a/src/stretch/app/mapping.py +++ b/src/stretch/app/mapping.py @@ -26,7 +26,6 @@ @click.command() @click.option("--local", is_flag=True, help="Run code locally on the robot.") @click.option("--robot_ip", default="") -@click.option("--rate", default=5, type=int) @click.option("--visualize", default=False, is_flag=True) @click.option("--manual-wait", default=False, is_flag=True) @click.option("--output-filename", default="stretch_output", type=str) @@ -47,7 +46,6 @@ @click.option("--parameter-file", default="default_planner.yaml") @click.option("--reset", is_flag=True, help="Reset the robot to origin before starting") def main( - rate, visualize, manual_wait, output_filename, @@ -82,7 +80,6 @@ def main( demo_main( robot, parameters=parameters, - rate=rate, visualize=visualize, manual_wait=manual_wait, output_filename=output_filename, @@ -105,7 +102,6 @@ def main( def demo_main( robot: AbstractRobotClient, - rate, visualize, manual_wait, output_filename, @@ -189,7 +185,6 @@ def demo_main( if object_to_find is not None: print(f"Exploring for {object_to_find}...") demo.run_exploration( - rate, manual_wait, explore_iter=parameters["exploration_steps"], task_goal=object_to_find, From 0206e7b3ecdf280fc9070aab8febde67daabf7d6 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 12:23:56 -0400 Subject: [PATCH 113/410] update information --- src/stretch/agent/zmq_client.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 82c4e1875..18171b93a 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -695,9 +695,6 @@ def _wait_for_action( t0 = timeit.default_timer() close_to_goal = False - # TODO: this is useful for debugging - # verbose = True - while True: # Minor delay at the end - give it time to get new messages From d266bf7943dcaa006168a6990cec36f3be92d2b6 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 12:40:54 -0400 Subject: [PATCH 114/410] update --- src/stretch/agent/robot_agent.py | 54 +++++++++++++++++++++++++++----- src/stretch/app/read_map.py | 9 ++++-- 2 files changed, 52 insertions(+), 11 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index fad07c3f1..2f4111a0e 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -33,7 +33,7 @@ from stretch.motion.algo import RRTConnect, Shortcut, SimplifyXYT from stretch.perception.encoders import BaseImageTextEncoder, 
get_encoder from stretch.perception.wrapper import OvmmPerception -from stretch.utils.geometry import angle_difference +from stretch.utils.geometry import angle_difference, xyt_base_to_global from stretch.utils.point_cloud import ransac_transform @@ -722,6 +722,44 @@ def move_to_instance(self, instance: Instance, max_try_per_instance=10) -> bool: return True return False + def recover_from_invalid_start(self) -> bool: + """Try to recover from an invalid start state. + + Returns: + bool: whether the robot recovered from the invalid start state + """ + + # Get current invalid pose + start = self.robot.get_base_pose() + + # Apply relative transformation to XYT + forward = np.array([-0.1, 0, 0]) + backward = np.array([0.1, 0, 0]) + + xyt_goal_forward = xyt_base_to_global(forward, start) + xyt_goal_backward = xyt_base_to_global(backward, start) + + # Is this valid? + if self.space.is_valid(xyt_goal_backward, verbose=True): + logger.warning("Trying to move backwards...") + # Compute the position forward or backward from the robot + self.robot.navigate_to(xyt_goal_backward, relative=False) + elif self.space.is_valid(xyt_goal_forward, verbose=True): + logger.warning("Trying to move forward...") + # Compute the position forward or backward from the robot + self.robot.navigate_to(xyt_goal_forward, relative=False) + else: + logger.warning("Could not recover from invalid start state!") + return False + + # Get the current position in case we are still invalid + start = self.robot.get_base_pose() + start_is_valid = self.space.is_valid(start, verbose=True) + if not start_is_valid: + logger.warning("Tried and failed to recover from invalid start state!") + return False + return start_is_valid + def move_to_any_instance( self, matches: List[Tuple[int, Instance]], max_try_per_instance=10, verbose: bool = False ) -> bool: @@ -733,12 +771,11 @@ def move_to_any_instance( start_is_valid_retries = 5 while not start_is_valid and start_is_valid_retries > 0: if verbose: - print(f"Start {start} is not valid. back up a bit.") - self.robot.navigate_to([-0.1, 0, 0], relative=True) - # Get the current position in case we are still invalid - start = self.robot.get_base_pose() - start_is_valid = self.space.is_valid(start, verbose=True) + print(f"Start {start} is not valid. Either back up a bit or go forward.") + ok = self.recover_from_invalid_start() start_is_valid_retries -= 1 + start_is_valid = ok + res = None # Just terminate here - motion planning issues apparently! @@ -1048,8 +1085,9 @@ def plan_to_frontier( if not start_is_valid: if verbose: print("Start not valid. back up a bit.") - self.robot.navigate_to([-0.1, 0, 0], relative=True) - return PlanResult(False, reason="invalid start state") + ok = self.recover_from_invalid_start() + if not ok: + return PlanResult(False, reason="invalid start state") # sample a goal if random_goals: diff --git a/src/stretch/app/read_map.py b/src/stretch/app/read_map.py index bd6ce5608..8ec71c254 100644 --- a/src/stretch/app/read_map.py +++ b/src/stretch/app/read_map.py @@ -119,8 +119,8 @@ def plan_to_deltas(xyt0, plan): "--start", "-x", type=str, - default="0,0,0", - help="start pose for planning as a tuple X,Y,Theta in meters and radians", + default="", + help="start pose for planning as a tuple X,Y,Theta in meters and radians. 
Empty will parse from file.", ) @click.option( "--test-remove", @@ -200,7 +200,10 @@ def main( resolution=voxel_size, use_instance_memory=(run_segmentation or show_instances) ) - x0 = np.array([float(x) for x in start.split(",")]) + if len(start) > 0: + x0 = np.array([float(x) for x in start.split(",")]) + else: + x0 = voxel_map.observations[-1].base_pose assert len(x0) == 3, "start pose must be 3 values: x, y, theta" start_xyz = [x0[0], x0[1], 0] From c57e47da0a41185327d4004d2270e4d5ce249f47 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 12:42:41 -0400 Subject: [PATCH 115/410] update information --- src/stretch/config/default_planner.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index a48593292..8ecf829a0 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -110,7 +110,7 @@ motion_planner: # Parameters for frontier exploration using the motion planner. frontier: dilate_frontier_size: 4 # Used to shrink the frontier back from the edges of the world - dilate_obstacle_size: 5 # Used when selecting goals and computing what the "frontier" is + dilate_obstacle_size: 4 # Used when selecting goals and computing what the "frontier" is default_expand_frontier_size: 10 # margin along the frontier where final robot position can be # Distance away you search for frontier points min_dist: 0.1 From fc7c2eab1a8d8c280b56af32972a7ca5e60cb0ef Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 12:44:13 -0400 Subject: [PATCH 116/410] update default planner paarams for testing --- src/stretch/config/default_planner.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 8ecf829a0..57cb24c77 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -16,7 +16,7 @@ obs_min_density: 50 # This many points makes it an obstacle min_points_per_voxel: 50 # Drop things below this density per voxel # Padding -pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles +pad_obstacles: 3 # Add this many units (voxel_size) to the area around obstacles # pad_obstacles: 1 # Add this many units (voxel_size) to the area around obstacles min_pad_obstacles: 0 # Do not pad LESS than this amount, for safety. 
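These padding parameters are counted in grid cells ("units (voxel_size)"), not meters, so their effect scales with the map resolution. A quick conversion sketch; the 0.1 m voxel size is an assumption for illustration, so substitute the voxel_size actually configured for the planner:

def obstacle_padding_m(pad_obstacles: int, min_pad_obstacles: int, voxel_size: float = 0.1) -> float:
    # Effective obstacle inflation in meters, never less than the safety minimum.
    return max(pad_obstacles, min_pad_obstacles) * voxel_size

print(obstacle_padding_m(3, 0))  # about 0.3 m of inflation around obstacles with 10 cm voxels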
@@ -111,7 +111,7 @@ motion_planner: frontier: dilate_frontier_size: 4 # Used to shrink the frontier back from the edges of the world dilate_obstacle_size: 4 # Used when selecting goals and computing what the "frontier" is - default_expand_frontier_size: 10 # margin along the frontier where final robot position can be + default_expand_frontier_size: 12 # margin along the frontier where final robot position can be # Distance away you search for frontier points min_dist: 0.1 # Subsampling frontier space at this discretization From 3fe0f350dc398a1e11c602298463b75db711fbd0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 13:21:33 -0400 Subject: [PATCH 117/410] various motion planning changes to try to improve caution in different envronments --- src/stretch/agent/robot_agent.py | 24 +++++++++++++++++++++++- src/stretch/app/read_map.py | 2 +- src/stretch/config/default_planner.yaml | 6 +++--- src/stretch/mapping/voxel/voxel_map.py | 23 +++++++++++++++++++++-- src/stretch/motion/algo/rrt_connect.py | 11 ++++++++++- src/stretch/motion/algo/simplify.py | 4 ++++ 6 files changed, 62 insertions(+), 8 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 2f4111a0e..1b1ee7d42 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -341,7 +341,7 @@ def grasp_object(self, object_goal: Optional[str] = None, **kwargs) -> bool: return self.grasp_client.try_grasping(object_goal=object_goal, **kwargs) def rotate_in_place( - self, steps: Optional[int] = -1, visualize: bool = False, verbose: bool = True + self, steps: Optional[int] = -1, visualize: bool = False, verbose: bool = False ) -> bool: """Simple helper function to make the robot rotate in place. Do a 360 degree turn to get some observations (this helps debug the robot and create a nice map). @@ -1088,6 +1088,8 @@ def plan_to_frontier( ok = self.recover_from_invalid_start() if not ok: return PlanResult(False, reason="invalid start state") + else: + print("Planning to frontier...") # sample a goal if random_goals: @@ -1106,16 +1108,36 @@ def plan_to_frontier( # No more positions to sample break + if self.space.is_valid(goal.cpu().numpy(), verbose=True): + if verbose: + print(" Start:", start) + print("Sampled Goal:", goal.cpu().numpy()) + else: + continue + # For debugging, we can set the random seed to 0 if fix_random_seed: np.random.seed(0) random.seed(0) + self.planner.space.push_locations_to_stack(self.get_history(reversed=True)) res = self.planner.plan(start, goal.cpu().numpy()) if res.success: return res + else: + if verbose: + print("Plan failed. 
Reason:", res.reason) return PlanResult(False, reason="no valid plans found") + def get_history(self, reversed: bool = False) -> List[np.ndarray]: + """Get the history of the robot's positions.""" + history = [] + for obs in self.voxel_map.observations: + history.append(obs.base_pose) + if reversed: + history.reverse() + return history + def run_exploration( self, manual_wait: bool = False, diff --git a/src/stretch/app/read_map.py b/src/stretch/app/read_map.py index 8ec71c254..e586b7a85 100644 --- a/src/stretch/app/read_map.py +++ b/src/stretch/app/read_map.py @@ -203,7 +203,7 @@ def main( if len(start) > 0: x0 = np.array([float(x) for x in start.split(",")]) else: - x0 = voxel_map.observations[-1].base_pose + x0 = voxel_map.observations[-1].base_pose.numpy() assert len(x0) == 3, "start pose must be 3 values: x, y, theta" start_xyz = [x0[0], x0[1], 0] diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 57cb24c77..34cb0a0a3 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -109,9 +109,9 @@ motion_planner: shortcut_iter: 100 # Parameters for frontier exploration using the motion planner. frontier: - dilate_frontier_size: 4 # Used to shrink the frontier back from the edges of the world - dilate_obstacle_size: 4 # Used when selecting goals and computing what the "frontier" is - default_expand_frontier_size: 12 # margin along the frontier where final robot position can be + dilate_frontier_size: 5 # Used to shrink the frontier back from the edges of the world + dilate_obstacle_size: 6 # Used when selecting goals and computing what the "frontier" is + default_expand_frontier_size: 10 # margin along the frontier where final robot position can be # Distance away you search for frontier points min_dist: 0.1 # Subsampling frontier space at this discretization diff --git a/src/stretch/mapping/voxel/voxel_map.py b/src/stretch/mapping/voxel/voxel_map.py index d13e7c94b..b23f4c40a 100644 --- a/src/stretch/mapping/voxel/voxel_map.py +++ b/src/stretch/mapping/voxel/voxel_map.py @@ -12,7 +12,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import math -from typing import Optional, Tuple +from collections import deque +from typing import Optional, Tuple, Union import matplotlib.pyplot as plt import numpy as np @@ -64,6 +65,9 @@ def __init__( grid = self.voxel_map.grid self.grid = grid + # Create a stack for storing states to sample + self._stack = deque() + # Always use 3d states self.use_orientation = use_orientation if self.use_orientation: @@ -808,14 +812,29 @@ def sample_valid_location(self, max_tries: int = 100) -> Optional[torch.Tensor]: else: yield None + def push_locations_to_stack(self, locations: list[Union[np.ndarray, torch.Tensor]]): + """Push locations to stack for sampling. + + Args: + locations(list): list of locations to push to stack + """ + for loc in locations: + if isinstance(loc, torch.Tensor): + loc = loc.cpu().numpy() + self._stack.append(loc) + def sample(self) -> np.ndarray: """Sample any position that corresponds to an "explored" location. Goals are valid if they are within a reasonable distance of explored locations. Paths through free space are ok and don't collide. Since our motion planners currently use numpy, we'll stick with that for the return type for now. 
""" + if len(self._stack) > 0: + state = self._stack.pop() + return state + # Sample any point which is explored and not an obstacle - # Sampled points are convertd to CPU for now + # Sampled points are converted to CPU for now point = self.voxel_map.sample_explored() # Create holder diff --git a/src/stretch/motion/algo/rrt_connect.py b/src/stretch/motion/algo/rrt_connect.py index 57646a99f..28d4382fb 100644 --- a/src/stretch/motion/algo/rrt_connect.py +++ b/src/stretch/motion/algo/rrt_connect.py @@ -77,6 +77,11 @@ def plan(self, start, goal, verbose: bool = False, **kwargs) -> PlanResult: nodes0, nodes1 = self.nodes_fwd, self.nodes_rev # Sample a random point and try to connect both trees next_state = self.space.sample() + + # is it valid + if not self.validate(next_state): + continue + # If they both connect, you won! res0, closest_node = self.step_planner(nodes=nodes0, next_state=next_state) res1, final_node = self.step_planner(nodes=nodes1, next_state=closest_node.state) @@ -100,4 +105,8 @@ def plan(self, start, goal, verbose: bool = False, **kwargs) -> PlanResult: path_fwd.append(new_node) parent = new_node return PlanResult(True, path_fwd, planner=self) - return PlanResult(False, planner=self) + return PlanResult( + False, + planner=self, + reason=f"max_iter reached with nodes fwd = {len(self.nodes_fwd)} and nodes rev = {len(self.nodes_rev)}", + ) diff --git a/src/stretch/motion/algo/simplify.py b/src/stretch/motion/algo/simplify.py index 273f9b7eb..9eee76fa6 100644 --- a/src/stretch/motion/algo/simplify.py +++ b/src/stretch/motion/algo/simplify.py @@ -50,6 +50,10 @@ def __init__( def reset(self): self.nodes = None + @property + def space(self): + return self.planner.space + def _verify(self, new_nodes): """Check to see if new nodes are spaced enough apart and nothing is within min_dist""" prev_node = None From 09271f4a186e5532223c17565d4fc93bfe8ab65e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 13:30:02 -0400 Subject: [PATCH 118/410] update robot motions --- src/stretch/agent/robot_agent.py | 2 +- src/stretch/config/default_planner.yaml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 1b1ee7d42..54c245423 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1077,7 +1077,7 @@ def plan_to_frontier( random_goals: bool = False, try_to_plan_iter: int = 10, fix_random_seed: bool = False, - verbose: bool = False, + verbose: bool = True, ) -> PlanResult: """Motion plan to a frontier location.""" start_is_valid = self.space.is_valid(start, verbose=True) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 34cb0a0a3..630f53894 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -16,8 +16,7 @@ obs_min_density: 50 # This many points makes it an obstacle min_points_per_voxel: 50 # Drop things below this density per voxel # Padding -pad_obstacles: 3 # Add this many units (voxel_size) to the area around obstacles -# pad_obstacles: 1 # Add this many units (voxel_size) to the area around obstacles +pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles min_pad_obstacles: 0 # Do not pad LESS than this amount, for safety. 
local_radius: 0.8 # Area around the robot to mark as explored (kind of a hack) From 4823fdd38944bc57fa5e3ff6ce0fcd6a8c6977a8 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 13:32:22 -0400 Subject: [PATCH 119/410] update robot code --- src/stretch/app/pickup_all.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index 82e777c88..7f34b53c9 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -72,11 +72,8 @@ def main( verbose=verbose, ) - # Start moving the robot around - grasp_client = None - # Agents wrap the robot high level planning interface for now - agent = RobotAgent(robot, parameters, semantic_sensor, grasp_client=grasp_client) + agent = RobotAgent(robot, parameters, semantic_sensor) agent.start(visualize_map_at_start=show_intermediate_maps) if reset: From b11a3285915fa583a8918fc87916a9e144b3a9b0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 14:06:57 -0400 Subject: [PATCH 120/410] update example --- src/stretch/app/pickup_all.py | 50 ++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index 7f34b53c9..a88884fb4 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -12,6 +12,7 @@ import click from stretch.agent.robot_agent import RobotAgent +from stretch.agent.task.pickup import PickupTask from stretch.agent.zmq_client import HomeRobotZmqClient from stretch.core import get_parameters from stretch.perception import create_semantic_sensor @@ -26,7 +27,19 @@ help="Set if we are executing on the robot and not on a remote computer", ) @click.option("--parameter_file", default="default_planner.yaml", help="Path to parameter file") -@click.option("--target_object", type=str, default="toy", help="Type of object to pick up and move") +@click.option( + "--target_object", + type=str, + default="toy", + help="Type of object to pick up from the floor and move", +) +@click.option( + "--receptacle", + "--target_receptacle", + type=str, + default="box", + help="Type of receptacle to place the object in", +) @click.option( "--force-rotate", "--force_rotate", @@ -34,10 +47,11 @@ help="Force the robot to rotate in place before doing anything.", ) @click.option( - "--destination", - type=str, - default="box", - help="Where to put the objects once you have found them", + "--match_method", + type=click.Choice(["class", "feature"]), + default="class", + help="Method to match objects to pick up. 
Options: class, feature.", + show_default=True, ) @click.option( "--mode", @@ -54,9 +68,10 @@ def main( show_intermediate_maps: bool = False, reset: bool = False, target_object: str = "shoe", - destination: str = "box", + receptacle: str = "box", force_rotate: bool = False, mode: str = "one_shot", + match_method: str = "feature", ): """Set up the robot, create a task plan, and execute it.""" # Create robot @@ -67,19 +82,36 @@ def main( parameters=parameters, ) semantic_sensor = create_semantic_sensor( - parameters, + parameters=parameters, device_id=device_id, verbose=verbose, ) + # Start moving the robot around + grasp_client = None + # Agents wrap the robot high level planning interface for now - agent = RobotAgent(robot, parameters, semantic_sensor) + agent = RobotAgent(robot, parameters, semantic_sensor, grasp_client=grasp_client) agent.start(visualize_map_at_start=show_intermediate_maps) + if reset: + agent.move_closed_loop([0, 0, 0], max_time=60.0) + + # After the robot has started... + try: + pickup_task = PickupTask( + agent, target_object=target_object, target_receptacle=receptacle, matching=match_method + ) + task = pickup_task.get_task(add_rotate=force_rotate, mode=mode) + except Exception as e: + print(f"Error creating task: {e}") + robot.stop() + raise e + + task.run() if reset: # Send the robot home at the end! agent.go_home() - agent.move_closed_loop([0, 0, 0], max_time=60.0) # At the end, disable everything robot.stop() From c810a2e1d6151d1e3d3f9a9d06084619c62619ff Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 14:15:50 -0400 Subject: [PATCH 121/410] retrieve an llm client --- src/stretch/agent/zmq_client.py | 4 ++ src/stretch/app/pickup_all.py | 68 ++++++++++++++++----------------- src/stretch/llms/__init__.py | 31 +++++++++++++++ 3 files changed, 69 insertions(+), 34 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 18171b93a..f5ec1bc25 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -1103,6 +1103,10 @@ def running(self) -> bool: """Is the client running""" return not self._finish + def is_running(self) -> bool: + """Is the client running""" + return not self._finish + def blocking_spin_state(self, verbose: bool = False): """Listen for incoming observations and update internal state""" diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index a88884fb4..9c0139516 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -15,6 +15,7 @@ from stretch.agent.task.pickup import PickupTask from stretch.agent.zmq_client import HomeRobotZmqClient from stretch.core import get_parameters +from stretch.llm import get_llm_client from stretch.perception import create_semantic_sensor @@ -27,25 +28,6 @@ help="Set if we are executing on the robot and not on a remote computer", ) @click.option("--parameter_file", default="default_planner.yaml", help="Path to parameter file") -@click.option( - "--target_object", - type=str, - default="toy", - help="Type of object to pick up from the floor and move", -) -@click.option( - "--receptacle", - "--target_receptacle", - type=str, - default="box", - help="Type of receptacle to place the object in", -) -@click.option( - "--force-rotate", - "--force_rotate", - is_flag=True, - help="Force the robot to rotate in place before doing anything.", -) @click.option( "--match_method", type=click.Choice(["class", "feature"]), @@ -59,6 +41,12 @@ help="Mode of operation for the robot.", 
type=click.Choice(["one_shot", "all"]), ) +@click.option( + "--llm", + default="gemma", + help="Client to use for language model.", + type=click.Choice(["gemma", "llama", "openai"]), +) def main( robot_ip: str = "192.168.1.15", local: bool = False, @@ -69,9 +57,9 @@ def main( reset: bool = False, target_object: str = "shoe", receptacle: str = "box", - force_rotate: bool = False, mode: str = "one_shot", match_method: str = "feature", + llm: str = "gemma", ): """Set up the robot, create a task plan, and execute it.""" # Create robot @@ -96,22 +84,34 @@ def main( if reset: agent.move_closed_loop([0, 0, 0], max_time=60.0) - # After the robot has started... - try: - pickup_task = PickupTask( - agent, target_object=target_object, target_receptacle=receptacle, matching=match_method - ) - task = pickup_task.get_task(add_rotate=force_rotate, mode=mode) - except Exception as e: - print(f"Error creating task: {e}") - robot.stop() - raise e + llm = get_llm_client("gemma") - task.run() + # Parse things and listen to the user + while robot.running: - if reset: - # Send the robot home at the end! - agent.go_home() + # Call the LLM client and parse + + # After the robot has started... + try: + pickup_task = PickupTask( + agent, + target_object=target_object, + target_receptacle=receptacle, + matching=match_method, + ) + task = pickup_task.get_task(add_rotate=True, mode=mode) + except Exception as e: + print(f"Error creating task: {e}") + robot.stop() + raise e + + task.run() + + if reset: + # Send the robot home at the end! + agent.go_home() + + break # At the end, disable everything robot.stop() diff --git a/src/stretch/llms/__init__.py b/src/stretch/llms/__init__.py index c6bfcf21b..48cbaf574 100644 --- a/src/stretch/llms/__init__.py +++ b/src/stretch/llms/__init__.py @@ -7,8 +7,39 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. +from .base import AbstractLLMClient, AbstractPromptBuilder from .gemma_client import Gemma2bClient from .llama_client import LlamaClient from .openai_client import OpenaiClient from .prompts.object_manip_nav_prompt import ObjectManipNavPromptBuilder from .prompts.simple_prompt import SimpleStretchPromptBuilder + +# This is a list of all the modules that are imported when you use the import * syntax. +# The __all__ variable is used to define what symbols get exported when from a module when you use the import * syntax. +__all__ = [ + "Gemma2bClient", + "LlamaClient", + "OpenaiClient", + "ObjectManipNavPromptBuilder", + "SimpleStretchPromptBuilder", +] + + +def get_llm_client(client_type: str, **kwargs) -> AbstractLLMClient: + """Return an LLM client of the specified type. + + Args: + client_type: The type of client to create. + kwargs: Additional keyword arguments to pass to the client constructor. + + Returns: + An LLM client. 
+ """ + if client_type == "gemma2b": + return Gemma2bClient(**kwargs) + elif client_type == "llama": + return LlamaClient(**kwargs) + elif client_type == "openai": + return OpenaiClient(**kwargs) + else: + raise ValueError(f"Invalid client type: {client_type}") From 845a0729c7f1d270fb016fb4f96a5fed86061ef2 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 14:18:24 -0400 Subject: [PATCH 122/410] update --- src/stretch/app/pickup_all.py | 6 +++--- src/stretch/llms/__init__.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index 9c0139516..712003371 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -15,7 +15,7 @@ from stretch.agent.task.pickup import PickupTask from stretch.agent.zmq_client import HomeRobotZmqClient from stretch.core import get_parameters -from stretch.llm import get_llm_client +from stretch.llms import get_llm_choices, get_llm_client from stretch.perception import create_semantic_sensor @@ -43,9 +43,9 @@ ) @click.option( "--llm", - default="gemma", + default="gemma2b", help="Client to use for language model.", - type=click.Choice(["gemma", "llama", "openai"]), + type=click.Choice(get_llm_choices()), ) def main( robot_ip: str = "192.168.1.15", diff --git a/src/stretch/llms/__init__.py b/src/stretch/llms/__init__.py index 48cbaf574..72a18359e 100644 --- a/src/stretch/llms/__init__.py +++ b/src/stretch/llms/__init__.py @@ -24,6 +24,17 @@ "SimpleStretchPromptBuilder", ] +llms = { + "gemma2b": Gemma2bClient, + "llama": LlamaClient, + "openai": OpenaiClient, +} + + +def get_llm_choices(): + """Return a list of available LLM clients.""" + return llms.keys() + def get_llm_client(client_type: str, **kwargs) -> AbstractLLMClient: """Return an LLM client of the specified type. From 56c962beb0a5069dd0d4b9e946bc42efe012f0f0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 14:18:51 -0400 Subject: [PATCH 123/410] Update --- src/stretch/app/pickup_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index 712003371..40d598459 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -84,7 +84,7 @@ def main( if reset: agent.move_closed_loop([0, 0, 0], max_time=60.0) - llm = get_llm_client("gemma") + llm = get_llm_client(llm) # Parse things and listen to the user while robot.running: From fe955867f25870523862da4940cd719cf1335881 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 14:28:23 -0400 Subject: [PATCH 124/410] update robot agent stuff --- src/stretch/agent/robot_agent.py | 1 + src/stretch/app/pickup_all.py | 40 ++++++++++++++++++++++++++++---- src/stretch/llms/__init__.py | 5 +++- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 54c245423..32ce3a6f3 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1313,6 +1313,7 @@ def reset(self, verbose: bool = True): "[WARNING] Resetting the robot's spatial memory. Everything it knows will go away!" 
) self.voxel_map.reset() + self.reset_object_plans() def save_instance_images(self, root: Union[Path, str] = ".", verbose: bool = False) -> None: """Save out instance images from the voxel map that we have collected while exploring.""" diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index 40d598459..ee7d5e9a1 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -31,7 +31,7 @@ @click.option( "--match_method", type=click.Choice(["class", "feature"]), - default="class", + default="feature", help="Method to match objects to pick up. Options: class, feature.", show_default=True, ) @@ -47,6 +47,31 @@ help="Client to use for language model.", type=click.Choice(get_llm_choices()), ) +@click.option( + "--device_id", + default=0, + help="ID of the device to use for perception", +) +@click.option( + "--verbose", + is_flag=True, + help="Set to print debug information", +) +@click.option( + "--show_intermediate_maps", + is_flag=True, + help="Set to visualize intermediate maps", +) +@click.option( + "--target_object", + default="", + help="Name of the object to pick up", +) +@click.option( + "--receptacle", + default="", + help="Name of the receptacle to place the object in", +) def main( robot_ip: str = "192.168.1.15", local: bool = False, @@ -55,8 +80,8 @@ def main( verbose: bool = False, show_intermediate_maps: bool = False, reset: bool = False, - target_object: str = "shoe", - receptacle: str = "box", + target_object: str = "", + receptacle: str = "", mode: str = "one_shot", match_method: str = "feature", llm: str = "gemma", @@ -84,12 +109,19 @@ def main( if reset: agent.move_closed_loop([0, 0, 0], max_time=60.0) - llm = get_llm_client(llm) + prompt = None + if prompt is not None: + llm = get_llm_client(llm) # Parse things and listen to the user while robot.running: + agent.reset() # Call the LLM client and parse + if len(target_object) == 0: + target_object = input("Enter the target object: ") + if len(receptacle) == 0: + receptacle = input("Enter the target receptacle: ") # After the robot has started... try: diff --git a/src/stretch/llms/__init__.py b/src/stretch/llms/__init__.py index 72a18359e..2d35a6a0a 100644 --- a/src/stretch/llms/__init__.py +++ b/src/stretch/llms/__init__.py @@ -6,6 +6,7 @@ # # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. +from typing import Union from .base import AbstractLLMClient, AbstractPromptBuilder from .gemma_client import Gemma2bClient @@ -36,7 +37,9 @@ def get_llm_choices(): return llms.keys() -def get_llm_client(client_type: str, **kwargs) -> AbstractLLMClient: +def get_llm_client( + client_type: str, prompt: Union[str, AbstractPromptBuilder], **kwargs +) -> AbstractLLMClient: """Return an LLM client of the specified type. 
Args: From 02d523fc674f0289a58087b1e2d2fea6e3f3fff7 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 12 Sep 2024 11:43:30 -0700 Subject: [PATCH 125/410] apply object mask callback added --- .../agent/manipulation/dinobot_prototype.py | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index de5619d2e..a937e8ce7 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -18,6 +18,7 @@ import torch from correspondences import find_correspondences, visualize_correspondences from extractor import ViTExtractor +from scipy.spatial.transform import Rotation from stretch.agent import RobotClient from stretch.agent.manipulation.dinobot import ( @@ -95,7 +96,9 @@ def get_correspondences( ) return points1, points2 - def run(self, robot: RobotClient, demo: Demo, visualize: bool = False): + def run( + self, robot: RobotClient, demo: Demo, visualize: bool = False, apply_mask_callback=None + ): """ Run the Dinobot algorithm Args: @@ -106,20 +109,20 @@ def run(self, robot: RobotClient, demo: Demo, visualize: bool = False): print("Running Dinobot") while True: servo = robot.get_servo_observation() - ee_depth = servo.ee_depth + ee_depth = servo.ee_depth.copy() + ee_rgb = servo.ee_rgb.copy() + if apply_mask_callback is not None: + ee_rgb = apply_mask_callback(ee_rgb) if not isinstance(demo.bottleneck_image_rgb, type(None)): start = time.perf_counter() with torch.no_grad(): - points1, points2 = self.get_correspondences( - demo.bottleneck_image_rgb, servo.ee_rgb - ) - self.move_to_bottleneck(None, points1, points2, ee_depth, demo) + points1, points2 = self.get_correspondences(demo.bottleneck_image_rgb, ee_rgb) inf_ts = (time.perf_counter() - start) * 1000 print(f"\n current: {inf_ts} ms") if visualize: if len(points1) == len(points2): im1, im2 = visualize_correspondences( - points1, points2, demo.bottleneck_image_rgb, servo.ee_rgb + points1, points2, demo.bottleneck_image_rgb, ee_rgb ) fig, axes = plt.subplots(1, 2, figsize=(10, 5)) axes[0].imshow(im1) @@ -131,6 +134,8 @@ def run(self, robot: RobotClient, demo: Demo, visualize: bool = False): plt.show() else: print("No correspondences found") + # Move the robot to the bottleneck pose + self.move_to_bottleneck(None, points1, points2, ee_depth, demo) else: print("No bottleneck image found") @@ -161,8 +166,9 @@ def move_to_bottleneck( points2 = extract_3d_coordinates(live_points, live_xyz) # Find rigid translation and rotation that aligns the points by minimising error, using SVD. 
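        # A minimal sketch of the SVD-based rigid alignment described in the comment
        # above, under the convention R @ p_i + t ~= q_i. The real helper is
        # find_transformation() from dinobot.py, which may differ in conventions or
        # reflection handling; np here is numpy, already imported by this module.
        def rigid_transform_svd_sketch(p, q):
            p_c, q_c = p.mean(axis=0), q.mean(axis=0)  # centroids of each point set
            H = (p - p_c).T @ (q - q_c)  # 3x3 cross-covariance of the centered points
            U, _, Vt = np.linalg.svd(H)
            R = Vt.T @ U.T
            if np.linalg.det(R) < 0:  # guard against a reflection solution
                Vt[-1, :] *= -1
                R = Vt.T @ U.T
            t = q_c - R @ p_c  # translation that maps the p centroid onto q
            return R, t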
R, t = find_transformation(points1, points2) - print(f"Robot needs to R: {R}, T: {t}") - # TODO: Move robot + r = Rotation.from_matrix(R) + angles = r.as_euler("xyz") + print(f"Robot needs to Rotate: {angles}, T: {t}") error = compute_error(points1, points2) return error @@ -188,16 +194,26 @@ def depth_to_xyz(depth: np.ndarray, camera_K: np.ndarray) -> np.ndarray: detic = DeticPerception() track_object_id = 41 # detic object id for cup - # First frame is the bottleneck image + # First frame is the bottleneck image for now bottleneck_image_rgb = robot.get_servo_observation().ee_rgb bottleneck_image_depth = robot.get_servo_observation().ee_depth bottleneck_image_camera_K = robot.get_servo_observation().ee_camera_K semantic, instance, task_observations = detic.predict(bottleneck_image_rgb) + + def apply_mask_callback(image: np.ndarray) -> np.ndarray: + semantic, instance, task_observations = detic.predict(image) + if track_object_id in task_observations["instance_classes"]: + object_mask = semantic == track_object_id + image[~object_mask] = [0, 0, 0] + else: + print(f"Object ID: {track_object_id} not found in the live image") + return image + if track_object_id in task_observations["instance_classes"]: object_mask = semantic == track_object_id demo = Demo( bottleneck_image_rgb, bottleneck_image_depth, bottleneck_image_camera_K, object_mask ) - dinobot.run(robot, demo, visualize=True) + dinobot.run(robot, demo, visualize=True, apply_mask_callback=apply_mask_callback) else: print(f"Object ID: {track_object_id} not found in the image") From b5273f505e2cb132d142197fa807bfdee3ad2fd3 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 14:46:54 -0400 Subject: [PATCH 126/410] Update navigation and handling not doing a motion plan if not necessray --- src/stretch/agent/operations/navigate.py | 11 ++++++++++- src/stretch/agent/robot_agent.py | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/navigate.py b/src/stretch/agent/operations/navigate.py index 10434eee6..971d3bac4 100644 --- a/src/stretch/agent/operations/navigate.py +++ b/src/stretch/agent/operations/navigate.py @@ -43,8 +43,13 @@ def can_start(self): ) breakpoint() + if self.agent.within_reach_of(self.get_target()): + self.cheer("Already within reach of object!") + return True + # Motion plan to the object - plan = self.plan_to_instance_for_manipulation(self.get_target(), start=start) + plan = self.agent.plan_to_instance_for_manipulation(self.get_target(), start=start) + if plan.success: self.plan = plan self.cheer("Found plan to object!") @@ -58,6 +63,10 @@ def run(self): self.intro("executing motion plan to the object.") self.robot.move_to_nav_posture() + if self.agent.within_reach_of(self.get_target()): + self.warn("Already within reach of object!") + return + # Execute the trajectory assert ( self.plan is not None diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 32ce3a6f3..57392a459 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -570,6 +570,24 @@ def plan_to_instance_for_manipulation( use_cache=use_cache, ) + def within_reach_of( + self, instance: Instance, start: Optional[np.ndarray] = None, verbose: bool = False + ) -> bool: + """Check if the instance is within manipulation range. 
+ + Args: + instance(Instance): an object in the world + start(np.ndarray): the start position + verbose(bool): extra info is printed + + Returns: + bool: whether the instance is within manipulation range + """ + + start = start if start is not None else self.robot.get_base_pose() + + bounds = instance.object_xyz + def plan_to_instance( self, instance: Instance, From 20cedcfd014ccffb483d3b6c487fbbc10a60ff06 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 15:01:42 -0400 Subject: [PATCH 127/410] update navigation --- src/stretch/agent/operations/navigate.py | 18 +++++++++++++++++- src/stretch/agent/robot_agent.py | 14 ++++++++++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/operations/navigate.py b/src/stretch/agent/operations/navigate.py index 971d3bac4..36917d0b4 100644 --- a/src/stretch/agent/operations/navigate.py +++ b/src/stretch/agent/operations/navigate.py @@ -8,6 +8,10 @@ # license information maybe found below, if so. +import math + +import numpy as np + from stretch.agent.base import ManagedOperation @@ -45,7 +49,6 @@ def can_start(self): if self.agent.within_reach_of(self.get_target()): self.cheer("Already within reach of object!") - return True # Motion plan to the object plan = self.agent.plan_to_instance_for_manipulation(self.get_target(), start=start) @@ -60,11 +63,24 @@ def can_start(self): return False def run(self): + """Navigate the robot to the object. If the object is within reach, we will not move the robot, but will rotate it to face the object. + + Returns: + bool: True if the operation was successful, False otherwise. + """ + self.intro("executing motion plan to the object.") self.robot.move_to_nav_posture() if self.agent.within_reach_of(self.get_target()): self.warn("Already within reach of object!") + xyz = self.get_target().get_center() + theta = math.atan2( + xyz[1] - self.robot.get_base_pose()[1], xyz[0] - self.robot.get_base_pose()[0] + ) + self.robot.navigate_to( + np.array([xyz[0], xyz[1], theta + np.pi / 2]), blocking=True, timeout=30.0 + ) return # Execute the trajectory diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 57392a459..04167b7d1 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -585,8 +585,18 @@ def within_reach_of( """ start = start if start is not None else self.robot.get_base_pose() - - bounds = instance.object_xyz + if isinstance(start, np.ndarray): + start = torch.tensor(start, device=self.voxel_map.device) + object_xyz = instance.get_center() + if np.linalg.norm(start[:2] - object_xyz[:2]) < self._manipulation_radius: + return True + if ( + ((instance.point_cloud[:, :2] - start[:2]).norm(dim=-1) < self._manipulation_radius) + .any() + .item() + ): + return True + return False def plan_to_instance( self, From 2662e6174feac217ab6ea84ae778e11b921147da Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 15:04:25 -0400 Subject: [PATCH 128/410] navigation --- src/stretch/agent/operations/navigate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/navigate.py b/src/stretch/agent/operations/navigate.py index 36917d0b4..f6a8a4238 100644 --- a/src/stretch/agent/operations/navigate.py +++ b/src/stretch/agent/operations/navigate.py @@ -75,11 +75,14 @@ def run(self): if self.agent.within_reach_of(self.get_target()): self.warn("Already within reach of object!") xyz = self.get_target().get_center() + start_xyz = self.robot.get_base_pose()[:2] theta = 
math.atan2( xyz[1] - self.robot.get_base_pose()[1], xyz[0] - self.robot.get_base_pose()[0] ) self.robot.navigate_to( - np.array([xyz[0], xyz[1], theta + np.pi / 2]), blocking=True, timeout=30.0 + np.array([start_xyz[0], start_xyz[1], theta + np.pi / 2]), + blocking=True, + timeout=30.0, ) return From 1fd37741707c131599d18b8e63e0136c63cee3b0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 15:25:00 -0400 Subject: [PATCH 129/410] update --- src/stretch/app/pickup_all.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index ee7d5e9a1..2f19d1cd3 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -109,7 +109,10 @@ def main( if reset: agent.move_closed_loop([0, 0, 0], max_time=60.0) + # TODO: Add a prompt for the user to enter the target object and receptacle prompt = None + + # Get the LLM client if prompt is not None: llm = get_llm_client(llm) From 7fe4fa63011942c1a13287126fd086e1689c9172 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 15:25:34 -0400 Subject: [PATCH 130/410] update pickup --- src/stretch/app/pickup_all.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index 2f19d1cd3..ecf9c80b7 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -120,11 +120,14 @@ def main( while robot.running: agent.reset() - # Call the LLM client and parse - if len(target_object) == 0: - target_object = input("Enter the target object: ") - if len(receptacle) == 0: - receptacle = input("Enter the target receptacle: ") + if llm is None: + # Call the LLM client and parse + if len(target_object) == 0: + target_object = input("Enter the target object: ") + if len(receptacle) == 0: + receptacle = input("Enter the target receptacle: ") + else: + raise NotImplementedError("LLM client not implemented yet") # After the robot has started... try: From f1cc31fd39df05f2c8d899319cc03d6c108b4eff Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 15:41:56 -0400 Subject: [PATCH 131/410] update pickup --- src/stretch/app/pickup_all.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index ecf9c80b7..4ff2b693e 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -114,7 +114,7 @@ def main( # Get the LLM client if prompt is not None: - llm = get_llm_client(llm) + llm = get_llm_client(llm, prompt=prompt) # Parse things and listen to the user while robot.running: @@ -126,8 +126,6 @@ def main( target_object = input("Enter the target object: ") if len(receptacle) == 0: receptacle = input("Enter the target receptacle: ") - else: - raise NotImplementedError("LLM client not implemented yet") # After the robot has started... 
try: From 62b45ef4c7e4240d3eba905df0acc8c6d065164c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 16:13:16 -0400 Subject: [PATCH 132/410] Update the robot agent --- src/stretch/agent/robot_agent.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 04167b7d1..88617ec9d 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -635,6 +635,7 @@ def plan_to_instance( if verbose: print(f"- try retrieving cached plan for {instance_id}: {has_plan=}") + # Plan to the instance if not has_plan: # Call planner res = self.plan_to_bounds( From bcc89929cf5c6a373e71172b26724066192572ce Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 16:21:02 -0400 Subject: [PATCH 133/410] update --- src/stretch/agent/task/pickup/pickup_task.py | 4 ++-- src/stretch/app/pickup_all.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_task.py b/src/stretch/agent/task/pickup/pickup_task.py index 23c101da2..7b1dd0c78 100644 --- a/src/stretch/agent/task/pickup/pickup_task.py +++ b/src/stretch/agent/task/pickup/pickup_task.py @@ -110,7 +110,7 @@ def get_one_shot_task(self, add_rotate: bool = False, matching: str = "feature") # Look for the target receptacle search_for_receptacle = SearchForReceptacleOperation( - "search_for_box", + "search_for_{self.target_receptacle}", self.agent, parent=rotate_in_place if add_rotate else go_to_navigation_mode, retry_on_failure=True, @@ -119,7 +119,7 @@ def get_one_shot_task(self, add_rotate: bool = False, matching: str = "feature") # Try to expand the frontier and find an object; or just wander around for a while. search_for_object = SearchForObjectOnFloorOperation( - "search_for_objects_on_floor", + "search_for_{self.target_object}_on_floor", self.agent, retry_on_failure=True, match_method=matching, diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index 4ff2b693e..f67181606 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -113,14 +113,15 @@ def main( prompt = None # Get the LLM client + llm_client = None if prompt is not None: - llm = get_llm_client(llm, prompt=prompt) + llm_client = get_llm_client(llm, prompt=prompt) # Parse things and listen to the user while robot.running: agent.reset() - if llm is None: + if llm_client is None: # Call the LLM client and parse if len(target_object) == 0: target_object = input("Enter the target object: ") From 328d8d3e6e4ee9f5d8026e22fe1e35f4ef529075 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 12 Sep 2024 16:25:08 -0400 Subject: [PATCH 134/410] update pickup all --- src/stretch/app/pickup_all.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/pickup_all.py index f67181606..316a9892a 100644 --- a/src/stretch/app/pickup_all.py +++ b/src/stretch/app/pickup_all.py @@ -11,6 +11,7 @@ import click +import stretch.utils.logger as logger from stretch.agent.robot_agent import RobotAgent from stretch.agent.task.pickup import PickupTask from stretch.agent.zmq_client import HomeRobotZmqClient @@ -128,6 +129,10 @@ def main( if len(receptacle) == 0: receptacle = input("Enter the target receptacle: ") + if len(target_object) == 0 or len(receptacle) == 0: + logger.error("You need to enter a target object and receptacle") + break + # After the robot has started... 
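        # A possible shape for the "call the LLM client and parse" branch above, once
        # the TODO for building a prompt is resolved (hypothetical sketch: the prompt
        # builder choice and the way the client is invoked and parsed are assumptions,
        # not part of this patch; ObjectManipNavPromptBuilder is exported by stretch.llms):
        #
        #   prompt = ObjectManipNavPromptBuilder()
        #   llm_client = get_llm_client(llm, prompt=prompt)
        #   response = llm_client(input("Enter a command: "))
        #   # ...extract target_object and receptacle from `response` here...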
try: pickup_task = PickupTask( From 2ea8edd47218359d4c0e478359f264416b3a2bd7 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 12 Sep 2024 16:44:10 -0700 Subject: [PATCH 135/410] Fix depth transforms --- .../RenderOption_2024-09-12-15-31-05.json | 41 +++++++++++ src/stretch/agent/manipulation/dinobot.py | 21 +++--- .../agent/manipulation/dinobot_prototype.py | 72 +++++++++++++++++++ 3 files changed, 125 insertions(+), 9 deletions(-) create mode 100644 src/stretch/agent/manipulation/RenderOption_2024-09-12-15-31-05.json diff --git a/src/stretch/agent/manipulation/RenderOption_2024-09-12-15-31-05.json b/src/stretch/agent/manipulation/RenderOption_2024-09-12-15-31-05.json new file mode 100644 index 000000000..26cf14f8d --- /dev/null +++ b/src/stretch/agent/manipulation/RenderOption_2024-09-12-15-31-05.json @@ -0,0 +1,41 @@ +{ + "background_color" : [ 1.0, 1.0, 1.0 ], + "class_name" : "RenderOption", + "default_mesh_color" : [ 0.69999999999999996, 0.69999999999999996, 0.69999999999999996 ], + "image_max_depth" : 3000, + "image_stretch_option" : 1, + "interpolation_option" : 0, + "light0_color" : [ 1.0, 1.0, 1.0 ], + "light0_diffuse_power" : 0.66000000000000003, + "light0_position" : [ 0.0, 0.0, 2.0 ], + "light0_specular_power" : 0.20000000000000001, + "light0_specular_shininess" : 100.0, + "light1_color" : [ 1.0, 1.0, 1.0 ], + "light1_diffuse_power" : 0.66000000000000003, + "light1_position" : [ 0.0, 0.0, 2.0 ], + "light1_specular_power" : 0.20000000000000001, + "light1_specular_shininess" : 100.0, + "light2_color" : [ 1.0, 1.0, 1.0 ], + "light2_diffuse_power" : 0.66000000000000003, + "light2_position" : [ 0.0, 0.0, -2.0 ], + "light2_specular_power" : 0.20000000000000001, + "light2_specular_shininess" : 100.0, + "light3_color" : [ 1.0, 1.0, 1.0 ], + "light3_diffuse_power" : 0.66000000000000003, + "light3_position" : [ 0.0, 0.0, -2.0 ], + "light3_specular_power" : 0.20000000000000001, + "light3_specular_shininess" : 100.0, + "light_ambient_color" : [ 0.0, 0.0, 0.0 ], + "light_on" : true, + "line_width" : 1.0, + "mesh_color_option" : 1, + "mesh_shade_option" : 0, + "mesh_show_back_face" : false, + "mesh_show_wireframe" : false, + "point_color_option" : 0, + "point_show_normal" : false, + "point_size" : 5.0, + "show_coordinate_frame" : false, + "version_major" : 1, + "version_minor" : 0 +} \ No newline at end of file diff --git a/src/stretch/agent/manipulation/dinobot.py b/src/stretch/agent/manipulation/dinobot.py index 74459e337..d8e724dfe 100644 --- a/src/stretch/agent/manipulation/dinobot.py +++ b/src/stretch/agent/manipulation/dinobot.py @@ -14,13 +14,15 @@ import matplotlib.pyplot as plt import numpy as np import torch -sys.path.append('/home/hello-robot/repos/dino-vit-features') + +sys.path.append("/home/hello-robot/repos/dino-vit-features") +import sys + # Install this DINO repo to extract correspondences: https://github.com/ShirAmir/dino-vit-features from correspondences import draw_correspondences, find_correspondences from PIL import Image from stretch.agent import HomeRobotZmqClient -import sys # Hyperparameters for DINO correspondences extraction # num_pairs = 8 @@ -132,15 +134,15 @@ def extract_3d_coordinates(points, xyz): depths = [] for point in points: x, y = point - depths.append(xyz[y, x, 2]) + depths.append(xyz[x][y]) # Create a new array of shape (n, 3) with the 3d coordinates - points_3d = [] - for i, point in enumerate(points): - x, y = point - points_3d.append([x, y, depths[i]]) + # points_3d = [] + # for i, point in enumerate(points): + # x, y = point + # 
points_3d.append() - return np.array(points_3d) + return np.array(depths) def compute_error(points1: np.ndarray, points2: np.ndarray) -> float: @@ -236,5 +238,6 @@ def main(robot_ip, local, path_to_demo_folder): # Start the demo dinobot_demo(robot, path_to_demo_folder) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index a937e8ce7..2beeba946 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -15,6 +15,7 @@ import matplotlib.pyplot as plt import numpy as np +import open3d as o3d import torch from correspondences import find_correspondences, visualize_correspondences from extractor import ViTExtractor @@ -28,6 +29,8 @@ ) from stretch.perception.detection.detic import DeticPerception +DEBUG_VISUALIZATION = False + class Demo: """ @@ -164,12 +167,69 @@ def move_to_bottleneck( points1 = extract_3d_coordinates(bottleneck_points, bottleneck_xyz) live_xyz = depth_to_xyz(live_depth, demo.bottleneck_image_camera_K) points2 = extract_3d_coordinates(live_points, live_xyz) + + invalid_depth_ids = [] + for i, point in enumerate(points1): + print(point) + if np.mean(point) == 0: + invalid_depth_ids.append(i) + for i, point in enumerate(points2): + if np.mean(point) == 0: + invalid_depth_ids.append(i) + invalid_depth_ids = list(set(invalid_depth_ids)) + points1 = np.delete(points1, invalid_depth_ids, axis=0) + points2 = np.delete(points2, invalid_depth_ids, axis=0) + print(f" Number of valid correspondences: {len(points1)}") + # Find rigid translation and rotation that aligns the points by minimising error, using SVD. R, t = find_transformation(points1, points2) r = Rotation.from_matrix(R) angles = r.as_euler("xyz") print(f"Robot needs to Rotate: {angles}, T: {t}") error = compute_error(points1, points2) + print(f"Error: {error}") + + # Example usage + unique_colors = generate_unique_colors(len(points1)) + + # Debug 3D Visualization + if DEBUG_VISUALIZATION: + origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( + size=0.03, origin=[0, 0, 0] + ) + + bottleneck_frame = o3d.geometry.PointCloud() + bottleneck_frame.points = o3d.utility.Vector3dVector( + bottleneck_xyz.reshape((bottleneck_xyz.shape[0] * bottleneck_xyz.shape[1], 3)) + ) + points1_frame = o3d.geometry.PointCloud() + points1_frame.points = o3d.utility.Vector3dVector(points1) + + spheres = [] + for point, color in zip(points1, unique_colors): + sphere = o3d.geometry.TriangleMesh.create_sphere(radius=0.01) + sphere.paint_uniform_color(color) + sphere.translate(point) + spheres.append(sphere) + components = [origin_frame, bottleneck_frame] + components.extend(spheres) + o3d.visualization.draw_geometries(components) + + # live_frame = o3d.geometry.PointCloud() + # live_frame.points = o3d.utility.Vector3dVector(live_xyz.reshape((live_xyz.shape[0]*live_xyz.shape[1],3))) + # points2_frame = o3d.geometry.PointCloud() + # points2_frame.points = o3d.utility.Vector3dVector(points2) + + # spheres = [] + # for point,color in zip(points2,unique_colors): + # sphere = o3d.geometry.TriangleMesh.create_sphere(radius=0.01) + # sphere.paint_uniform_color(color) + # sphere.translate(point) + # spheres.append(sphere) + # components = [origin_frame,live_frame] + # components.extend(spheres) + # o3d.visualization.draw_geometries(components) + return error @@ -188,6 +248,18 @@ def depth_to_xyz(depth: np.ndarray, camera_K: np.ndarray) -> 
np.ndarray: return xyz +def generate_unique_colors(n: int) -> List[Tuple[int, int, int]]: + """ + Generate N unique colors. + Args: + n (int): Number of unique colors to generate + Returns: + List[Tuple[int, int, int]]: List of RGB color tuples + """ + colors = plt.cm.get_cmap("hsv", n) + return [tuple(c for c in colors(i)[:3]) for i in range(n)] + + if __name__ == "__main__": robot = RobotClient(robot_ip="10.0.0.14") dinobot = Dinobot() From 976b007b16a9660a79e777d1b1f7de42bebefb54 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 12 Sep 2024 18:59:55 -0700 Subject: [PATCH 136/410] wip --- .../agent/manipulation/dinobot_prototype.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 2beeba946..585af51ef 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -27,6 +27,7 @@ extract_3d_coordinates, find_transformation, ) +from stretch.motion import HelloStretchIdx from stretch.perception.detection.detic import DeticPerception DEBUG_VISUALIZATION = False @@ -138,10 +139,21 @@ def run( else: print("No correspondences found") # Move the robot to the bottleneck pose - self.move_to_bottleneck(None, points1, points2, ee_depth, demo) + self.move_to_bottleneck(robot, points1, points2, ee_depth, demo) else: print("No bottleneck image found") + def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: + """ + Move the robot to the bottleneck pose + """ + robot.switch_to_manipulation_mode() + joint_state = robot.get_joint_positions().copy() + # joint_state[HelloStretchIdx.ARM] = joint_state[HelloStretchIdx.ARM] + t[2] + # joint_state[HelloStretchIdx.LIFT] = joint_state[HelloStretchIdx.LIFT] + t[1] + joint_state[HelloStretchIdx.BASE_X] = joint_state[HelloStretchIdx.BASE_X] + t[0] + robot.arm_to(joint_state, blocking=True) + def move_to_bottleneck( self, robot: RobotClient, @@ -151,7 +163,7 @@ def move_to_bottleneck( demo: Demo, ) -> float: """ - Move the robot to the bottleneck pose + Compute the transformation to move the robot to the bottleneck pose Args: robot (RobotClient): The robot client to use bottleneck_points (List): The bottleneck points @@ -189,15 +201,14 @@ def move_to_bottleneck( error = compute_error(points1, points2) print(f"Error: {error}") - # Example usage - unique_colors = generate_unique_colors(len(points1)) - # Debug 3D Visualization if DEBUG_VISUALIZATION: + unique_colors = generate_unique_colors(len(points1)) origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( size=0.03, origin=[0, 0, 0] ) + # Bottleneck frame bottleneck_frame = o3d.geometry.PointCloud() bottleneck_frame.points = o3d.utility.Vector3dVector( bottleneck_xyz.reshape((bottleneck_xyz.shape[0] * bottleneck_xyz.shape[1], 3)) @@ -215,6 +226,7 @@ def move_to_bottleneck( components.extend(spheres) o3d.visualization.draw_geometries(components) + # Live frame # live_frame = o3d.geometry.PointCloud() # live_frame.points = o3d.utility.Vector3dVector(live_xyz.reshape((live_xyz.shape[0]*live_xyz.shape[1],3))) # points2_frame = o3d.geometry.PointCloud() @@ -230,6 +242,8 @@ def move_to_bottleneck( # components.extend(spheres) # o3d.visualization.draw_geometries(components) + self.move_robot(robot, R, t) + return error @@ -279,6 +293,7 @@ def apply_mask_callback(image: np.ndarray) -> np.ndarray: image[~object_mask] = [0, 0, 0] else: print(f"Object ID: {track_object_id} not 
found in the live image") + breakpoint() return image if track_object_id in task_observations["instance_classes"]: @@ -286,6 +301,7 @@ def apply_mask_callback(image: np.ndarray) -> np.ndarray: demo = Demo( bottleneck_image_rgb, bottleneck_image_depth, bottleneck_image_camera_K, object_mask ) + input("Displace the object") dinobot.run(robot, demo, visualize=True, apply_mask_callback=apply_mask_callback) else: print(f"Object ID: {track_object_id} not found in the image") From 599754414ce032d8369ba532802ab9f8afcf3d5e Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 12 Sep 2024 21:19:18 -0700 Subject: [PATCH 137/410] wip --- .../RenderOption_2024-09-12-15-31-05.json | 41 ----------------- .../agent/manipulation/dinobot_prototype.py | 46 ++++++++++++++----- 2 files changed, 35 insertions(+), 52 deletions(-) delete mode 100644 src/stretch/agent/manipulation/RenderOption_2024-09-12-15-31-05.json diff --git a/src/stretch/agent/manipulation/RenderOption_2024-09-12-15-31-05.json b/src/stretch/agent/manipulation/RenderOption_2024-09-12-15-31-05.json deleted file mode 100644 index 26cf14f8d..000000000 --- a/src/stretch/agent/manipulation/RenderOption_2024-09-12-15-31-05.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "background_color" : [ 1.0, 1.0, 1.0 ], - "class_name" : "RenderOption", - "default_mesh_color" : [ 0.69999999999999996, 0.69999999999999996, 0.69999999999999996 ], - "image_max_depth" : 3000, - "image_stretch_option" : 1, - "interpolation_option" : 0, - "light0_color" : [ 1.0, 1.0, 1.0 ], - "light0_diffuse_power" : 0.66000000000000003, - "light0_position" : [ 0.0, 0.0, 2.0 ], - "light0_specular_power" : 0.20000000000000001, - "light0_specular_shininess" : 100.0, - "light1_color" : [ 1.0, 1.0, 1.0 ], - "light1_diffuse_power" : 0.66000000000000003, - "light1_position" : [ 0.0, 0.0, 2.0 ], - "light1_specular_power" : 0.20000000000000001, - "light1_specular_shininess" : 100.0, - "light2_color" : [ 1.0, 1.0, 1.0 ], - "light2_diffuse_power" : 0.66000000000000003, - "light2_position" : [ 0.0, 0.0, -2.0 ], - "light2_specular_power" : 0.20000000000000001, - "light2_specular_shininess" : 100.0, - "light3_color" : [ 1.0, 1.0, 1.0 ], - "light3_diffuse_power" : 0.66000000000000003, - "light3_position" : [ 0.0, 0.0, -2.0 ], - "light3_specular_power" : 0.20000000000000001, - "light3_specular_shininess" : 100.0, - "light_ambient_color" : [ 0.0, 0.0, 0.0 ], - "light_on" : true, - "line_width" : 1.0, - "mesh_color_option" : 1, - "mesh_shade_option" : 0, - "mesh_show_back_face" : false, - "mesh_show_wireframe" : false, - "point_color_option" : 0, - "point_show_normal" : false, - "point_size" : 5.0, - "show_coordinate_frame" : false, - "version_major" : 1, - "version_minor" : 0 -} \ No newline at end of file diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 585af51ef..576a6ee51 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -10,6 +10,7 @@ import sys sys.path.append("/home/hello-robot/repos/dino-vit-features") +sys.path.append("/home/hello-robot/repos/gripper_grasp_space") import time from typing import List, Tuple @@ -20,6 +21,7 @@ from correspondences import find_correspondences, visualize_correspondences from extractor import ViTExtractor from scipy.spatial.transform import Rotation +from urdf_utils import get_stretch_3_urdf from stretch.agent import RobotClient from stretch.agent.manipulation.dinobot import ( @@ -80,12 +82,13 @@ def __init__(self, 
model_type: str = "dino_vits8", stride: int = 4): self.feature_extractor = ViTExtractor( model_type=model_type, stride=stride, device=self.device ) + self.urdf_model = get_stretch_3_urdf() def get_correspondences( self, image1: np.ndarray, image2: np.ndarray, - num_pairs: int = 10, + num_pairs: int = 20, load_size: int = 224, layer: int = 9, facet: str = "key", @@ -101,7 +104,12 @@ def get_correspondences( return points1, points2 def run( - self, robot: RobotClient, demo: Demo, visualize: bool = False, apply_mask_callback=None + self, + robot: RobotClient, + demo: Demo, + visualize: bool = False, + apply_mask_callback=None, + error_threshold: float = 0.01, ): """ Run the Dinobot algorithm @@ -111,7 +119,8 @@ def run( visualize (bool): Whether to visualize the correspondences """ print("Running Dinobot") - while True: + error = 100000 + while error > error_threshold: servo = robot.get_servo_observation() ee_depth = servo.ee_depth.copy() ee_rgb = servo.ee_rgb.copy() @@ -139,19 +148,36 @@ def run( else: print("No correspondences found") # Move the robot to the bottleneck pose - self.move_to_bottleneck(robot, points1, points2, ee_depth, demo) + error = self.move_to_bottleneck(robot, points1, points2, ee_depth, demo) else: print("No bottleneck image found") + print("Dinobot finished") def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: """ Move the robot to the bottleneck pose """ + ee_transform = np.eye(4) + ee_transform[:3, :3] = R + ee_transform[:3, 3] = t + T_d405_to_wrist_yaw = self.urdf_model.get_transform( + "gripper_camera_color_optical_frame", "link_wrist_yaw" + ) + ee_transform_wrist_yaw_T = T_d405_to_wrist_yaw @ ee_transform + translation_vector = ee_transform_wrist_yaw_T[:3, 3] + rotation_mat = ee_transform_wrist_yaw_T[:3, :3] robot.switch_to_manipulation_mode() joint_state = robot.get_joint_positions().copy() - # joint_state[HelloStretchIdx.ARM] = joint_state[HelloStretchIdx.ARM] + t[2] - # joint_state[HelloStretchIdx.LIFT] = joint_state[HelloStretchIdx.LIFT] + t[1] - joint_state[HelloStretchIdx.BASE_X] = joint_state[HelloStretchIdx.BASE_X] + t[0] + print(f"TRanslation: {translation_vector}") + breakpoint() + # joint_state[HelloStretchIdx.ARM] = joint_state[HelloStretchIdx.ARM] + ee_transform_wrist_yaw_T[:3][3] + # joint_state[HelloStretchIdx.LIFT] = joint_state[HelloStretchIdx.LIFT] + ee_transform_wrist_yaw_T[:3][2] + joint_state[HelloStretchIdx.BASE_X] = ( + joint_state[HelloStretchIdx.BASE_X] - translation_vector[0] + ) + # joint_state[HelloStretchIdx.WRIST_YAW] = joint_state[HelloStretchIdx.WRIST_YAW] + R[0] + # joint_state[HelloStretchIdx.WRIST_PITCH] = joint_state[HelloStretchIdx.WRIST_PITCH] + R[1] + # joint_state[HelloStretchIdx.WRIST_ROLL] = joint_state[HelloStretchIdx.WRIST_ROLL] + R[2] robot.arm_to(joint_state, blocking=True) def move_to_bottleneck( @@ -182,7 +208,6 @@ def move_to_bottleneck( invalid_depth_ids = [] for i, point in enumerate(points1): - print(point) if np.mean(point) == 0: invalid_depth_ids.append(i) for i, point in enumerate(points2): @@ -197,7 +222,7 @@ def move_to_bottleneck( R, t = find_transformation(points1, points2) r = Rotation.from_matrix(R) angles = r.as_euler("xyz") - print(f"Robot needs to Rotate: {angles}, T: {t}") + print(f"Camera frame to Rot: {R}, Trans: {t}") error = compute_error(points1, points2) print(f"Error: {error}") @@ -242,8 +267,7 @@ def move_to_bottleneck( # components.extend(spheres) # o3d.visualization.draw_geometries(components) - self.move_robot(robot, R, t) - + self.move_robot(robot, angles, t) 
return error From f2a0e1f93f2339bcb2ce0c796065dea502e9f98a Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 12 Sep 2024 21:50:10 -0700 Subject: [PATCH 138/410] wip --- src/stretch/agent/manipulation/dinobot_prototype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 576a6ee51..b5c405326 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -170,7 +170,7 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: joint_state = robot.get_joint_positions().copy() print(f"TRanslation: {translation_vector}") breakpoint() - # joint_state[HelloStretchIdx.ARM] = joint_state[HelloStretchIdx.ARM] + ee_transform_wrist_yaw_T[:3][3] + joint_state[HelloStretchIdx.ARM] = joint_state[HelloStretchIdx.ARM] - translation_vector[1] # joint_state[HelloStretchIdx.LIFT] = joint_state[HelloStretchIdx.LIFT] + ee_transform_wrist_yaw_T[:3][2] joint_state[HelloStretchIdx.BASE_X] = ( joint_state[HelloStretchIdx.BASE_X] - translation_vector[0] From 4b183019c9a267d5b7c93e9544a8e00cf5100737 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 11:21:41 -0400 Subject: [PATCH 139/410] update --- src/stretch/agent/operations/grasp_object.py | 46 ++++++++++++++++++-- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index eae6cee2b..09322c388 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -7,6 +7,14 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + import time import timeit from typing import Optional, Tuple @@ -34,6 +42,7 @@ class GraspObjectOperation(ManagedOperation): _success: bool = False # Task information + match_method: str = "class" target_object: Optional[str] = None # Debugging UI elements @@ -132,10 +141,36 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: np.ndarray: Mask for the class of the object we are trying to grasp """ mask = np.zeros_like(servo.semantic).astype(bool) # type: ignore - for iid in np.unique(servo.semantic): - name = self.agent.semantic_sensor.get_class_name_for_id(iid) - if name is not None and self.target_object in name: - mask = np.bitwise_or(mask, servo.semantic == iid) + + if self.match_method == "class": + + # Get the target class + if self.agent.current_object is not None: + target_class_id = self.agent.current_object.category_id + target_class = self.agent.semantic_sensor.get_class_name_for_id(target_class_id) + else: + target_class = self.target_object + + # Now find the mask with that class + for iid in np.unique(servo.semantic): + name = self.agent.semantic_sensor.get_class_name_for_id(iid) + if name is not None and target_class in name: + mask = np.bitwise_or(mask, servo.semantic == iid) + + elif self.match_method == "feature": + if self.target_object is None: + raise ValueError( + f"Target object must be set before running match method {self.match_method}." 
+ ) + for iid in np.unique(servo.instance): + rgb = servo.ee_rgb[servo.instance == iid] + import matplotlib.pyplot as plt + + plt.imshow(rgb) + plt.show() + else: + raise ValueError(f"Invalid matching method {self.match_method}.") + return mask def set_target_object_class(self, target_object: str): @@ -174,6 +209,8 @@ def get_target_mask( maximum_overlap_mask = None maximum_overlap_pts = float("-inf") center_x, center_y = center + + # Loop over all detected instances -- finding the one that matches our target object for iid in np.unique(instance_mask): current_instance_mask = instance_mask == iid @@ -284,6 +321,7 @@ def visual_servo_to_object( latest_mask = self.get_target_mask( servo, instance, prev_mask=prev_target_mask, center=(center_x, center_y) ) + print("latest mask size:", np.sum(latest_mask.flatten())) # dilate mask kernel = np.ones((3, 3), np.uint8) From d7390ff91722ed8af9861881ee3905d473625e65 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 13:11:50 -0400 Subject: [PATCH 140/410] updates --- src/stretch/agent/task/pickup/pickup_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_task.py b/src/stretch/agent/task/pickup/pickup_task.py index 7b1dd0c78..f520a8d21 100644 --- a/src/stretch/agent/task/pickup/pickup_task.py +++ b/src/stretch/agent/task/pickup/pickup_task.py @@ -110,7 +110,7 @@ def get_one_shot_task(self, add_rotate: bool = False, matching: str = "feature") # Look for the target receptacle search_for_receptacle = SearchForReceptacleOperation( - "search_for_{self.target_receptacle}", + f"search_for_{self.target_receptacle}", self.agent, parent=rotate_in_place if add_rotate else go_to_navigation_mode, retry_on_failure=True, @@ -119,7 +119,7 @@ def get_one_shot_task(self, add_rotate: bool = False, matching: str = "feature") # Try to expand the frontier and find an object; or just wander around for a while. 
search_for_object = SearchForObjectOnFloorOperation( - "search_for_{self.target_object}_on_floor", + f"search_for_{self.target_object}_on_floor", self.agent, retry_on_failure=True, match_method=matching, From aea504f8bd68ce61720b171c8324524cb36bab3b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 14:25:27 -0400 Subject: [PATCH 141/410] create task information --- src/stretch/agent/operations/grasp_object.py | 5 ++++- src/stretch/agent/task/pickup/pickup_task.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 09322c388..ae388441a 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -61,12 +61,15 @@ class GraspObjectOperation(ManagedOperation): gripper_aruco_detector: GripperArucoDetector = None min_points_to_approach: int = 100 detected_center_offset_y: int = -40 - median_distance_when_grasping: float = 0.175 percentage_of_image_when_grasping: float = 0.2 open_loop_z_offset: float = -0.1 open_loop_x_offset: float = -0.1 max_failed_attempts: int = 10 + # This is the distance at which we close the gripper when visual servoing + # median_distance_when_grasping: float = 0.175 + median_distance_when_grasping: float = 0.15 + # Movement parameters lift_arm_ratio: float = 0.08 base_x_step: float = 0.10 diff --git a/src/stretch/agent/task/pickup/pickup_task.py b/src/stretch/agent/task/pickup/pickup_task.py index f520a8d21..657b399b0 100644 --- a/src/stretch/agent/task/pickup/pickup_task.py +++ b/src/stretch/agent/task/pickup/pickup_task.py @@ -41,6 +41,8 @@ def __init__( # Task information self.agent.target_object = target_object self.agent.target_receptacle = target_receptacle + self.target_object = target_object + self.target_receptacle = target_receptacle self.use_visual_servoing_for_grasp = use_visual_servoing_for_grasp assert matching in ["feature", "class"], f"Invalid instance matching method: {matching}" From 8c858b66251b20689540e6f53832d976652e73f0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 15:09:19 -0400 Subject: [PATCH 142/410] update grasp code etc --- src/stretch/agent/operations/grasp_object.py | 42 ++++++++++++++----- .../agent/operations/search_for_object.py | 14 ++++--- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index ae388441a..50e90036e 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -23,6 +23,7 @@ import numpy as np import stretch.motion.constants as constants +import stretch.utils.logger as logger from stretch.agent.base import ManagedOperation from stretch.core.interfaces import Observations from stretch.mapping.instance import Instance @@ -65,6 +66,7 @@ class GraspObjectOperation(ManagedOperation): open_loop_z_offset: float = -0.1 open_loop_x_offset: float = -0.1 max_failed_attempts: int = 10 + max_random_motions: int = 10 # This is the distance at which we close the gripper when visual servoing # median_distance_when_grasping: float = 0.175 @@ -265,7 +267,7 @@ def _grasp(self) -> bool: return True def visual_servo_to_object( - self, instance: Instance, max_duration: float = 120.0, max_not_moving_count: int = 10 + self, instance: Instance, max_duration: float = 120.0, max_not_moving_count: int = 50 ) -> bool: """Use visual servoing to grasp the object.""" @@ -289,6 +291,7 @@ def 
visual_servo_to_object( failed_counter = 0 not_moving_count = 0 q_last = np.array([0.0 for _ in range(11)]) # 11 DOF, HelloStretchIdx + random_motion_counter = 0 # Main loop - run unless we time out, blocking. while timeit.default_timer() - t0 < max_duration: @@ -427,6 +430,11 @@ def visual_servo_to_object( # Since we were able to detect it, copy over the target mask prev_target_mask = target_mask + # Are we aligned to the object + aligned = ( + np.abs(dx) < self.align_x_threshold and np.abs(dy) < self.align_y_threshold + ) + print() print("----- STEP VISUAL SERVOING -----") print("Observed this many target mask points:", np.sum(target_mask.flatten())) @@ -440,21 +448,34 @@ def visual_servo_to_object( print(f"Center distance to object is {center_depth}.") print("Center in mask?", center_in_mask) print("Current XYZ:", current_xyz) - - aligned = ( - np.abs(dx) < self.align_x_threshold and np.abs(dy) < self.align_y_threshold - ) + print("Aligned?", aligned) # Fix lift to only go down lift = min(lift, prev_lift) - if aligned: + if not center_in_mask: + # If the center of the image is not in the mask, we are not aligned + # We move base, or wrist at random + logger.warning("Center not in mask; moving randomly.") + r = np.random.rand() + if r < 0.25: + base_x += -self.base_x_step / 2 + elif r < 0.5: + base_x += self.base_x_step / 2 + elif r < 0.75: + wrist_pitch += -self.wrist_pitch_step / 2 + else: + wrist_pitch += self.wrist_pitch_step / 2 + random_motion_counter += 1 + input("---") + elif aligned: # First, check to see if we are close enough to grasp if center_depth < self.median_distance_when_grasping: print( f"Center depth of {center_depth} is close enough to grasp; less than {self.median_distance_when_grasping}." ) self.info("Aligned and close enough to grasp.") + breakpoint() success = self._grasp() break # If we are aligned, step the whole thing closer by some amount @@ -531,6 +552,10 @@ def visual_servo_to_object( q_last = q + if random_motion_counter > self.max_random_motions: + self.error("Failed to align to object after 10 random motions.") + break + if self.show_servo_gui: cv2.destroyAllWindows() return success @@ -576,9 +601,6 @@ def run(self) -> None: # If we try to servo, then do this self._success = self.visual_servo_to_object(self.agent.current_object) - if not self._success: - self.grasp_open_loop(object_xyz) - # clear observations if self.reset_observation: self.observations.clear_history() @@ -587,7 +609,7 @@ def run(self) -> None: env_id=0, global_instance_id=self.agent.current_object.global_id ) - def pregrasp_open_loop(self, object_xyz: np.ndarray, distance_from_object: float = 0.1): + def pregrasp_open_loop(self, object_xyz: np.ndarray, distance_from_object: float = 0.2): xyt = self.robot.get_base_pose() relative_object_xyz = point_global_to_base(object_xyz, xyt) diff --git a/src/stretch/agent/operations/search_for_object.py b/src/stretch/agent/operations/search_for_object.py index 51f53f2ff..6d94e23d6 100644 --- a/src/stretch/agent/operations/search_for_object.py +++ b/src/stretch/agent/operations/search_for_object.py @@ -178,6 +178,7 @@ class SearchForObjectOnFloorOperation(ManagedSearchOperation): """Search for an object on the floor""" plan_for_manipulation: bool = True + update_at_start: bool = False def can_start(self) -> bool: self.attempt("If receptacle is found, we can start searching for objects.") @@ -194,11 +195,14 @@ def run(self) -> None: # Clear the current object self.agent.current_object = None - # Update world map - # Switch to navigation posture - 
self.robot.move_to_nav_posture() - # Do not update until you are in nav posture - self.update() + if self.update_at_start: + # Update world map + # Switch to navigation posture + self.robot.move_to_nav_posture() + + # Do not update until you are in nav posture + + self.update() if self.show_map_so_far: # This shows us what the robot has found so far From 3e1ef128d4f59b412632d114c7b402d76dd46b61 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 15:16:54 -0400 Subject: [PATCH 143/410] updates --- src/stretch/agent/operations/grasp_object.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 50e90036e..652a4bd54 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -75,7 +75,8 @@ class GraspObjectOperation(ManagedOperation): # Movement parameters lift_arm_ratio: float = 0.08 base_x_step: float = 0.10 - wrist_pitch_step: float = 0.075 + # wrist_pitch_step: float = 0.075 + wrist_pitch_step: float = 0.05 # Timing issues expected_network_delay = 0.4 @@ -254,6 +255,21 @@ def get_target_mask( def _grasp(self) -> bool: """Helper function to close gripper around object.""" self.cheer("Grasping object!") + + if not self.open_loop: + joint_state = self.robot.get_joint_positions() + # Now compute what to do + base_x = joint_state[HelloStretchIdx.BASE_X] + wrist_pitch = joint_state[HelloStretchIdx.WRIST_PITCH] + arm = joint_state[HelloStretchIdx.ARM] + lift = joint_state[HelloStretchIdx.LIFT] + + self.robot.arm_to( + [base_x, lift - 0.03, arm + 0.05, 0, wrist_pitch, 0], + head=constants.look_at_ee, + blocking=False, + ) + self.robot.close_gripper(loose=self.grasp_loose, blocking=True) time.sleep(0.5) From 262bda9bfdf5bcf7ba020f1616e5cda7e2544775 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 15:25:08 -0400 Subject: [PATCH 144/410] Update grasp code --- src/stretch/agent/operations/grasp_object.py | 43 +++++++++----------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 652a4bd54..40872f29f 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -23,7 +23,6 @@ import numpy as np import stretch.motion.constants as constants -import stretch.utils.logger as logger from stretch.agent.base import ManagedOperation from stretch.core.interfaces import Observations from stretch.mapping.instance import Instance @@ -67,6 +66,8 @@ class GraspObjectOperation(ManagedOperation): open_loop_x_offset: float = -0.1 max_failed_attempts: int = 10 max_random_motions: int = 10 + # pregrasp_distance_from_object: float = 0.075 + pregrasp_distance_from_object: float = 0.1 # This is the distance at which we close the gripper when visual servoing # median_distance_when_grasping: float = 0.175 @@ -75,8 +76,9 @@ class GraspObjectOperation(ManagedOperation): # Movement parameters lift_arm_ratio: float = 0.08 base_x_step: float = 0.10 - # wrist_pitch_step: float = 0.075 - wrist_pitch_step: float = 0.05 + # wrist_pitch_step: float = 0.075 # Maybe too fast + wrist_pitch_step: float = 0.06 + # wrist_pitch_step: float = 0.05 # Too slow # Timing issues expected_network_delay = 0.4 @@ -264,11 +266,13 @@ def _grasp(self) -> bool: arm = joint_state[HelloStretchIdx.ARM] lift = joint_state[HelloStretchIdx.LIFT] + # Move the arm in closer self.robot.arm_to( - 
[base_x, lift - 0.03, arm + 0.05, 0, wrist_pitch, 0], + [base_x, lift - 0.05, arm + 0.1, 0, wrist_pitch, 0], head=constants.look_at_ee, - blocking=False, + blocking=True, ) + time.sleep(0.5) self.robot.close_gripper(loose=self.grasp_loose, blocking=True) time.sleep(0.5) @@ -418,7 +422,9 @@ def visual_servo_to_object( break if not pregrasp_done and current_xyz is not None: - self.pregrasp_open_loop(current_xyz, distance_from_object=0.075) + self.pregrasp_open_loop( + current_xyz, distance_from_object=self.pregrasp_distance_from_object + ) pregrasp_done = True else: # check not moving threshold @@ -469,22 +475,7 @@ def visual_servo_to_object( # Fix lift to only go down lift = min(lift, prev_lift) - if not center_in_mask: - # If the center of the image is not in the mask, we are not aligned - # We move base, or wrist at random - logger.warning("Center not in mask; moving randomly.") - r = np.random.rand() - if r < 0.25: - base_x += -self.base_x_step / 2 - elif r < 0.5: - base_x += self.base_x_step / 2 - elif r < 0.75: - wrist_pitch += -self.wrist_pitch_step / 2 - else: - wrist_pitch += self.wrist_pitch_step / 2 - random_motion_counter += 1 - input("---") - elif aligned: + if aligned: # First, check to see if we are close enough to grasp if center_depth < self.median_distance_when_grasping: print( @@ -625,7 +616,13 @@ def run(self) -> None: env_id=0, global_instance_id=self.agent.current_object.global_id ) - def pregrasp_open_loop(self, object_xyz: np.ndarray, distance_from_object: float = 0.2): + def pregrasp_open_loop(self, object_xyz: np.ndarray, distance_from_object: float = 0.25): + """Move to a pregrasp position in an open loop manner. + + Args: + object_xyz (np.ndarray): Location to grasp + distance_from_object (float, optional): Distance from object. Defaults to 0.2. + """ xyt = self.robot.get_base_pose() relative_object_xyz = point_global_to_base(object_xyz, xyt) From 5c870db561537451113dfd7767fe889d9c29181d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 15:40:16 -0400 Subject: [PATCH 145/410] remove a breakpoint --- src/stretch/agent/operations/grasp_object.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 40872f29f..cef1e0843 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -482,7 +482,6 @@ def visual_servo_to_object( f"Center depth of {center_depth} is close enough to grasp; less than {self.median_distance_when_grasping}." ) self.info("Aligned and close enough to grasp.") - breakpoint() success = self._grasp() break # If we are aligned, step the whole thing closer by some amount From f986b71a215b7352f99283b2745cb9ee9cbe3eab Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 15:46:34 -0400 Subject: [PATCH 146/410] Update grasping --- src/stretch/agent/operations/grasp_object.py | 39 +++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index cef1e0843..989a4c14a 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -50,22 +50,17 @@ class GraspObjectOperation(ManagedOperation): show_servo_gui: bool = True show_point_cloud: bool = False + # ------------------------ + # These are the most important parameters for tuning to make the grasping "feel" nice + # This is almost certainly worth tuning a bit. 
+ # Thresholds for centering on object - align_x_threshold: int = 10 - align_y_threshold: int = 7 - grasp_loose: bool = False - reset_observation: bool = False + # align_x_threshold: int = 10 + # align_y_threshold: int = 7 + # These are the values used to decide when it's aligned enough to grasp + align_x_threshold: int = 14 + align_y_threshold: int = 10 - # Visual servoing config - track_image_center: bool = False - gripper_aruco_detector: GripperArucoDetector = None - min_points_to_approach: int = 100 - detected_center_offset_y: int = -40 - percentage_of_image_when_grasping: float = 0.2 - open_loop_z_offset: float = -0.1 - open_loop_x_offset: float = -0.1 - max_failed_attempts: int = 10 - max_random_motions: int = 10 # pregrasp_distance_from_object: float = 0.075 pregrasp_distance_from_object: float = 0.1 @@ -79,6 +74,22 @@ class GraspObjectOperation(ManagedOperation): # wrist_pitch_step: float = 0.075 # Maybe too fast wrist_pitch_step: float = 0.06 # wrist_pitch_step: float = 0.05 # Too slow + # ------------------------ + + # Parameters about how to grasp - less important + grasp_loose: bool = False + reset_observation: bool = False + + # Visual servoing config + track_image_center: bool = False + gripper_aruco_detector: GripperArucoDetector = None + min_points_to_approach: int = 100 + detected_center_offset_y: int = -40 + percentage_of_image_when_grasping: float = 0.2 + open_loop_z_offset: float = -0.1 + open_loop_x_offset: float = -0.1 + max_failed_attempts: int = 10 + max_random_motions: int = 10 # Timing issues expected_network_delay = 0.4 From e9661a1d6a1d5c0988a8cb2a163af8e634e8d237 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 15:51:27 -0400 Subject: [PATCH 147/410] update grasping operations --- src/stretch/agent/operations/grasp_object.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 989a4c14a..0898b24be 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -58,8 +58,8 @@ class GraspObjectOperation(ManagedOperation): # align_x_threshold: int = 10 # align_y_threshold: int = 7 # These are the values used to decide when it's aligned enough to grasp - align_x_threshold: int = 14 - align_y_threshold: int = 10 + align_x_threshold: int = 15 + align_y_threshold: int = 15 # pregrasp_distance_from_object: float = 0.075 pregrasp_distance_from_object: float = 0.1 @@ -71,8 +71,8 @@ class GraspObjectOperation(ManagedOperation): # Movement parameters lift_arm_ratio: float = 0.08 base_x_step: float = 0.10 - # wrist_pitch_step: float = 0.075 # Maybe too fast - wrist_pitch_step: float = 0.06 + wrist_pitch_step: float = 0.075 # Maybe too fast + # wrist_pitch_step: float = 0.06 # wrist_pitch_step: float = 0.05 # Too slow # ------------------------ From 902a12f1fb06a4c1a7415bd72537a4f600742299 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 15:54:56 -0400 Subject: [PATCH 148/410] update pickup --- src/stretch/app/{pickup_all.py => ai_pickup.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/stretch/app/{pickup_all.py => ai_pickup.py} (100%) diff --git a/src/stretch/app/pickup_all.py b/src/stretch/app/ai_pickup.py similarity index 100% rename from src/stretch/app/pickup_all.py rename to src/stretch/app/ai_pickup.py From bfbb5294bc73241de2853293fdef7a88d3df3ee2 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 16:02:17 -0400 Subject: [PATCH 
149/410] update prompt --- src/stretch/app/ai_pickup.py | 7 ++++ src/stretch/llms/prompts/pickup_prompt.py | 44 +++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 src/stretch/llms/prompts/pickup_prompt.py diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 316a9892a..0b71f1d68 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -73,6 +73,11 @@ default="", help="Name of the receptacle to place the object in", ) +@click.option( + "--use_llm", + is_flag=True, + help="Set to use the language model", +) def main( robot_ip: str = "192.168.1.15", local: bool = False, @@ -86,6 +91,7 @@ def main( mode: str = "one_shot", match_method: str = "feature", llm: str = "gemma", + use_llm: bool = False, ): """Set up the robot, create a task plan, and execute it.""" # Create robot @@ -147,6 +153,7 @@ def main( robot.stop() raise e + # Execute the task task.run() if reset: diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py new file mode 100644 index 000000000..5f812bded --- /dev/null +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -0,0 +1,44 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +from stretch.llms.base import AbstractPromptBuilder + +simple_stretch_prompt = """You are a friendly, helpful robot named Stretch. You are always helpful, and answer questions concisely. You can do the following tasks. You will think step by step when required. You will answer questions very concisely. + +Restrictions: + - You will never harm a person or suggest harm + - You cannot go up or down stairs + +When prompted, you will respond in the form: +pickup(object_name) +place(location_name) +say(text) + +For example: +input: "Put the red apple in the cardboard box" + +returns: +say("I will put the red apple in the cardboard box") +pickup(red apple) +place(cardboard box) + +You will never say anything other than pickup(), place(), and say(). Remember to be friendly, helpful, and concise. +""" + + +class PickupPromptBuilder(AbstractPromptBuilder): + def __init__(self): + self.prompt_str = simple_stretch_prompt + + def __str__(self): + return self.prompt_str + + def configure(self, **kwargs) -> str: + assert len(kwargs) == 0, "SimplePromptBuilder does not take any arguments." 
+ return self.prompt_str From 0fa80c411ce8d622d87bd53212ec64468673d726 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 16:28:26 -0400 Subject: [PATCH 150/410] llm configs --- src/stretch/app/ai_pickup.py | 6 +++--- src/stretch/app/voice_chat.py | 17 ++++++----------- src/stretch/llms/__init__.py | 6 +++--- src/stretch/llms/prompts/__init__.py | 1 + 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 0b71f1d68..c13c2f29a 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -18,6 +18,7 @@ from stretch.core import get_parameters from stretch.llms import get_llm_choices, get_llm_client from stretch.perception import create_semantic_sensor +from stretch.utils.prompt import PickupPromptBuilder @click.command() @@ -116,12 +117,11 @@ def main( if reset: agent.move_closed_loop([0, 0, 0], max_time=60.0) - # TODO: Add a prompt for the user to enter the target object and receptacle - prompt = None + prompt = PickupPromptBuilder() # Get the LLM client llm_client = None - if prompt is not None: + if use_llm: llm_client = get_llm_client(llm, prompt=prompt) # Parse things and listen to the user diff --git a/src/stretch/app/voice_chat.py b/src/stretch/app/voice_chat.py index f84de0338..1baca072e 100644 --- a/src/stretch/app/voice_chat.py +++ b/src/stretch/app/voice_chat.py @@ -16,16 +16,16 @@ from stretch.audio import AudioRecorder from stretch.audio.speech_to_text import WhisperSpeechToText -from stretch.llms import Gemma2bClient, LlamaClient +from stretch.llms import get_llm_choices, get_llm_client from stretch.llms.prompts.simple_prompt import SimpleStretchPromptBuilder @click.command() @click.option( - "--model", - default="gemma", + "--llm", + default="gemma2b", help="The model to use (gemma or llama)", - type=click.Choice(["gemma", "llama"]), + type=click.Choice(get_llm_choices()), ) @click.option( "--max_audio_duration", default=10.0, help="The maximum duration of the audio recording" @@ -33,17 +33,12 @@ @click.option( "--silence_limit", default=2.0, help="The amount of silence before stopping the recording" ) -def main(model="gemma", max_audio_duration: float = 10.0, silence_limit: float = 2.0): +def main(llm="gemma2b", max_audio_duration: float = 10.0, silence_limit: float = 2.0): # Load the tokenizer and model audio_recorder = AudioRecorder() whisper = WhisperSpeechToText() prompt = SimpleStretchPromptBuilder() - if model == "gemma": - client = Gemma2bClient(prompt) - elif model == "llama": - client = LlamaClient(prompt) - else: - raise ValueError(f"Invalid model: {model}") + client = get_llm_client(llm, prompt) print("Talk to me, Stretch! If you don't say anything, I will give up.") for i in range(50): diff --git a/src/stretch/llms/__init__.py b/src/stretch/llms/__init__.py index 2d35a6a0a..f5682a391 100644 --- a/src/stretch/llms/__init__.py +++ b/src/stretch/llms/__init__.py @@ -50,10 +50,10 @@ def get_llm_client( An LLM client. 
""" if client_type == "gemma2b": - return Gemma2bClient(**kwargs) + return Gemma2bClient(prompt, **kwargs) elif client_type == "llama": - return LlamaClient(**kwargs) + return LlamaClient(prompt, **kwargs) elif client_type == "openai": - return OpenaiClient(**kwargs) + return OpenaiClient(prompt, **kwargs) else: raise ValueError(f"Invalid client type: {client_type}") diff --git a/src/stretch/llms/prompts/__init__.py b/src/stretch/llms/prompts/__init__.py index e0cd3dc55..1fd5cba3d 100644 --- a/src/stretch/llms/prompts/__init__.py +++ b/src/stretch/llms/prompts/__init__.py @@ -9,4 +9,5 @@ from .object_manip_nav_prompt import ObjectManipNavPromptBuilder from .ok_robot_prompt import OkRobotPromptBuilder +from .pickup_prompt import PickupPromptBuilder from .simple_prompt import SimpleStretchPromptBuilder From b22fa8505e48e00484d6520785e10d8a51947a7b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 16:49:19 -0400 Subject: [PATCH 151/410] updates to llm chat agents --- src/stretch/app/chat.py | 97 +++++++++++++++++++++++ src/stretch/llms/__init__.py | 32 ++++++++ src/stretch/llms/base.py | 4 + src/stretch/llms/prompts/pickup_prompt.py | 52 ++++++++++-- 4 files changed, 179 insertions(+), 6 deletions(-) create mode 100644 src/stretch/app/chat.py diff --git a/src/stretch/app/chat.py b/src/stretch/app/chat.py new file mode 100644 index 000000000..4b560c30a --- /dev/null +++ b/src/stretch/app/chat.py @@ -0,0 +1,97 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import os +import tempfile +import timeit + +import click +from termcolor import colored + +from stretch.audio import AudioRecorder +from stretch.audio.speech_to_text import WhisperSpeechToText +from stretch.llms import get_llm_choices, get_llm_client, get_prompt_builder, get_prompt_choices + + +@click.command() +@click.option( + "--llm", + default="gemma2b", + help="The model to use (gemma or llama)", + type=click.Choice(get_llm_choices()), +) +@click.option( + "--prompt", default="simple", help="The prompt to use", type=click.Choice(get_prompt_choices()) +) +@click.option( + "--max_audio_duration", default=10.0, help="The maximum duration of the audio recording" +) +@click.option( + "--silence_limit", default=2.0, help="The amount of silence before stopping the recording" +) +@click.option("--voice", default=False, help="Enable voice chat", is_flag=True) +def main( + llm="gemma2b", + max_audio_duration: float = 10.0, + silence_limit: float = 2.0, + voice=False, + prompt="simple", +): + prompt = get_prompt_builder(prompt) + client = get_llm_client(llm, prompt) + + if voice: + # Load the tokenizer and model + audio_recorder = AudioRecorder() + whisper = WhisperSpeechToText() + else: + audio_recorder = None + whisper = None + + if voice: + print("Talk to me, Stretch! 
If you don't say anything, I will give up.") + for i in range(50): + if voice: + # Record audio + input(colored("Press enter to speak or ctrl+c to exit.", "yellow")) + + # Create a temporary file + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file: + temp_filename = temp_audio_file.name + audio_recorder.record( + temp_filename, duration=max_audio_duration, silence_limit=silence_limit + ) + + # Transcribe the audio file + input_text = whisper.transcribe_file(temp_filename) + + # Remove the temporary file + os.remove(temp_filename) + + print(colored("I heard:", "green"), input_text) + else: + input_text = input(colored("You: ", "green")) + + if len(input_text) == 0: + break + + # Get the response and time it + t0 = timeit.default_timer() + assistant_response = client(input_text) + t1 = timeit.default_timer() + + # Decode and print the result + print(colored("Response:", "blue"), prompt.parse_response(assistant_response)) + print("-" * 80) + print("Time taken:", t1 - t0) + print("-" * 80) + + +if __name__ == "__main__": + main() diff --git a/src/stretch/llms/__init__.py b/src/stretch/llms/__init__.py index f5682a391..1313a1b64 100644 --- a/src/stretch/llms/__init__.py +++ b/src/stretch/llms/__init__.py @@ -13,6 +13,8 @@ from .llama_client import LlamaClient from .openai_client import OpenaiClient from .prompts.object_manip_nav_prompt import ObjectManipNavPromptBuilder +from .prompts.ok_robot_prompt import OkRobotPromptBuilder +from .prompts.pickup_prompt import PickupPromptBuilder from .prompts.simple_prompt import SimpleStretchPromptBuilder # This is a list of all the modules that are imported when you use the import * syntax. @@ -23,6 +25,10 @@ "OpenaiClient", "ObjectManipNavPromptBuilder", "SimpleStretchPromptBuilder", + "OkRobotPromptBuilder", + "PickupPromptBuilder", + "AbstractLLMClient", + "AbstractPromptBuilder", ] llms = { @@ -31,6 +37,32 @@ "openai": OpenaiClient, } +prompts = { + "simple": SimpleStretchPromptBuilder, + "object_manip_nav": ObjectManipNavPromptBuilder, + "ok_robot": OkRobotPromptBuilder, + "pickup": PickupPromptBuilder, +} + + +def get_prompt_builder(prompt_type: str) -> AbstractPromptBuilder: + """Return a prompt builder of the specified type. + + Args: + prompt_type: The type of prompt builder to create. + + Returns: + A prompt builder. + """ + if prompt_type not in prompts: + raise ValueError(f"Invalid prompt type: {prompt_type}") + return prompts[prompt_type]() + + +def get_prompt_choices(): + """Return a list of available prompt builders.""" + return prompts.keys() + def get_llm_choices(): """Return a list of available LLM clients.""" diff --git a/src/stretch/llms/base.py b/src/stretch/llms/base.py index 127acf12b..5c93c2da4 100644 --- a/src/stretch/llms/base.py +++ b/src/stretch/llms/base.py @@ -32,6 +32,10 @@ def __call__(self, kwargs: Optional[Dict[str, Any]] = None) -> str: self.prompt_str = self.configure(**kwargs) return self.prompt_str + def parse_response(self, response: str) -> Any: + """Parse the response from the LLM. 
Usually does nothing.""" + return response + class AbstractLLMClient(ABC): """Abstract base class for a client that interacts with a language model.""" diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 5f812bded..9a6303231 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -15,20 +15,37 @@ - You will never harm a person or suggest harm - You cannot go up or down stairs -When prompted, you will respond in the form: -pickup(object_name) -place(location_name) -say(text) +When prompted, you will respond using the three actions: +- pickup(object_name) +- place(location_name) +- say(text) + +These are the only three things you will return, and they are your only way to interact with the world. For example: input: "Put the red apple in the cardboard box" returns: -say("I will put the red apple in the cardboard box") +say("I am picking up the red apple in the cardboard box") pickup(red apple) place(cardboard box) -You will never say anything other than pickup(), place(), and say(). Remember to be friendly, helpful, and concise. +You will never say anything other than pickup(), place(), and say(). Remember to be friendly, helpful, and concise. You will always explain what you are going to do before you do it. If you cannot clearly understand what the human wants, you will say so instead of providing pickup() or place(). + +You will be polite when using the say() function. (e.g., "please", "thank you") and use complete sentences. You can answer simple commonsense questions or respond. + +If you do not understand how to do something using these two actions, say you do not know. Do not hallucinate. + +For example: +input: "Thank you!" +returns: +say("You're welcome!") + +input: "What is your name?" +returns: +say("My name is Stretch.") + +input: """ @@ -42,3 +59,26 @@ def __str__(self): def configure(self, **kwargs) -> str: assert len(kwargs) == 0, "SimplePromptBuilder does not take any arguments." return self.prompt_str + + def parse_response(self, response: str): + """Parse the pickup, place, and say commands from the response into a list.""" + commands = [] + for line in response.split("\n"): + if line.startswith("pickup("): + commands.append(line) + elif line.startswith("place("): + commands.append(line) + elif line.startswith("say("): + commands.append(line) + + # Now go through commands and parse into a tuple (command, args) + parsed_commands = [] + for command in commands: + if command.startswith("say("): + parsed_commands.append(("say", command[4:-1])) + elif command.startswith("pickup("): + parsed_commands.append(("pickup", command[7:-1])) + elif command.startswith("place("): + parsed_commands.append(("place", command[6:-1])) + + return parsed_commands From f5e8b9433c542c02a35e695a0edc37aa3afb802f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 13 Sep 2024 17:14:15 -0400 Subject: [PATCH 152/410] update information --- src/stretch/llms/prompts/pickup_prompt.py | 42 +++++++++++++++++------ 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 9a6303231..f0f5ede12 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -9,7 +9,7 @@ from stretch.llms.base import AbstractPromptBuilder -simple_stretch_prompt = """You are a friendly, helpful robot named Stretch. You are always helpful, and answer questions concisely. 
You can do the following tasks. You will think step by step when required. You will answer questions very concisely. +simple_stretch_prompt = """You are a friendly, helpful robot named Stretch. You are always helpful, and answer questions concisely. You will answer questions very concisely. Restrictions: - You will never harm a person or suggest harm @@ -20,30 +20,52 @@ - place(location_name) - say(text) -These are the only three things you will return, and they are your only way to interact with the world. +These are the only three things you will return, and they are your only way to interact with the world. For example: -For example: input: "Put the red apple in the cardboard box" - -returns: +output: say("I am picking up the red apple in the cardboard box") pickup(red apple) place(cardboard box) +end() -You will never say anything other than pickup(), place(), and say(). Remember to be friendly, helpful, and concise. You will always explain what you are going to do before you do it. If you cannot clearly understand what the human wants, you will say so instead of providing pickup() or place(). -You will be polite when using the say() function. (e.g., "please", "thank you") and use complete sentences. You can answer simple commonsense questions or respond. +You will never say anything other than pickup(), place(), and say(). Remember to be friendly, helpful, and concise. You will always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). -If you do not understand how to do something using these two actions, say you do not know. Do not hallucinate. +You will be polite when using the say() function. (e.g., "please", "thank you") and use complete sentences. You can answer simple commonsense questions or respond. If you do not understand how to do something using these three actions, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. For example: + +input: "can you put the shoe away?" +output: +say("Where should I put the shoe?") +end() + +input: "Can you put the shoe in the closet?" +output: +say("I am picking up the shoe and putting it in the closet.") +pickup(shoe) +place(closet) +end() + +input: "Put the pen in the pencil holder" +output: +say("I am picking up the pen and putting it in the pencil holder.") +pickup(pen) +place(pencil holder) +end() + input: "Thank you!" -returns: +output: say("You're welcome!") +end() input: "What is your name?" -returns: +output: say("My name is Stretch.") +end() + +Only use each function once per input. Do not hallucinate. input: """ From 5c110f9be85ffbdbffda337c8703bb87af1253ba Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sat, 14 Sep 2024 14:25:40 -0700 Subject: [PATCH 153/410] Fix rerun scene graph update --- MUJOCO_LOG.TXT | 3 +++ src/stretch/agent/robot_agent.py | 2 ++ src/stretch/agent/zmq_client.py | 2 +- src/stretch/visualization/rerun.py | 4 ++-- 4 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 MUJOCO_LOG.TXT diff --git a/MUJOCO_LOG.TXT b/MUJOCO_LOG.TXT new file mode 100644 index 000000000..f47575042 --- /dev/null +++ b/MUJOCO_LOG.TXT @@ -0,0 +1,3 @@ +Mon Aug 26 20:04:52 2024 +WARNING: Nan, Inf or huge value in QACC at DOF 0. The simulation is unstable. Time = 3.2700. 
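The pickup()/place()/say() grammar spelled out in the prompt above is what PickupPromptBuilder.parse_response (added two patches earlier) turns into (command, argument) tuples. A small usage sketch, assuming the stretch.llms package from this series is importable:

    from stretch.llms import PickupPromptBuilder

    prompt = PickupPromptBuilder()
    response = (
        'say("I am picking up the pen and putting it in the pencil holder.")\n'
        "pickup(pen)\n"
        "place(pencil holder)\n"
        "end()"
    )
    # parse_response keeps lines starting with say(/pickup(/place( and drops end();
    # the say() argument is returned with its quotes still attached.
    for command, args in prompt.parse_response(response):
        print(command, "->", args)
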
+ diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 88617ec9d..00e679522 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -517,6 +517,8 @@ def update_rerun(self): """Update the rerun server with the latest observations.""" if self.robot._rerun: self.robot._rerun.update_voxel_map(self.space) + self.robot._rerun.update_scene_graph(self.scene_graph, self.semantic_sensor) + else: logger.error("No rerun server available!") diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index f5ec1bc25..f534ba870 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -86,7 +86,7 @@ def __init__( ee_link_name: Optional[str] = None, manip_mode_controlled_joints: Optional[List[str]] = None, start_immediately: bool = True, - enable_rerun_server: bool = False, + enable_rerun_server: bool = True, ): """ Create a client to communicate with the robot over ZMQ. diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index a243ed7c8..bc166c3ba 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -100,7 +100,7 @@ def occupancy_map_to_3d_points( class RerunVsualizer: def __init__(self): rr.init("Stretch_robot", spawn=False) - rr.serve(open_browser=False, server_memory_limit="1GB") + rr.serve(open_browser=True, server_memory_limit="2GB") # Create environment Box place holder rr.log( @@ -309,7 +309,7 @@ def update_scene_graph( bounds.append(half_sizes) pose = scene_graph.get_ins_center_pos(idx) confidence = best_view.score - centers.append(pose) + centers.append(rr.components.PoseTranslation3D(pose)) labels.append(f"{name} {confidence:.2f}") colors.append(self.bbox_colors_memory[name]) rr.log( From c6870333f3e3ebd216e0f2183ad4983d927f9ea3 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sat, 14 Sep 2024 14:58:11 -0700 Subject: [PATCH 154/410] rerun reduce clear map blobs radii --- src/stretch/visualization/rerun.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index bc166c3ba..110556510 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -215,7 +215,7 @@ def update_voxel_map( self, space: SparseVoxelMapNavigationSpace, debug: bool = False, - explored_radius=0.04, + explored_radius=0.01, obstacle_radius=0.05, ): """Log voxel map and send it to Rerun visualizer @@ -293,6 +293,7 @@ def update_scene_graph( bounds = [] colors = [] + t0 = timeit.default_timer() for idx, instance in enumerate(scene_graph.instances): name = semantic_sensor.get_class_name_for_id(instance.category_id) if name not in self.bbox_colors_memory: @@ -322,6 +323,8 @@ def update_scene_graph( colors=colors, ), ) + t1 = timeit.default_timer() + print("Time to log scene graph objects: ", t1 - t0) def step(self, obs, servo): """Log all the data""" From 84eaec4e9fb152cdd8a8049792a26637809f223a Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sat, 14 Sep 2024 16:33:24 -0700 Subject: [PATCH 155/410] rerun log xyt nav goals --- src/stretch/agent/zmq_client.py | 2 ++ src/stretch/visualization/rerun.py | 20 ++++++++++++++++++++ src/stretch/visualization/rr_robot.py | 10 ++++++++++ 3 files changed, 32 insertions(+) create mode 100644 src/stretch/visualization/rr_robot.py diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index f534ba870..7b57225bd 100644 --- a/src/stretch/agent/zmq_client.py +++ 
b/src/stretch/agent/zmq_client.py @@ -466,6 +466,8 @@ def navigate_to( xyt = xyt.xyt assert len(xyt) == 3, "xyt must be a vector of size 3" next_action = {"xyt": xyt, "nav_relative": relative, "nav_blocking": blocking} + if self._rerun: + self._rerun.update_nav_goal(xyt) self.send_action(next_action, timeout=timeout, verbose=verbose) def reset(self): diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 110556510..24bd9cf85 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -326,6 +326,26 @@ def update_scene_graph( t1 = timeit.default_timer() print("Time to log scene graph objects: ", t1 - t0) + def update_nav_goal(self, goal, timeout=10): + """Log navigation goal + Args: + goal (np.ndarray): Goal coordinates + """ + ts = time.time() + rr.set_time_seconds("realtime", ts) + rr.log("world/xyt_goal/blob", rr.Points3D([0, 0, 0], colors=[0, 255, 0, 50], radii=0.1)) + rr.log( + "world/xyt_goal", + rr.Transform3D( + translation=[goal[0], goal[1], 0], + rotation=rr.RotationAxisAngle(axis=[0, 0, 1], radians=goal[2]), + axis_length=0.5, + ), + ) + rr.set_time_seconds("realtime", ts + timeout) + rr.log("world/xyt_goal", rr.Clear(recursive=True)) + rr.set_time_seconds("realtime", ts) + def step(self, obs, servo): """Log all the data""" if obs and servo: diff --git a/src/stretch/visualization/rr_robot.py b/src/stretch/visualization/rr_robot.py new file mode 100644 index 000000000..7007ca043 --- /dev/null +++ b/src/stretch/visualization/rr_robot.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 + +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. From 1e8c018eb0ca9902ed58af0ef75b3951ea45268d Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sat, 14 Sep 2024 19:08:15 -0700 Subject: [PATCH 156/410] Init rr_robot file --- src/stretch/visualization/rr_robot.py | 53 +++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/stretch/visualization/rr_robot.py b/src/stretch/visualization/rr_robot.py index 7007ca043..0a47e208b 100644 --- a/src/stretch/visualization/rr_robot.py +++ b/src/stretch/visualization/rr_robot.py @@ -8,3 +8,56 @@ # # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. 
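update_nav_goal above lifts an SE(2) goal (x, y, theta) into a 3D transform and pre-schedules a Clear so the marker disappears after a timeout. The same pattern as a standalone sketch; the recording name and entity path here are illustrative:

    import time
    import numpy as np
    import rerun as rr

    def log_xyt_goal(xyt, timeout: float = 10.0) -> None:
        """Show a planar (x, y, theta) goal marker for `timeout` seconds, then clear it."""
        ts = time.time()
        rr.set_time_seconds("realtime", ts)
        rr.log(
            "world/xyt_goal",
            rr.Transform3D(
                translation=[xyt[0], xyt[1], 0.0],
                rotation=rr.RotationAxisAngle(axis=[0, 0, 1], radians=xyt[2]),
                axis_length=0.5,
            ),
        )
        # Logging a Clear at a future timestamp makes the marker vanish once playback
        # reaches that time, without needing a separate cleanup thread.
        rr.set_time_seconds("realtime", ts + timeout)
        rr.log("world/xyt_goal", rr.Clear(recursive=True))
        rr.set_time_seconds("realtime", ts)

    rr.init("xyt_goal_demo", spawn=True)  # hypothetical recording name
    log_xyt_goal(np.array([1.0, 0.5, np.pi / 2]))
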
+ +import importlib.resources as importlib_resources + +import numpy as np +import urchin as urdf_loader + +pkg_path = str(importlib_resources.files("stretch_urdf")) +model_name = "SE3" # RE1V0, RE2V0, SE3 +tool_name = "eoa_wrist_dw3_tool_sg3" # eoa_wrist_dw3_tool_sg3, tool_stretch_gripper, etc +urdf_file_path = pkg_path + f"/{model_name}/stretch_description_{model_name}_{tool_name}.urdf" +mesh_files_directory_path = pkg_path + f"/{model_name}/meshes" + + +class URDFmodel: + def __init__(self) -> None: + """ + Load URDF model + """ + self.urdf = urdf_loader.URDF.load(urdf_file_path, lazy_load_meshes=True) + self.joints_names = [ + "joint_wrist_yaw", + "joint_wrist_pitch", + "joint_wrist_roll", + "joint_lift", + "joint_arm_l0", + "joint_arm_l1", + "joint_arm_l2", + "joint_arm_l3", + "joint_head_pan", + "joint_head_tilt", + "joint_gripper_finger_left", + ] + + def get_transform(self, cfg: dict, link_name: str) -> np.ndarray: + """ + Get transformation matrix of the link w.r.t. the base_link + """ + lk_cfg = { + "joint_wrist_yaw": cfg["wrist_yaw"], + "joint_wrist_pitch": cfg["wrist_pitch"], + "joint_wrist_roll": cfg["wrist_roll"], + "joint_lift": cfg["lift"], + "joint_arm_l0": cfg["arm"] / 4, + "joint_arm_l1": cfg["arm"] / 4, + "joint_arm_l2": cfg["arm"] / 4, + "joint_arm_l3": cfg["arm"] / 4, + "joint_head_pan": cfg["head_pan"], + "joint_head_tilt": cfg["head_tilt"], + } + if "gripper" in cfg.keys(): + lk_cfg["joint_gripper_finger_left"] = cfg["gripper"] + lk_cfg["joint_gripper_finger_right"] = cfg["gripper"] + return self.urdf.link_fk(lk_cfg, link=link_name) From 8270d8dc74681f4a14a4586088db0d16a96f6952 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sun, 15 Sep 2024 15:03:20 -0700 Subject: [PATCH 157/410] Adding urdf mesh logger --- src/stretch/visualization/rr_robot.py | 39 ++++ src/stretch/visualization/rr_urdf.py | 284 ++++++++++++++++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 src/stretch/visualization/rr_urdf.py diff --git a/src/stretch/visualization/rr_robot.py b/src/stretch/visualization/rr_robot.py index 0a47e208b..09c9be8cc 100644 --- a/src/stretch/visualization/rr_robot.py +++ b/src/stretch/visualization/rr_robot.py @@ -10,15 +10,48 @@ # license information maybe found below, if so. 
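A brief usage sketch for the URDFmodel helper added above, from inside the same module; note how the single "arm" value is split evenly across the four telescoping prismatic joints. The link name and joint values below are illustrative assumptions:

    model = URDFmodel()
    cfg = {
        "wrist_yaw": 0.0,
        "wrist_pitch": -0.3,
        "wrist_roll": 0.0,
        "lift": 0.6,
        "arm": 0.2,  # forwarded as 0.05 to each of joint_arm_l0 .. joint_arm_l3
        "head_pan": 0.0,
        "head_tilt": 0.0,
    }
    # 4x4 homogeneous transform of the requested link w.r.t. base_link.
    T = model.get_transform(cfg, link_name="link_grasp_center")
    print(T)
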
import importlib.resources as importlib_resources +import re import numpy as np import urchin as urdf_loader +from stretch.visualization import rr_urdf + pkg_path = str(importlib_resources.files("stretch_urdf")) model_name = "SE3" # RE1V0, RE2V0, SE3 tool_name = "eoa_wrist_dw3_tool_sg3" # eoa_wrist_dw3_tool_sg3, tool_stretch_gripper, etc urdf_file_path = pkg_path + f"/{model_name}/stretch_description_{model_name}_{tool_name}.urdf" mesh_files_directory_path = pkg_path + f"/{model_name}/meshes" +urdf_dir = pkg_path + f"/{model_name}/" + + +def get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) -> str: + """ + Generates Robot XML with absolute path to mesh files + Args: + robot_pose_attrib: Robot pose attributes in form {"pos": "x y z", "quat": "x y z w"} + Returns: + str: Path to the generated XML file + """ + + with open(urdf_file_path, "r") as f: + _urdf_file = f.read() + + # find all the line which has the pattrn {file="something.type"} + # and replace the file path with the absolute path + pattern = r'filename="(.+?)"' + for match in re.finditer(pattern, _urdf_file): + orig = match.group(1) + fn = match.group(1).split("/")[-1] + file_path = mesh_files_directory_path + "/" + fn + _urdf_file = _urdf_file.replace(orig, file_path) + + # Absosolute path converted streth xml + temp_abs_urdf = "stretch_temp_abs.urdf" + with open(urdf_dir + temp_abs_urdf, "w") as f: + f.write(_urdf_file) + print("Saving temp abs path stretch urdf: {}".format(urdf_dir + f"{temp_abs_urdf}")) + return urdf_dir + temp_abs_urdf class URDFmodel: @@ -61,3 +94,9 @@ def get_transform(self, cfg: dict, link_name: str) -> np.ndarray: lk_cfg["joint_gripper_finger_left"] = cfg["gripper"] lk_cfg["joint_gripper_finger_right"] = cfg["gripper"] return self.urdf.link_fk(lk_cfg, link=link_name) + + +if __name__ == "__main__": + urdf_file = get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) + urdf_logger = rr_urdf.URDFLogger(urdf_file) + urdf_logger.log() diff --git a/src/stretch/visualization/rr_urdf.py b/src/stretch/visualization/rr_urdf.py new file mode 100644 index 000000000..b9180b676 --- /dev/null +++ b/src/stretch/visualization/rr_urdf.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
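The path fix-up in get_absolute_path_stretch_urdf above boils down to rewriting every filename="..." attribute so that mesh references become absolute before the URDF is loaded. The core substitution as a standalone sketch (the directory is a placeholder):

    import re

    def absolutize_mesh_paths(urdf_text: str, mesh_dir: str) -> str:
        """Point every filename="..." attribute at mesh_dir, keeping only the basename."""
        pattern = r'filename="(.+?)"'
        for match in re.finditer(pattern, urdf_text):
            orig = match.group(1)
            absolute = mesh_dir + "/" + orig.split("/")[-1]
            urdf_text = urdf_text.replace(orig, absolute)
        return urdf_text

    example = '<mesh filename="./meshes/link_arm_l0.STL"/>'
    # The filename attribute now points into /abs/path/to/meshes.
    print(absolutize_mesh_paths(example, "/abs/path/to/meshes"))
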
+ +""" +Modified version of the URDF logger +Adapted from https://github.com/rerun-io/python-example-droid-dataset/blob/master/src/rerun_loader_urdf.py +""" +from __future__ import annotations + +import argparse +import os +import pathlib +from typing import Optional + +import numpy as np +import rerun as rr # pip install rerun-sdk +import scipy.spatial.transform as st +import trimesh +from PIL import Image +from urdf_parser_py import urdf as urdf_parser + + +class URDFLogger: + """Class to log a URDF to Rerun.""" + + def __init__(self, filepath: str, root_path: str = "") -> None: + self.urdf = urdf_parser.URDF.from_xml_file(filepath) + self.mat_name_to_mat = {mat.name: mat for mat in self.urdf.materials} + self.entity_to_transform = {} + self.root_path = root_path + + def link_entity_path(self, link: urdf_parser.Link) -> str: + """Return the entity path for the URDF link.""" + root_name = self.urdf.get_root() + link_names = self.urdf.get_chain(root_name, link.name)[0::2] # skip the joints + return "/".join(link_names) + + def joint_entity_path(self, joint: urdf_parser.Joint) -> str: + """Return the entity path for the URDF joint.""" + root_name = self.urdf.get_root() + link_names = self.urdf.get_chain(root_name, joint.child)[0::2] # skip the joints + return "/".join(link_names) + + def log(self) -> None: + """Log a URDF file to Rerun.""" + rr.log( + self.root_path + "", rr.ViewCoordinates.RIGHT_HAND_Z_UP, timeless=True + ) # default ROS convention + + for joint in self.urdf.joints: + entity_path = self.joint_entity_path(joint) + self.log_joint(entity_path, joint) + + for link in self.urdf.links: + entity_path = self.link_entity_path(link) + self.log_link(entity_path, link) + + def log_link(self, entity_path: str, link: urdf_parser.Link) -> None: + # create one mesh out of all visuals + for i, visual in enumerate(link.visuals): + self.log_visual(entity_path + f"/visual_{i}", visual) + + def log_joint(self, entity_path: str, joint: urdf_parser.Joint) -> None: + translation = rotation = None + + if joint.origin is not None and joint.origin.xyz is not None: + translation = joint.origin.xyz + + if joint.origin is not None and joint.origin.rpy is not None: + rotation = st.Rotation.from_euler("xyz", joint.origin.rpy).as_matrix() + + self.entity_to_transform[self.root_path + entity_path] = (translation, rotation) + rr.log( + self.root_path + entity_path, rr.Transform3D(translation=translation, mat3x3=rotation) + ) + + def log_visual(self, entity_path: str, visual: urdf_parser.Visual) -> None: + material = None + if visual.material is not None: + if visual.material.color is None and visual.material.texture is None: + # use globally defined material + material = self.mat_name_to_mat[visual.material.name] + else: + material = visual.material + + transform = np.eye(4) + if visual.origin is not None and visual.origin.xyz is not None: + transform[:3, 3] = visual.origin.xyz + if visual.origin is not None and visual.origin.rpy is not None: + transform[:3, :3] = st.Rotation.from_euler("xyz", visual.origin.rpy).as_matrix() + + if isinstance(visual.geometry, urdf_parser.Mesh): + resolved_path = resolve_ros_path(visual.geometry.filename) + mesh_scale = visual.geometry.scale + mesh_or_scene = trimesh.load_mesh(resolved_path) + if mesh_scale is not None: + transform[:3, :3] *= mesh_scale + elif isinstance(visual.geometry, urdf_parser.Box): + mesh_or_scene = trimesh.creation.box(extents=visual.geometry.size) + elif isinstance(visual.geometry, urdf_parser.Cylinder): + mesh_or_scene = trimesh.creation.cylinder( + 
radius=visual.geometry.radius, + height=visual.geometry.length, + ) + elif isinstance(visual.geometry, urdf_parser.Sphere): + mesh_or_scene = trimesh.creation.icosphere( + radius=visual.geometry.radius, + ) + else: + rr.log( + self.root_path + "", + rr.TextLog("Unsupported geometry type: " + str(type(visual.geometry))), + ) + mesh_or_scene = trimesh.Trimesh() + + mesh_or_scene.apply_transform(transform) + + if isinstance(mesh_or_scene, trimesh.Scene): + scene = mesh_or_scene + # use dump to apply scene graph transforms and get a list of transformed meshes + for i, mesh in enumerate(scene.dump()): + if material is not None: + if material.color is not None: + mesh.visual = trimesh.visual.ColorVisuals() + mesh.visual.vertex_colors = material.color.rgba + elif material.texture is not None: + texture_path = resolve_ros_path(material.texture.filename) + mesh.visual = trimesh.visual.texture.TextureVisuals( + image=Image.open(texture_path) + ) + log_trimesh(self.root_path + entity_path + f"/{i}", mesh) + else: + mesh = mesh_or_scene + if material is not None: + if material.color is not None: + mesh.visual = trimesh.visual.ColorVisuals() + mesh.visual.vertex_colors = material.color.rgba + elif material.texture is not None: + texture_path = resolve_ros_path(material.texture.filename) + mesh.visual = trimesh.visual.texture.TextureVisuals( + image=Image.open(texture_path) + ) + log_trimesh(self.root_path + entity_path, mesh) + + +def log_trimesh(entity_path: str, mesh: trimesh.Trimesh) -> None: + vertex_colors = albedo_texture = vertex_texcoords = None + if isinstance(mesh.visual, trimesh.visual.color.ColorVisuals): + vertex_colors = mesh.visual.vertex_colors + elif isinstance(mesh.visual, trimesh.visual.texture.TextureVisuals): + albedo_texture = mesh.visual.material.baseColorTexture + if len(np.asarray(albedo_texture).shape) == 2: + # If the texture is grayscale, we need to convert it to RGB. + albedo_texture = np.stack([albedo_texture] * 3, axis=-1) + vertex_texcoords = mesh.visual.uv + # Trimesh uses the OpenGL convention for UV coordinates, so we need to flip the V coordinate + # since Rerun uses the Vulkan/Metal/DX12/WebGPU convention. + if vertex_texcoords is None: + pass + else: + vertex_texcoords[:, 1] = 1.0 - vertex_texcoords[:, 1] + # else: + # # Neither simple color nor texture, so we'll try to retrieve vertex colors via trimesh. + # try: + # colors = mesh.visual.to_color().vertex_colors + # if len(colors) == 4: + # # If trimesh gives us a single vertex color for the entire mesh, we can interpret that + # # as an albedo factor for the whole primitive. + # mesh_material = Material(albedo_factor=np.array(colors)) + # else: + # vertex_colors = colors + # except Exception: + # pass + + rr.log( + entity_path, + rr.Mesh3D( + vertex_positions=mesh.vertices, + triangle_indices=mesh.faces, + vertex_normals=mesh.vertex_normals, + vertex_colors=vertex_colors, + albedo_texture=albedo_texture, + vertex_texcoords=vertex_texcoords, + ), + timeless=True, + ) + + +def resolve_ros_path(path: str) -> str: + """Resolve a ROS path to an absolute path.""" + if path.startswith("package://"): + path = pathlib.Path(path) + package_name = path.parts[1] + relative_path = pathlib.Path(*path.parts[2:]) + + package_path = resolve_ros1_package(package_name) or resolve_ros2_package(package_name) + + if package_path is None: + raise ValueError( + f"Could not resolve {path}." + f"Replace with relative / absolute path, source the correct ROS environment, or install {package_name}." 
+ ) + + return str(package_path / relative_path) + elif str(path).startswith("file://"): + return path[len("file://") :] + else: + return path + + +def resolve_ros2_package(package_name: str) -> Optional[str]: + try: + import ament_index_python + + try: + return ament_index_python.get_package_share_directory(package_name) + except ament_index_python.packages.PackageNotFoundError: + return None + except ImportError: + return None + + +def resolve_ros1_package(package_name: str) -> str: + try: + import rospkg + + try: + return rospkg.RosPack().get_path(package_name) + except rospkg.ResourceNotFound: + return None + except ImportError: + return None + + +def main() -> None: + + # The Rerun Viewer will always pass these two pieces of information: + # 1. The path to be loaded, as a positional arg. + # 2. A shared recording ID, via the `--recording-id` flag. + # + # It is up to you whether you make use of that shared recording ID or not. + # If you use it, the data will end up in the same recording as all other plugins interested in + # that file, otherwise you can just create a dedicated recording for it. Or both. + parser = argparse.ArgumentParser( + description=""" +This is an example executable data-loader plugin for the Rerun Viewer. +Any executable on your `$PATH` with a name that starts with `rerun-loader-` will be +treated as an external data-loader. + +This example will load URDF files, logs them to Rerun, +and returns a special exit code to indicate that it doesn't support anything else. + +To try it out, copy it in your $PATH as `rerun-loader-python-example-urdf`, +then open a URDF file with Rerun (`rerun example.urdf`). +""" + ) + parser.add_argument("filepath", type=str) + parser.add_argument("--recording-id", type=str) + args = parser.parse_args() + + is_file = os.path.isfile(args.filepath) + is_urdf_file = ".urdf" in args.filepath + + # Inform the Rerun Viewer that we do not support that kind of file. + if not is_file or not is_urdf_file: + exit(rr.EXTERNAL_DATA_LOADER_INCOMPATIBLE_EXIT_CODE) + + rr.init("rerun_example_external_data_loader_urdf", recording_id=args.recording_id) + # The most important part of this: log to standard output so the Rerun Viewer can ingest it! 
+ rr.stdout() + + urdf_logger = URDFLogger(args.filepath) + urdf_logger.log() + + +if __name__ == "__main__": + main() From 2a0338bde1d569ca15390d090bf05b5172be9da6 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sun, 15 Sep 2024 18:12:04 -0700 Subject: [PATCH 158/410] Robot collision mesh viz init --- src/stretch/simulation/mujoco_server.py | 10 +- src/stretch/visualization/rerun.py | 7 + src/stretch/visualization/rr_robot.py | 86 ++++--- src/stretch/visualization/rr_urdf.py | 284 ------------------------ 4 files changed, 72 insertions(+), 315 deletions(-) delete mode 100644 src/stretch/visualization/rr_urdf.py diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index 8acfb37b4..696e16f37 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -364,8 +364,10 @@ def get_control_mode(self) -> str: return self.control_mode @override - def start(self, robocasa: bool = False): - self.robot_sim.start() # This will start the simulation and open Mujoco-Viewer window + def start(self, show_viewer_ui: bool = False, robocasa: bool = False): + self.robot_sim.start( + show_viewer_ui + ) # This will start the simulation and open Mujoco-Viewer window super().start() # Create a thread for the control loop @@ -574,6 +576,7 @@ def is_running(self) -> bool: @click.option("--robocasa-task", default="PnPCounterToCab", help="Robocasa task to generate") @click.option("--robocasa-style", type=int, default=1, help="Robocasa style to generate") @click.option("--robocasa-layout", type=int, default=1, help="Robocasa layout to generate") +@click.option("--show-viewer-ui", default=False, help="Show the Mujoco viewer UI", is_flag=True) @click.option( "--robocasa-write-to-xml", default=False, @@ -596,6 +599,7 @@ def main( robocasa_style: int, robocasa_layout: int, robocasa_write_to_xml: bool, + show_viewer_ui: bool, ): scene_model = None @@ -621,7 +625,7 @@ def main( scene_model=scene_model, ) try: - server.start(robocasa=use_robocasa) + server.start(show_viewer_ui=show_viewer_ui, robocasa=use_robocasa) except KeyboardInterrupt: server.robot_sim.stop() diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 24bd9cf85..31af175cd 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -22,6 +22,7 @@ from stretch.mapping.voxel.voxel_map import SparseVoxelMapNavigationSpace from stretch.motion import HelloStretchIdx from stretch.perception.wrapper import OvmmPerception +from stretch.visualization import rr_robot def decompose_homogeneous_matrix(homogeneous_matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: @@ -102,6 +103,9 @@ def __init__(self): rr.init("Stretch_robot", spawn=False) rr.serve(open_browser=True, server_memory_limit="2GB") + self.urdf_logger = rr_robot.StretchURDFLogger() + + self.urdf_logger.log() # Create environment Box place holder rr.log( "world/map_box", @@ -356,5 +360,8 @@ def step(self, obs, servo): self.log_ee_frame(obs) self.log_ee_camera(servo) self.log_robot_state(obs) + st = time.perf_counter() + self.urdf_logger.log() + print("Total time to log urdf: ", time.perf_counter() - st) except Exception as e: print(e) diff --git a/src/stretch/visualization/rr_robot.py b/src/stretch/visualization/rr_robot.py index 09c9be8cc..b07e50a78 100644 --- a/src/stretch/visualization/rr_robot.py +++ b/src/stretch/visualization/rr_robot.py @@ -11,12 +11,12 @@ import importlib.resources as importlib_resources import re +import time import numpy as np 
+import rerun as rr import urchin as urdf_loader -from stretch.visualization import rr_urdf - pkg_path = str(importlib_resources.files("stretch_urdf")) model_name = "SE3" # RE1V0, RE2V0, SE3 tool_name = "eoa_wrist_dw3_tool_sg3" # eoa_wrist_dw3_tool_sg3, tool_stretch_gripper, etc @@ -27,11 +27,7 @@ def get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) -> str: """ - Generates Robot XML with absolute path to mesh files - Args: - robot_pose_attrib: Robot pose attributes in form {"pos": "x y z", "quat": "x y z w"} - Returns: - str: Path to the generated XML file + Generates Robot URDF with absolute path to mesh files """ with open(urdf_file_path, "r") as f: @@ -54,25 +50,26 @@ def get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) -> return urdf_dir + temp_abs_urdf -class URDFmodel: - def __init__(self) -> None: - """ - Load URDF model - """ - self.urdf = urdf_loader.URDF.load(urdf_file_path, lazy_load_meshes=True) - self.joints_names = [ - "joint_wrist_yaw", - "joint_wrist_pitch", - "joint_wrist_roll", - "joint_lift", - "joint_arm_l0", - "joint_arm_l1", - "joint_arm_l2", - "joint_arm_l3", - "joint_head_pan", - "joint_head_tilt", - "joint_gripper_finger_left", - ] +class URDFVisualizer: + """The `show` method in this class is modified from the + original implementation of `urdf_loader.URDF.show`. + """ + + def __init__(self, urdf_file_path): + self.urdf = self.urdf = urdf_loader.URDF.load(urdf_file_path) + + def get_tri_meshes(self, cfg=None, use_collision=True) -> list: + if use_collision: + fk = self.urdf.collision_trimesh_fk(cfg=cfg) + else: + fk = self.urdf.visual_trimesh_fk(cfg=cfg) + t_meshes = {"mesh": [], "pose": []} + for tm in fk: + pose = fk[tm] + t_mesh = tm.copy() + t_meshes["mesh"].append(t_mesh) + t_meshes["pose"].append(pose) + return t_meshes def get_transform(self, cfg: dict, link_name: str) -> np.ndarray: """ @@ -96,7 +93,40 @@ def get_transform(self, cfg: dict, link_name: str) -> np.ndarray: return self.urdf.link_fk(lk_cfg, link=link_name) +class StretchURDFLogger(URDFVisualizer): + def __init__(self) -> None: + urdf_file = get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) + super().__init__(urdf_file) + + def log(self): + st = time.perf_counter() + tm = self.get_tri_meshes() + mesh_list = tm["mesh"] + pose_list = np.array(tm["pose"]) + for m, p in zip(mesh_list, pose_list): + m.apply_transform(p) + combined_mesh = np.sum(mesh_list) + combined_mesh.remove_duplicate_faces() + mesh = combined_mesh + print(f"Time to get mesh: {time.perf_counter() - st}") + st2 = time.perf_counter() + # breakpoint() + rr.log( + "world/robot/base_link", + rr.Mesh3D( + vertex_positions=mesh.vertices, + triangle_indices=mesh.faces, + vertex_normals=mesh.vertex_normals, + # vertex_colors=vertex_colors, + # albedo_texture=albedo_texture, + # vertex_texcoords=vertex_texcoords, + ), + timeless=True, + ) + t2 = time.perf_counter() - st2 + print(f"Time to rr log mesh: {t2}") + + if __name__ == "__main__": - urdf_file = get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) - urdf_logger = rr_urdf.URDFLogger(urdf_file) + urdf_logger = StretchURDFLogger() urdf_logger.log() diff --git a/src/stretch/visualization/rr_urdf.py b/src/stretch/visualization/rr_urdf.py deleted file mode 100644 index b9180b676..000000000 --- a/src/stretch/visualization/rr_urdf.py +++ /dev/null @@ -1,284 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Hello Robot, Inc. -# All rights reserved. 
-# -# This source code is licensed under the license found in the LICENSE file in the root directory -# of this source tree. -# -# Some code may be adapted from other open-source works with their respective licenses. Original -# license information maybe found below, if so. - -""" -Modified version of the URDF logger -Adapted from https://github.com/rerun-io/python-example-droid-dataset/blob/master/src/rerun_loader_urdf.py -""" -from __future__ import annotations - -import argparse -import os -import pathlib -from typing import Optional - -import numpy as np -import rerun as rr # pip install rerun-sdk -import scipy.spatial.transform as st -import trimesh -from PIL import Image -from urdf_parser_py import urdf as urdf_parser - - -class URDFLogger: - """Class to log a URDF to Rerun.""" - - def __init__(self, filepath: str, root_path: str = "") -> None: - self.urdf = urdf_parser.URDF.from_xml_file(filepath) - self.mat_name_to_mat = {mat.name: mat for mat in self.urdf.materials} - self.entity_to_transform = {} - self.root_path = root_path - - def link_entity_path(self, link: urdf_parser.Link) -> str: - """Return the entity path for the URDF link.""" - root_name = self.urdf.get_root() - link_names = self.urdf.get_chain(root_name, link.name)[0::2] # skip the joints - return "/".join(link_names) - - def joint_entity_path(self, joint: urdf_parser.Joint) -> str: - """Return the entity path for the URDF joint.""" - root_name = self.urdf.get_root() - link_names = self.urdf.get_chain(root_name, joint.child)[0::2] # skip the joints - return "/".join(link_names) - - def log(self) -> None: - """Log a URDF file to Rerun.""" - rr.log( - self.root_path + "", rr.ViewCoordinates.RIGHT_HAND_Z_UP, timeless=True - ) # default ROS convention - - for joint in self.urdf.joints: - entity_path = self.joint_entity_path(joint) - self.log_joint(entity_path, joint) - - for link in self.urdf.links: - entity_path = self.link_entity_path(link) - self.log_link(entity_path, link) - - def log_link(self, entity_path: str, link: urdf_parser.Link) -> None: - # create one mesh out of all visuals - for i, visual in enumerate(link.visuals): - self.log_visual(entity_path + f"/visual_{i}", visual) - - def log_joint(self, entity_path: str, joint: urdf_parser.Joint) -> None: - translation = rotation = None - - if joint.origin is not None and joint.origin.xyz is not None: - translation = joint.origin.xyz - - if joint.origin is not None and joint.origin.rpy is not None: - rotation = st.Rotation.from_euler("xyz", joint.origin.rpy).as_matrix() - - self.entity_to_transform[self.root_path + entity_path] = (translation, rotation) - rr.log( - self.root_path + entity_path, rr.Transform3D(translation=translation, mat3x3=rotation) - ) - - def log_visual(self, entity_path: str, visual: urdf_parser.Visual) -> None: - material = None - if visual.material is not None: - if visual.material.color is None and visual.material.texture is None: - # use globally defined material - material = self.mat_name_to_mat[visual.material.name] - else: - material = visual.material - - transform = np.eye(4) - if visual.origin is not None and visual.origin.xyz is not None: - transform[:3, 3] = visual.origin.xyz - if visual.origin is not None and visual.origin.rpy is not None: - transform[:3, :3] = st.Rotation.from_euler("xyz", visual.origin.rpy).as_matrix() - - if isinstance(visual.geometry, urdf_parser.Mesh): - resolved_path = resolve_ros_path(visual.geometry.filename) - mesh_scale = visual.geometry.scale - mesh_or_scene = trimesh.load_mesh(resolved_path) - if 
mesh_scale is not None: - transform[:3, :3] *= mesh_scale - elif isinstance(visual.geometry, urdf_parser.Box): - mesh_or_scene = trimesh.creation.box(extents=visual.geometry.size) - elif isinstance(visual.geometry, urdf_parser.Cylinder): - mesh_or_scene = trimesh.creation.cylinder( - radius=visual.geometry.radius, - height=visual.geometry.length, - ) - elif isinstance(visual.geometry, urdf_parser.Sphere): - mesh_or_scene = trimesh.creation.icosphere( - radius=visual.geometry.radius, - ) - else: - rr.log( - self.root_path + "", - rr.TextLog("Unsupported geometry type: " + str(type(visual.geometry))), - ) - mesh_or_scene = trimesh.Trimesh() - - mesh_or_scene.apply_transform(transform) - - if isinstance(mesh_or_scene, trimesh.Scene): - scene = mesh_or_scene - # use dump to apply scene graph transforms and get a list of transformed meshes - for i, mesh in enumerate(scene.dump()): - if material is not None: - if material.color is not None: - mesh.visual = trimesh.visual.ColorVisuals() - mesh.visual.vertex_colors = material.color.rgba - elif material.texture is not None: - texture_path = resolve_ros_path(material.texture.filename) - mesh.visual = trimesh.visual.texture.TextureVisuals( - image=Image.open(texture_path) - ) - log_trimesh(self.root_path + entity_path + f"/{i}", mesh) - else: - mesh = mesh_or_scene - if material is not None: - if material.color is not None: - mesh.visual = trimesh.visual.ColorVisuals() - mesh.visual.vertex_colors = material.color.rgba - elif material.texture is not None: - texture_path = resolve_ros_path(material.texture.filename) - mesh.visual = trimesh.visual.texture.TextureVisuals( - image=Image.open(texture_path) - ) - log_trimesh(self.root_path + entity_path, mesh) - - -def log_trimesh(entity_path: str, mesh: trimesh.Trimesh) -> None: - vertex_colors = albedo_texture = vertex_texcoords = None - if isinstance(mesh.visual, trimesh.visual.color.ColorVisuals): - vertex_colors = mesh.visual.vertex_colors - elif isinstance(mesh.visual, trimesh.visual.texture.TextureVisuals): - albedo_texture = mesh.visual.material.baseColorTexture - if len(np.asarray(albedo_texture).shape) == 2: - # If the texture is grayscale, we need to convert it to RGB. - albedo_texture = np.stack([albedo_texture] * 3, axis=-1) - vertex_texcoords = mesh.visual.uv - # Trimesh uses the OpenGL convention for UV coordinates, so we need to flip the V coordinate - # since Rerun uses the Vulkan/Metal/DX12/WebGPU convention. - if vertex_texcoords is None: - pass - else: - vertex_texcoords[:, 1] = 1.0 - vertex_texcoords[:, 1] - # else: - # # Neither simple color nor texture, so we'll try to retrieve vertex colors via trimesh. - # try: - # colors = mesh.visual.to_color().vertex_colors - # if len(colors) == 4: - # # If trimesh gives us a single vertex color for the entire mesh, we can interpret that - # # as an albedo factor for the whole primitive. 
- # mesh_material = Material(albedo_factor=np.array(colors)) - # else: - # vertex_colors = colors - # except Exception: - # pass - - rr.log( - entity_path, - rr.Mesh3D( - vertex_positions=mesh.vertices, - triangle_indices=mesh.faces, - vertex_normals=mesh.vertex_normals, - vertex_colors=vertex_colors, - albedo_texture=albedo_texture, - vertex_texcoords=vertex_texcoords, - ), - timeless=True, - ) - - -def resolve_ros_path(path: str) -> str: - """Resolve a ROS path to an absolute path.""" - if path.startswith("package://"): - path = pathlib.Path(path) - package_name = path.parts[1] - relative_path = pathlib.Path(*path.parts[2:]) - - package_path = resolve_ros1_package(package_name) or resolve_ros2_package(package_name) - - if package_path is None: - raise ValueError( - f"Could not resolve {path}." - f"Replace with relative / absolute path, source the correct ROS environment, or install {package_name}." - ) - - return str(package_path / relative_path) - elif str(path).startswith("file://"): - return path[len("file://") :] - else: - return path - - -def resolve_ros2_package(package_name: str) -> Optional[str]: - try: - import ament_index_python - - try: - return ament_index_python.get_package_share_directory(package_name) - except ament_index_python.packages.PackageNotFoundError: - return None - except ImportError: - return None - - -def resolve_ros1_package(package_name: str) -> str: - try: - import rospkg - - try: - return rospkg.RosPack().get_path(package_name) - except rospkg.ResourceNotFound: - return None - except ImportError: - return None - - -def main() -> None: - - # The Rerun Viewer will always pass these two pieces of information: - # 1. The path to be loaded, as a positional arg. - # 2. A shared recording ID, via the `--recording-id` flag. - # - # It is up to you whether you make use of that shared recording ID or not. - # If you use it, the data will end up in the same recording as all other plugins interested in - # that file, otherwise you can just create a dedicated recording for it. Or both. - parser = argparse.ArgumentParser( - description=""" -This is an example executable data-loader plugin for the Rerun Viewer. -Any executable on your `$PATH` with a name that starts with `rerun-loader-` will be -treated as an external data-loader. - -This example will load URDF files, logs them to Rerun, -and returns a special exit code to indicate that it doesn't support anything else. - -To try it out, copy it in your $PATH as `rerun-loader-python-example-urdf`, -then open a URDF file with Rerun (`rerun example.urdf`). -""" - ) - parser.add_argument("filepath", type=str) - parser.add_argument("--recording-id", type=str) - args = parser.parse_args() - - is_file = os.path.isfile(args.filepath) - is_urdf_file = ".urdf" in args.filepath - - # Inform the Rerun Viewer that we do not support that kind of file. - if not is_file or not is_urdf_file: - exit(rr.EXTERNAL_DATA_LOADER_INCOMPATIBLE_EXIT_CODE) - - rr.init("rerun_example_external_data_loader_urdf", recording_id=args.recording_id) - # The most important part of this: log to standard output so the Rerun Viewer can ingest it! 
- rr.stdout() - - urdf_logger = URDFLogger(args.filepath) - urdf_logger.log() - - -if __name__ == "__main__": - main() From cbbe27aaa12bca7d044ca8595d8b0cc2adabf70e Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sun, 15 Sep 2024 18:13:41 -0700 Subject: [PATCH 159/410] Robot collision mesh viz init --- src/stretch/visualization/rerun.py | 4 ++-- src/stretch/visualization/{rr_robot.py => urdf_visualizer.py} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename src/stretch/visualization/{rr_robot.py => urdf_visualizer.py} (100%) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 31af175cd..61b31545b 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -22,7 +22,7 @@ from stretch.mapping.voxel.voxel_map import SparseVoxelMapNavigationSpace from stretch.motion import HelloStretchIdx from stretch.perception.wrapper import OvmmPerception -from stretch.visualization import rr_robot +from stretch.visualization import urdf_visualizer def decompose_homogeneous_matrix(homogeneous_matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: @@ -103,7 +103,7 @@ def __init__(self): rr.init("Stretch_robot", spawn=False) rr.serve(open_browser=True, server_memory_limit="2GB") - self.urdf_logger = rr_robot.StretchURDFLogger() + self.urdf_logger = urdf_visualizer.StretchURDFLogger() self.urdf_logger.log() # Create environment Box place holder diff --git a/src/stretch/visualization/rr_robot.py b/src/stretch/visualization/urdf_visualizer.py similarity index 100% rename from src/stretch/visualization/rr_robot.py rename to src/stretch/visualization/urdf_visualizer.py From 63bc12df1055b4ab8c5c4e4dc9a7c5c231a7d30e Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sun, 15 Sep 2024 18:54:26 -0700 Subject: [PATCH 160/410] Reorganize urdf viz code --- src/stretch/visualization/rerun.py | 25 +++++++- src/stretch/visualization/urdf_visualizer.py | 63 ++++++-------------- 2 files changed, 43 insertions(+), 45 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 61b31545b..0f34717f1 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -98,12 +98,35 @@ def occupancy_map_to_3d_points( return points +class StretchURDFLogger(urdf_visualizer.URDFVisualizer): + def log(self, cfg=None): + st = time.perf_counter() + mesh = self.get_combined_robot_mesh(cfg=None, use_collision=True) + print(f"Time to get mesh: {time.perf_counter() - st}") + st2 = time.perf_counter() + # breakpoint() + rr.log( + "world/robot/base_link", + rr.Mesh3D( + vertex_positions=mesh.vertices, + triangle_indices=mesh.faces, + vertex_normals=mesh.vertex_normals, + # vertex_colors=vertex_colors, + # albedo_texture=albedo_texture, + # vertex_texcoords=vertex_texcoords, + ), + timeless=True, + ) + t2 = time.perf_counter() - st2 + print(f"Time to rr log mesh: {t2}") + + class RerunVsualizer: def __init__(self): rr.init("Stretch_robot", spawn=False) rr.serve(open_browser=True, server_memory_limit="2GB") - self.urdf_logger = urdf_visualizer.StretchURDFLogger() + self.urdf_logger = StretchURDFLogger() self.urdf_logger.log() # Create environment Box place holder diff --git a/src/stretch/visualization/urdf_visualizer.py b/src/stretch/visualization/urdf_visualizer.py index b07e50a78..fbbbd5961 100644 --- a/src/stretch/visualization/urdf_visualizer.py +++ b/src/stretch/visualization/urdf_visualizer.py @@ -11,10 +11,8 @@ import importlib.resources as importlib_resources import re -import time import 
numpy as np -import rerun as rr import urchin as urdf_loader pkg_path = str(importlib_resources.files("stretch_urdf")) @@ -55,22 +53,38 @@ class URDFVisualizer: original implementation of `urdf_loader.URDF.show`. """ - def __init__(self, urdf_file_path): - self.urdf = self.urdf = urdf_loader.URDF.load(urdf_file_path) + abs_urdf_file_path = get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) + + def __init__(self, urdf_file=abs_urdf_file_path): + self.urdf = self.urdf = urdf_loader.URDF.load(urdf_file) def get_tri_meshes(self, cfg=None, use_collision=True) -> list: if use_collision: fk = self.urdf.collision_trimesh_fk(cfg=cfg) else: fk = self.urdf.visual_trimesh_fk(cfg=cfg) - t_meshes = {"mesh": [], "pose": []} + t_meshes = {"mesh": [], "pose": [], "link": []} for tm in fk: pose = fk[tm] t_mesh = tm.copy() t_meshes["mesh"].append(t_mesh) t_meshes["pose"].append(pose) + # [:-4] to remove the .STL extension + t_meshes["link"].append( + tm.metadata["file_name"][:-4] if "file_name" in tm.metadata.keys() else None + ) return t_meshes + def get_combined_robot_mesh(self, cfg=None, use_collision=True): + tm = self.get_tri_meshes(cfg, use_collision) + mesh_list = tm["mesh"] + pose_list = np.array(tm["pose"]) + for m, p in zip(mesh_list, pose_list): + m.apply_transform(p) + combined_mesh = np.sum(mesh_list) + combined_mesh.remove_duplicate_faces() + return combined_mesh + def get_transform(self, cfg: dict, link_name: str) -> np.ndarray: """ Get transformation matrix of the link w.r.t. the base_link @@ -91,42 +105,3 @@ def get_transform(self, cfg: dict, link_name: str) -> np.ndarray: lk_cfg["joint_gripper_finger_left"] = cfg["gripper"] lk_cfg["joint_gripper_finger_right"] = cfg["gripper"] return self.urdf.link_fk(lk_cfg, link=link_name) - - -class StretchURDFLogger(URDFVisualizer): - def __init__(self) -> None: - urdf_file = get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) - super().__init__(urdf_file) - - def log(self): - st = time.perf_counter() - tm = self.get_tri_meshes() - mesh_list = tm["mesh"] - pose_list = np.array(tm["pose"]) - for m, p in zip(mesh_list, pose_list): - m.apply_transform(p) - combined_mesh = np.sum(mesh_list) - combined_mesh.remove_duplicate_faces() - mesh = combined_mesh - print(f"Time to get mesh: {time.perf_counter() - st}") - st2 = time.perf_counter() - # breakpoint() - rr.log( - "world/robot/base_link", - rr.Mesh3D( - vertex_positions=mesh.vertices, - triangle_indices=mesh.faces, - vertex_normals=mesh.vertex_normals, - # vertex_colors=vertex_colors, - # albedo_texture=albedo_texture, - # vertex_texcoords=vertex_texcoords, - ), - timeless=True, - ) - t2 = time.perf_counter() - st2 - print(f"Time to rr log mesh: {t2}") - - -if __name__ == "__main__": - urdf_logger = StretchURDFLogger() - urdf_logger.log() From 483723702fd88cc772d63c6e0952ed78625f277e Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sun, 15 Sep 2024 19:41:42 -0700 Subject: [PATCH 161/410] display robot mesh flag --- src/stretch/visualization/rerun.py | 55 ++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 0f34717f1..127d10d3f 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -99,10 +99,25 @@ def occupancy_map_to_3d_points( class StretchURDFLogger(urdf_visualizer.URDFVisualizer): - def log(self, cfg=None): + def log_robot_mesh(self, cfg, use_collision=True): + lk_cfg = { + "joint_wrist_yaw": 
cfg["wrist_yaw"], + "joint_wrist_pitch": cfg["wrist_pitch"], + "joint_wrist_roll": cfg["wrist_roll"], + "joint_lift": cfg["lift"], + "joint_arm_l0": cfg["arm"] / 4, + "joint_arm_l1": cfg["arm"] / 4, + "joint_arm_l2": cfg["arm"] / 4, + "joint_arm_l3": cfg["arm"] / 4, + "joint_head_pan": cfg["head_pan"], + "joint_head_tilt": cfg["head_tilt"], + } + if "gripper" in cfg.keys(): + lk_cfg["joint_gripper_finger_left"] = cfg["gripper"] + lk_cfg["joint_gripper_finger_right"] = cfg["gripper"] st = time.perf_counter() - mesh = self.get_combined_robot_mesh(cfg=None, use_collision=True) - print(f"Time to get mesh: {time.perf_counter() - st}") + mesh = self.get_combined_robot_mesh(cfg=lk_cfg, use_collision=use_collision) + print(f"Time to get mesh: {1000*(time.perf_counter() - st)}") st2 = time.perf_counter() # breakpoint() rr.log( @@ -117,18 +132,28 @@ def log(self, cfg=None): ), timeless=True, ) - t2 = time.perf_counter() - st2 + t2 = 1000 * (time.perf_counter() - st2) print(f"Time to rr log mesh: {t2}") + def log(self, obs, use_collision=True): + state = obs["joint"] + cfg = {} + for k in HelloStretchIdx.name_to_idx: + cfg[k] = state[HelloStretchIdx.name_to_idx[k]] + # breakpoint() + self.log_robot_mesh(cfg=cfg, use_collision=use_collision) + class RerunVsualizer: - def __init__(self): + def __init__(self, display_robot_mesh: bool = True, open_browser: bool = True): rr.init("Stretch_robot", spawn=False) - rr.serve(open_browser=True, server_memory_limit="2GB") + rr.serve(open_browser=open_browser, server_memory_limit="2GB") + + self.display_robot_mesh = display_robot_mesh - self.urdf_logger = StretchURDFLogger() + if self.display_robot_mesh: + self.urdf_logger = StretchURDFLogger() - self.urdf_logger.log() # Create environment Box place holder rr.log( "world/map_box", @@ -238,6 +263,11 @@ def log_robot_state(self, obs): for k in HelloStretchIdx.name_to_idx: rr.log(f"robot_state/joint_pose/{k}", rr.Scalar(state[HelloStretchIdx.name_to_idx[k]])) + def log_robot_mesh(self, obs, use_collision=True): + """ + Log robot mesh using urdf visualizer""" + self.urdf_logger.log(obs, use_collision=use_collision) + def update_voxel_map( self, space: SparseVoxelMapNavigationSpace, @@ -383,8 +413,11 @@ def step(self, obs, servo): self.log_ee_frame(obs) self.log_ee_camera(servo) self.log_robot_state(obs) - st = time.perf_counter() - self.urdf_logger.log() - print("Total time to log urdf: ", time.perf_counter() - st) + + if self.display_robot_mesh: + st = time.perf_counter() + self.log_robot_mesh(obs) + print("Total time to log urdf: ", 1000 * (time.perf_counter() - st)) + except Exception as e: print(e) From ea5bf0326aa1a109fa29e2692649712daf7bfceb Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Mon, 16 Sep 2024 00:14:29 -0700 Subject: [PATCH 162/410] Fix blueprint --- src/stretch/visualization/rerun.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 127d10d3f..854c2f7e6 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -120,6 +120,7 @@ def log_robot_mesh(self, cfg, use_collision=True): print(f"Time to get mesh: {1000*(time.perf_counter() - st)}") st2 = time.perf_counter() # breakpoint() + rr.set_time_seconds("realtime", time.time()) rr.log( "world/robot/base_link", rr.Mesh3D( @@ -174,11 +175,17 @@ def __init__(self, display_robot_mesh: bool = True, open_browser: bool = True): self.setup_blueprint() def setup_blueprint(self): + world_view = 
rrb.Spatial3DView(origin="world") my_blueprint = rrb.Blueprint( - rrb.Spatial3DView(origin="world"), - rrb.BlueprintPanel(expanded=True), - rrb.SelectionPanel(expanded=True), - rrb.TimePanel(expanded=True), + rrb.Horizontal( + rrb.Spatial3DView(name="3D View", origin="world"), + rrb.Vertical( + rrb.Spatial2DView(name="head_rgb", origin="/world/head_camera"), + rrb.Spatial2DView(name="ee_rgb", origin="/world/ee_camera"), + ), + rrb.TimePanel(expanded=True), + rrb.SelectionPanel(expanded=False), + ), collapse_panels=True, ) rr.send_blueprint(my_blueprint) From c2ddebf093a4461cb3951ba667cddf356e3f0935 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 12:02:05 -0400 Subject: [PATCH 163/410] adding chat wrapper --- src/stretch/app/ai_pickup.py | 19 ++++++- src/stretch/llms/__init__.py | 2 + src/stretch/llms/chat_wrapper.py | 93 ++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 src/stretch/llms/chat_wrapper.py diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index c13c2f29a..f8939aa0a 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -16,7 +16,7 @@ from stretch.agent.task.pickup import PickupTask from stretch.agent.zmq_client import HomeRobotZmqClient from stretch.core import get_parameters -from stretch.llms import get_llm_choices, get_llm_client +from stretch.llms import LLMChatWrapper, get_llm_choices, get_llm_client from stretch.perception import create_semantic_sensor from stretch.utils.prompt import PickupPromptBuilder @@ -79,6 +79,11 @@ is_flag=True, help="Set to use the language model", ) +@click.option( + "--use_voice", + is_flag=True, + help="Set to use voice input", +) def main( robot_ip: str = "192.168.1.15", local: bool = False, @@ -93,6 +98,7 @@ def main( match_method: str = "feature", llm: str = "gemma", use_llm: bool = False, + use_voice: bool = False, ): """Set up the robot, create a task plan, and execute it.""" # Create robot @@ -123,6 +129,7 @@ def main( llm_client = None if use_llm: llm_client = get_llm_client(llm, prompt=prompt) + chat_wrapper = LLMChatWrapper(llm_client, prompt=prompt, voice=use_voice) # Parse things and listen to the user while robot.running: @@ -134,6 +141,16 @@ def main( target_object = input("Enter the target object: ") if len(receptacle) == 0: receptacle = input("Enter the target receptacle: ") + else: + # Call the LLM client and parse + llm_response = chat_wrapper.query() + target_object = prompt.get_object(llm_response) + receptacle = prompt.get_receptacle(llm_response) + say_this = prompt.get_say_this(llm_response) + + if say_this is not None: + chat_wrapper.say(say_this) + agent.say(say_this) if len(target_object) == 0 or len(receptacle) == 0: logger.error("You need to enter a target object and receptacle") diff --git a/src/stretch/llms/__init__.py b/src/stretch/llms/__init__.py index 1313a1b64..0418d0b80 100644 --- a/src/stretch/llms/__init__.py +++ b/src/stretch/llms/__init__.py @@ -9,6 +9,7 @@ from typing import Union from .base import AbstractLLMClient, AbstractPromptBuilder +from .chat_wrapper import LLMChatWrapper from .gemma_client import Gemma2bClient from .llama_client import LlamaClient from .openai_client import OpenaiClient @@ -29,6 +30,7 @@ "PickupPromptBuilder", "AbstractLLMClient", "AbstractPromptBuilder", + "LLMChatWrapper", ] llms = { diff --git a/src/stretch/llms/chat_wrapper.py b/src/stretch/llms/chat_wrapper.py new file mode 100644 index 000000000..872c41261 --- /dev/null +++ 
b/src/stretch/llms/chat_wrapper.py @@ -0,0 +1,93 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import os +import tempfile +import timeit +from typing import Any + +from termcolor import colored + +from stretch.audio import AudioRecorder +from stretch.audio.speech_to_text import WhisperSpeechToText +from stretch.llms import AbstractLLMClient, AbstractPromptBuilder + + +class LLMChatWrapper: + """Simple class that will get user information and return prompts.""" + + def __init__( + self, + llm_client: AbstractLLMClient, + prompt: AbstractPromptBuilder, + voice: bool = False, + max_audio_duration: int = 10, + silence_limit: int = 2, + ): + self.voice = voice + self.llm_client = llm_client + self.prompt = prompt + self.max_audio_duration = max_audio_duration + self.silence_limit = silence_limit + + if self.voice: + # Load the tokenizer and model + self.audio_recorder = AudioRecorder() + self.whisper = WhisperSpeechToText() + else: + audio_recorder = None + whisper = None + + def query(self, verbose: bool = False) -> Any: + if self.voice: + input(colored("Press enter to speak or ctrl+c to exit.", "yellow")) + + # Create a temporary file + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file: + temp_filename = temp_audio_file.name + self.audio_recorder.record( + temp_filename, + duration=self.max_audio_duration, + silence_limit=self.silence_limit, + ) + + # Transcribe the audio file + input_text = self.whisper.transcribe_file(temp_filename) + + # Remove the temporary file + os.remove(temp_filename) + + print(colored("I heard:", "green"), input_text) + else: + input_text = input(colored("You: ", "green")) + + if len(input_text) == 0: + return None + + # Get the response and time it + t0 = timeit.default_timer() + assistant_response = self.llm_client(input_text) + t1 = timeit.default_timer() + + response = self.prompt.parse_response(assistant_response) + + if verbose: + # Decode and print the result + print(colored("Response:", "blue"), response) + print("-" * 80) + print("Time taken:", t1 - t0) + print("-" * 80) + + return response + + def say(self, text: str) -> None: + if self.voice: + raise NotImplementedError("Voice is not supported yet.") + else: + print(colored("Robot:", "blue"), text) From a05537e7c6b4e5b7b22f513228b5570e90f1169d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 13:37:54 -0400 Subject: [PATCH 164/410] Remove a weird duplicate of the search operation --- src/stretch/agent/base/managed_operation.py | 1 + src/stretch/agent/operations/grasp_object.py | 19 ++- .../agent/operations/search_for_object.py | 134 +----------------- src/stretch/agent/robot_agent.py | 4 + src/stretch/agent/task/pickup/pickup_task.py | 1 + src/stretch/app/ai_pickup.py | 9 +- src/stretch/llms/prompts/pickup_prompt.py | 24 ++++ 7 files changed, 57 insertions(+), 135 deletions(-) diff --git a/src/stretch/agent/base/managed_operation.py b/src/stretch/agent/base/managed_operation.py index be68bd477..bcc542d67 100644 --- a/src/stretch/agent/base/managed_operation.py +++ b/src/stretch/agent/base/managed_operation.py @@ -53,6 +53,7 @@ def name(self) -> str: def update(self, **kwargs): print(colored("================ Updating the world model ==================", "blue")) 
self.agent.update(**kwargs) + breakpoint() def attempt(self, message: str): print(colored(f"Trying {self.name}:", "blue"), message) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 0898b24be..03be0457a 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -45,6 +45,9 @@ class GraspObjectOperation(ManagedOperation): match_method: str = "class" target_object: Optional[str] = None + # Should we use the previous mask at all? + use_prev_mask: bool = False + # Debugging UI elements show_object_to_grasp: bool = False show_servo_gui: bool = True @@ -170,6 +173,8 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: else: target_class = self.target_object + print("[GRASP OBJECT] Detecting objects of class", target_class) + # Now find the mask with that class for iid in np.unique(servo.semantic): name = self.agent.semantic_sensor.get_class_name_for_id(iid) @@ -181,12 +186,23 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: raise ValueError( f"Target object must be set before running match method {self.match_method}." ) + print("[GRASP OBJECT] Detecting objects described as", self.target_object) + text_features = self.agent.encode_text(self.target_object) + best_score = float("-inf") + best_iid = None for iid in np.unique(servo.instance): rgb = servo.ee_rgb[servo.instance == iid] import matplotlib.pyplot as plt plt.imshow(rgb) plt.show() + + features = self.agent.encode_image(rgb) + score = self.agent.compare_features(text_features, features) + if score > best_score: + best_score = score + best_iid = iid + mask = servo.instance == iid else: raise ValueError(f"Invalid matching method {self.match_method}.") @@ -241,7 +257,7 @@ def get_target_mask( # Option 2 - try to find the map that most overlapped with what we were just trying to grasp # This is in case we are losing track of particular objects and getting classes mixed up - if prev_mask is not None: + if prev_mask is not None and self.use_prev_mask: # Find the mask with the most points mask = np.bitwise_and(current_instance_mask, prev_mask) mask = np.bitwise_and(mask, class_mask) @@ -258,6 +274,7 @@ def get_target_mask( target_mask = mask target_mask_pts = num_pts + input("----") if maximum_overlap_pts > self.min_points_to_approach: return maximum_overlap_mask if target_mask is not None: diff --git a/src/stretch/agent/operations/search_for_object.py b/src/stretch/agent/operations/search_for_object.py index 6d94e23d6..311a618f7 100644 --- a/src/stretch/agent/operations/search_for_object.py +++ b/src/stretch/agent/operations/search_for_object.py @@ -177,6 +177,7 @@ def was_successful(self) -> bool: class SearchForObjectOnFloorOperation(ManagedSearchOperation): """Search for an object on the floor""" + # Important parameters plan_for_manipulation: bool = True update_at_start: bool = False @@ -190,7 +191,7 @@ def run(self) -> None: # Set the object class if not set if self.object_class is None: - self.object_class = self.agent.target_object + self.set_target_object_class(self.agent.target_object) # Clear the current object self.agent.current_object = None @@ -199,9 +200,7 @@ def run(self) -> None: # Update world map # Switch to navigation posture self.robot.move_to_nav_posture() - # Do not update until you are in nav posture - self.update() if self.show_map_so_far: @@ -236,135 +235,6 @@ def run(self) -> None: # Compute scene graph from instance memory so that we can use it scene_graph = 
self.agent.get_scene_graph() - receptacle_options = [] - print(f"Check explored instances for reachable {self.object_class} instances:") - for i, instance in enumerate(instances): - name = self.agent.semantic_sensor.get_class_name_for_id(instance.category_id) - print(f" - Found instance {i} with name {name} and global id {instance.global_id}.") - - if self.agent.is_instance_unreachable(instance): - print(" - Instance is unreachable.") - continue - - if self.show_instances_detected: - self.show_instance(instance, f"Instance {i} with name {name}") - - if self.object_class in name: - relations = scene_graph.get_matching_relations(instance.global_id, "floor", "on") - if len(relations) > 0: - # We found a matching relation! - print(f" - Found a toy on the floor at {instance.get_best_view().get_pose()}.") - - # Move to object on floor - plan = self.plan_to_instance_for_manipulation( - instance, start=start, radius_m=0.5 - ) - if plan.success: - print( - f" - Confirmed toy is reachable with base pose at {plan.trajectory[-1]}." - ) - self.agent.current_object = instance - break - - # Check to see if there is a visitable frontier - if self.agent.current_object is None: - self.warn(f"No {self.object_class} found. Moving to frontier.") - # Find a point on the frontier and move there - res = self.agent.plan_to_frontier(start=start) - if res.success: - self.robot.execute_trajectory( - [node.state for node in res.trajectory], final_timeout=10.0 - ) - # Update world model once we get to frontier - self.update() - - # If we moved to the frontier, then and only then can we clean up the object plans. - self.warn("Resetting object plans.") - self.agent.reset_object_plans() - else: - self.cheer(f"Found object of {self.object_class}!") - view = self.agent.current_object.get_best_view() - image = Image.fromarray(view.get_image()) - image.save("object.png") - if self.show_map_so_far: - # This shows us what the robot has found so far - object_xyz = self.agent.current_object.point_cloud.mean(axis=0).cpu().numpy() - xyt = self.robot.get_base_pose() - self.agent.voxel_map.show( - orig=object_xyz, - xyt=xyt, - footprint=self.robot_model.get_footprint(), - planner_visuals=False, - ) - - # TODO: better behavior - # If no visitable frontier, pick a random point nearby and just wander around - - def was_successful(self) -> bool: - return self.agent.current_object is not None and not self.agent.is_instance_unreachable( - self.agent.current_object - ) - - -class SearchForObjectOnFloorOperation(ManagedSearchOperation): - """Search for an object on the floor""" - - # Important parameters - plan_for_manipulation: bool = True - - def can_start(self) -> bool: - self.attempt("If receptacle is found, we can start searching for objects.") - return self.agent.current_receptacle is not None - - def run(self) -> None: - self.intro("Find a reachable object on the floor.") - self._successful = False - - # Set the object class if not set - if self.object_class is None: - self.set_target_object_class(self.agent.target_object) - - # Clear the current object - self.agent.current_object = None - - # Update world map - # Switch to navigation posture - self.robot.move_to_nav_posture() - # Do not update until you are in nav posture - self.update() - - if self.show_map_so_far: - # This shows us what the robot has found so far - xyt = self.robot.get_base_pose() - self.agent.voxel_map.show( - orig=np.zeros(3), xyt=xyt, footprint=self.robot_model.get_footprint() - ) - - # Get the current location of the robot - start = self.robot.get_base_pose() - 
if not self.navigation_space.is_valid(start): - self.error( - "Robot is in an invalid configuration. It is probably too close to geometry, or localization has failed." - ) - breakpoint() - - if self.show_instances_detected: - # Show the last instance image - import matplotlib - - # TODO: why do we need to configure this every time - matplotlib.use("TkAgg") - import matplotlib.pyplot as plt - - plt.imshow(self.agent.voxel_map.observations[0].instance) - plt.show() - - # Check to see if we have a receptacle in the map - instances = self.agent.voxel_map.instances.get_instances() - - # Compute scene graph from instance memory so that we can use it - scene_graph = self.agent.get_scene_graph() - receptacle_options = [] print(f"Check explored instances for reachable {self.object_class} instances:") for i, instance in enumerate(instances): diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 88617ec9d..84360df71 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -204,6 +204,10 @@ def encode_image(self, image: Union[np.ndarray, torch.Tensor]) -> torch.Tensor: """Encode image using the encoder""" return self.encoder.encode_image(image) + def compare_features(self, feature1: torch.Tensor, feature2: torch.Tensor) -> float: + """Compare two feature vectors using the encoder""" + return self.encoder.compute_score(feature1, feature2) + def get_instance_from_text( self, text_query: str, diff --git a/src/stretch/agent/task/pickup/pickup_task.py b/src/stretch/agent/task/pickup/pickup_task.py index 657b399b0..7b2049541 100644 --- a/src/stretch/agent/task/pickup/pickup_task.py +++ b/src/stretch/agent/task/pickup/pickup_task.py @@ -171,6 +171,7 @@ def get_one_shot_task(self, add_rotate: bool = False, matching: str = "feature") ) grasp_object.set_target_object_class(self.agent.target_object) grasp_object.servo_to_grasp = self.use_visual_servoing_for_grasp + grasp_object.match_method = matching place_object_on_receptacle = PlaceObjectOperation( "place_object_on_receptacle", self.agent, on_cannot_start=go_to_receptacle ) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index f8939aa0a..383768ac6 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -16,9 +16,8 @@ from stretch.agent.task.pickup import PickupTask from stretch.agent.zmq_client import HomeRobotZmqClient from stretch.core import get_parameters -from stretch.llms import LLMChatWrapper, get_llm_choices, get_llm_client +from stretch.llms import LLMChatWrapper, PickupPromptBuilder, get_llm_choices, get_llm_client from stretch.perception import create_semantic_sensor -from stretch.utils.prompt import PickupPromptBuilder @click.command() @@ -32,6 +31,7 @@ @click.option("--parameter_file", default="default_planner.yaml", help="Path to parameter file") @click.option( "--match_method", + "--match-method", type=click.Choice(["class", "feature"]), default="feature", help="Method to match objects to pick up. 
Options: class, feature.", @@ -61,11 +61,13 @@ ) @click.option( "--show_intermediate_maps", + "--show-intermediate-maps", is_flag=True, help="Set to visualize intermediate maps", ) @click.option( "--target_object", + "--target-object", default="", help="Name of the object to pick up", ) @@ -76,11 +78,13 @@ ) @click.option( "--use_llm", + "--use-llm", is_flag=True, help="Set to use the language model", ) @click.option( "--use_voice", + "--use-voice", is_flag=True, help="Set to use voice input", ) @@ -135,6 +139,7 @@ def main( while robot.running: agent.reset() + say_this = None if llm_client is None: # Call the LLM client and parse if len(target_object) == 0: diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index f0f5ede12..a24aa4fd0 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -7,6 +7,8 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. +from typing import List, Tuple + from stretch.llms.base import AbstractPromptBuilder simple_stretch_prompt = """You are a friendly, helpful robot named Stretch. You are always helpful, and answer questions concisely. You will answer questions very concisely. @@ -104,3 +106,25 @@ def parse_response(self, response: str): parsed_commands.append(("place", command[6:-1])) return parsed_commands + + def get_object(self, response: List[Tuple[str, str]]) -> str: + """Return the object from the response.""" + for command, args in response: + if command == "pickup": + return args + return "" + + def get_receptacle(self, response: List[Tuple[str, str]]) -> str: + """Return the receptacle from the response.""" + for command, args in response: + if command == "place": + return args + return "" + + def get_say_this(self, response: List[Tuple[str, str]]) -> str: + """Return the text to say from the response.""" + all_messages = [] + for command, args in response: + if command == "say": + all_messages.append(args) + return " ".join(all_messages) From 2264b8062c6a486e9244db34e99e6879c562c72d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 13:39:46 -0400 Subject: [PATCH 165/410] remove a breakpoint and add another to debug why rgb images arent looking good yet --- src/stretch/agent/base/managed_operation.py | 1 - src/stretch/agent/operations/grasp_object.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/base/managed_operation.py b/src/stretch/agent/base/managed_operation.py index bcc542d67..be68bd477 100644 --- a/src/stretch/agent/base/managed_operation.py +++ b/src/stretch/agent/base/managed_operation.py @@ -53,7 +53,6 @@ def name(self) -> str: def update(self, **kwargs): print(colored("================ Updating the world model ==================", "blue")) self.agent.update(**kwargs) - breakpoint() def attempt(self, message: str): print(colored(f"Trying {self.name}:", "blue"), message) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 03be0457a..ecea4b81a 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -191,6 +191,7 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: best_score = float("-inf") best_iid = None for iid in np.unique(servo.instance): + breakpoint() rgb = servo.ee_rgb[servo.instance == iid] import matplotlib.pyplot as plt @@ -199,6 +200,7 @@ def 
get_class_mask(self, servo: Observations) -> np.ndarray: features = self.agent.encode_image(rgb) score = self.agent.compare_features(text_features, features) + print(f"Score for {iid} is {score}") if score > best_score: best_score = score best_iid = iid From e1c2c010e844b0e29c44a411759bb386540f5abd Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 13:52:00 -0400 Subject: [PATCH 166/410] making sure information is synced --- src/stretch/agent/operations/grasp_object.py | 181 ++++++++++--------- 1 file changed, 93 insertions(+), 88 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index ecea4b81a..faba34f3a 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -191,6 +191,11 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: best_score = float("-inf") best_iid = None for iid in np.unique(servo.instance): + + # Ignore the background + if iid < 0: + continue + breakpoint() rgb = servo.ee_rgb[servo.instance == iid] import matplotlib.pyplot as plt @@ -343,6 +348,14 @@ def visual_servo_to_object( q_last = np.array([0.0 for _ in range(11)]) # 11 DOF, HelloStretchIdx random_motion_counter = 0 + if not pregrasp_done: + # Move to pregrasp position + self.pregrasp_open_loop( + instance.get_center(), distance_from_object=self.pregrasp_distance_from_object + ) + pregrasp_done = True + input("PREGRASP DONE") + # Main loop - run unless we time out, blocking. while timeit.default_timer() - t0 < max_duration: @@ -451,97 +464,89 @@ def visual_servo_to_object( if res == ord("q"): break - if not pregrasp_done and current_xyz is not None: - self.pregrasp_open_loop( - current_xyz, distance_from_object=self.pregrasp_distance_from_object - ) - pregrasp_done = True + # check not moving threshold + if not_moving_count > max_not_moving_count: + self.info("Not moving; try to grasp.") + success = self._grasp() + break + # If we have a target mask, compute the median depth of the object + # Otherwise we will just try to grasp if we are close enough - assume we lost track! + if target_mask is not None: + object_depth = servo.ee_depth[target_mask] + median_object_depth = np.median(servo.ee_depth[target_mask]) / 1000 else: - # check not moving threshold - if not_moving_count > max_not_moving_count: - self.info("Not moving; try to grasp.") + print("detected classes:", np.unique(servo.ee_semantic)) + if center_depth < self.median_distance_when_grasping: + success = self._grasp() + continue + + dx, dy = mask_center[1] - center_x, mask_center[0] - center_y + + # Is the center of the image part of the target mask or not? + center_in_mask = target_mask[int(center_y), int(center_x)] > 0 + # TODO: add deadband bubble around this? 
+ + # Since we were able to detect it, copy over the target mask + prev_target_mask = target_mask + + # Are we aligned to the object + aligned = np.abs(dx) < self.align_x_threshold and np.abs(dy) < self.align_y_threshold + + print() + print("----- STEP VISUAL SERVOING -----") + print("Observed this many target mask points:", np.sum(target_mask.flatten())) + print("failed =", failed_counter, "/", self.max_failed_attempts) + print("cur x =", base_x) + print(" lift =", lift) + print(" arm =", arm) + print("pitch =", wrist_pitch) + print(f"base_x={base_x}, wrist_pitch={wrist_pitch}, dx={dx}, dy={dy}") + print(f"Median distance to object is {median_object_depth}.") + print(f"Center distance to object is {center_depth}.") + print("Center in mask?", center_in_mask) + print("Current XYZ:", current_xyz) + print("Aligned?", aligned) + + # Fix lift to only go down + lift = min(lift, prev_lift) + + if aligned: + # First, check to see if we are close enough to grasp + if center_depth < self.median_distance_when_grasping: + print( + f"Center depth of {center_depth} is close enough to grasp; less than {self.median_distance_when_grasping}." + ) + self.info("Aligned and close enough to grasp.") success = self._grasp() break - # If we have a target mask, compute the median depth of the object - # Otherwise we will just try to grasp if we are close enough - assume we lost track! - if target_mask is not None: - object_depth = servo.ee_depth[target_mask] - median_object_depth = np.median(servo.ee_depth[target_mask]) / 1000 - else: - print("detected classes:", np.unique(servo.ee_semantic)) - if center_depth < self.median_distance_when_grasping: - success = self._grasp() - continue - - dx, dy = mask_center[1] - center_x, mask_center[0] - center_y - - # Is the center of the image part of the target mask or not? - center_in_mask = target_mask[int(center_y), int(center_x)] > 0 - # TODO: add deadband bubble around this? - - # Since we were able to detect it, copy over the target mask - prev_target_mask = target_mask - - # Are we aligned to the object - aligned = ( - np.abs(dx) < self.align_x_threshold and np.abs(dy) < self.align_y_threshold - ) - - print() - print("----- STEP VISUAL SERVOING -----") - print("Observed this many target mask points:", np.sum(target_mask.flatten())) - print("failed =", failed_counter, "/", self.max_failed_attempts) - print("cur x =", base_x) - print(" lift =", lift) - print(" arm =", arm) - print("pitch =", wrist_pitch) - print(f"base_x={base_x}, wrist_pitch={wrist_pitch}, dx={dx}, dy={dy}") - print(f"Median distance to object is {median_object_depth}.") - print(f"Center distance to object is {center_depth}.") - print("Center in mask?", center_in_mask) - print("Current XYZ:", current_xyz) - print("Aligned?", aligned) - - # Fix lift to only go down - lift = min(lift, prev_lift) - - if aligned: - # First, check to see if we are close enough to grasp - if center_depth < self.median_distance_when_grasping: - print( - f"Center depth of {center_depth} is close enough to grasp; less than {self.median_distance_when_grasping}." 
- ) - self.info("Aligned and close enough to grasp.") - success = self._grasp() - break - # If we are aligned, step the whole thing closer by some amount - # This is based on the pitch - basically - aligned_once = True - arm_component = np.cos(wrist_pitch) * self.lift_arm_ratio - lift_component = np.sin(wrist_pitch) * self.lift_arm_ratio - arm += arm_component - lift += lift_component - else: - # Add these to do some really hacky proportionate control - px = max(0.25, np.abs(2 * dx / target_mask.shape[1])) - py = max(0.25, np.abs(2 * dy / target_mask.shape[0])) - - # Move the base and modify the wrist pitch - # TODO: remove debug code - # print(f"dx={dx}, dy={dy}, px={px}, py={py}") - if dx > self.align_x_threshold: - # Move in x - this means translate the base - base_x += -self.base_x_step * px - elif dx < -1 * self.align_x_threshold: - base_x += self.base_x_step * px - if dy > self.align_y_threshold: - # Move in y - this means translate the base - wrist_pitch += -self.wrist_pitch_step * py - elif dy < -1 * self.align_y_threshold: - wrist_pitch += self.wrist_pitch_step * py - - # Force to reacquire the target mask if we moved the camera too much - prev_target_mask = None + # If we are aligned, step the whole thing closer by some amount + # This is based on the pitch - basically + aligned_once = True + arm_component = np.cos(wrist_pitch) * self.lift_arm_ratio + lift_component = np.sin(wrist_pitch) * self.lift_arm_ratio + arm += arm_component + lift += lift_component + else: + # Add these to do some really hacky proportionate control + px = max(0.25, np.abs(2 * dx / target_mask.shape[1])) + py = max(0.25, np.abs(2 * dy / target_mask.shape[0])) + + # Move the base and modify the wrist pitch + # TODO: remove debug code + # print(f"dx={dx}, dy={dy}, px={px}, py={py}") + if dx > self.align_x_threshold: + # Move in x - this means translate the base + base_x += -self.base_x_step * px + elif dx < -1 * self.align_x_threshold: + base_x += self.base_x_step * px + if dy > self.align_y_threshold: + # Move in y - this means translate the base + wrist_pitch += -self.wrist_pitch_step * py + elif dy < -1 * self.align_y_threshold: + wrist_pitch += self.wrist_pitch_step * py + + # Force to reacquire the target mask if we moved the camera too much + prev_target_mask = None # safety checks q = [ From 801109fb23575b33dac08a7c21dfbf7011060d6a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 13:54:59 -0400 Subject: [PATCH 167/410] increase pregrasp distance from object to better servo to it --- src/stretch/agent/operations/grasp_object.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index faba34f3a..36bf875f9 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -65,7 +65,7 @@ class GraspObjectOperation(ManagedOperation): align_y_threshold: int = 15 # pregrasp_distance_from_object: float = 0.075 - pregrasp_distance_from_object: float = 0.1 + pregrasp_distance_from_object: float = 0.2 # This is the distance at which we close the gripper when visual servoing # median_distance_when_grasping: float = 0.175 @@ -354,7 +354,10 @@ def visual_servo_to_object( instance.get_center(), distance_from_object=self.pregrasp_distance_from_object ) pregrasp_done = True - input("PREGRASP DONE") + + # Give a short pause here to make sure ee image is up to date + time.sleep(0.5) + self.warn("Starting visual servoing.") # Main loop - run 
unless we time out, blocking. while timeit.default_timer() - t0 < max_duration: From 4066ba27ae14cca4028b21f4f2f14fa2fbd24b9a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 13:58:34 -0400 Subject: [PATCH 168/410] looking up mask open-vocab by features instead of by weird class stuff that's often wrong --- src/stretch/agent/operations/grasp_object.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 36bf875f9..eda6f4423 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -196,16 +196,16 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: if iid < 0: continue - breakpoint() - rgb = servo.ee_rgb[servo.instance == iid] - import matplotlib.pyplot as plt + rgb = servo.ee_rgb * (servo.instance == iid)[:, :, None].repeat(3, axis=-1) - plt.imshow(rgb) - plt.show() + # TODO: remove debug code + # import matplotlib.pyplot as plt + # plt.imshow(rgb) + # plt.show() features = self.agent.encode_image(rgb) score = self.agent.compare_features(text_features, features) - print(f"Score for {iid} is {score}") + print(f" - Score for {iid} is {score}") if score > best_score: best_score = score best_iid = iid From 6df034e7c42ca90117968f4562534a460e6d61aa Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 14:03:11 -0400 Subject: [PATCH 169/410] tweak this --- src/stretch/agent/operations/grasp_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index eda6f4423..cb6c87c8f 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -303,7 +303,7 @@ def _grasp(self) -> bool: # Move the arm in closer self.robot.arm_to( - [base_x, lift - 0.05, arm + 0.1, 0, wrist_pitch, 0], + [base_x, lift - 0.1, arm + 0.15, 0, wrist_pitch, 0], head=constants.look_at_ee, blocking=True, ) From 8017bf692cb92eb8eedb2288ecf833172057502b Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Mon, 16 Sep 2024 11:19:44 -0700 Subject: [PATCH 170/410] cleanup --- src/stretch/visualization/rerun.py | 34 ++++++++++++++------ src/stretch/visualization/urdf_visualizer.py | 30 ++++++++++++++--- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 854c2f7e6..4a422fb0b 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -99,7 +99,13 @@ def occupancy_map_to_3d_points( class StretchURDFLogger(urdf_visualizer.URDFVisualizer): - def log_robot_mesh(self, cfg, use_collision=True): + def log_robot_mesh(self, cfg: dict, use_collision: bool = True, debug: bool = False): + """ + Log robot mesh using joint configuration data + Args: + cfg (dict): Joint configuration + use_collision (bool): use collision mesh + """ lk_cfg = { "joint_wrist_yaw": cfg["wrist_yaw"], "joint_wrist_pitch": cfg["wrist_pitch"], @@ -115,14 +121,15 @@ def log_robot_mesh(self, cfg, use_collision=True): if "gripper" in cfg.keys(): lk_cfg["joint_gripper_finger_left"] = cfg["gripper"] lk_cfg["joint_gripper_finger_right"] = cfg["gripper"] - st = time.perf_counter() + t0 = timeit.default_timer() mesh = self.get_combined_robot_mesh(cfg=lk_cfg, use_collision=use_collision) - print(f"Time to get mesh: {1000*(time.perf_counter() - st)}") - st2 = time.perf_counter() + t1 = 
timeit.default_timer() + if debug: + print(f"Time to get mesh: {1000*(t1 - t1)} ms") # breakpoint() rr.set_time_seconds("realtime", time.time()) rr.log( - "world/robot/base_link", + "world/robot/mesh", rr.Mesh3D( vertex_positions=mesh.vertices, triangle_indices=mesh.faces, @@ -133,10 +140,18 @@ def log_robot_mesh(self, cfg, use_collision=True): ), timeless=True, ) - t2 = 1000 * (time.perf_counter() - st2) - print(f"Time to rr log mesh: {t2}") + t2 = 1000 * (timeit.default_timer() - t1) + if debug: + print(f"Time to rr log mesh: {t2}") + print(f"Total time to log mesh: {1000*(timeit.default_timer() - t0)} ms") - def log(self, obs, use_collision=True): + def log(self, obs, use_collision: bool = True, debug: bool = False): + """ + Log robot mesh using urdf visualizer to rerun + Args: + obs (dict): Observation dataclass + use_collision (bool): use collision mesh + """ state = obs["joint"] cfg = {} for k in HelloStretchIdx.name_to_idx: @@ -175,6 +190,7 @@ def __init__(self, display_robot_mesh: bool = True, open_browser: bool = True): self.setup_blueprint() def setup_blueprint(self): + """Setup the blueprint for the visualizer""" world_view = rrb.Spatial3DView(origin="world") my_blueprint = rrb.Blueprint( rrb.Horizontal( @@ -422,9 +438,7 @@ def step(self, obs, servo): self.log_robot_state(obs) if self.display_robot_mesh: - st = time.perf_counter() self.log_robot_mesh(obs) - print("Total time to log urdf: ", 1000 * (time.perf_counter() - st)) except Exception as e: print(e) diff --git a/src/stretch/visualization/urdf_visualizer.py b/src/stretch/visualization/urdf_visualizer.py index fbbbd5961..d2ea0e85e 100644 --- a/src/stretch/visualization/urdf_visualizer.py +++ b/src/stretch/visualization/urdf_visualizer.py @@ -14,6 +14,7 @@ import numpy as np import urchin as urdf_loader +from trimesh import Trimesh pkg_path = str(importlib_resources.files("stretch_urdf")) model_name = "SE3" # RE1V0, RE2V0, SE3 @@ -55,10 +56,18 @@ class URDFVisualizer: abs_urdf_file_path = get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) - def __init__(self, urdf_file=abs_urdf_file_path): - self.urdf = self.urdf = urdf_loader.URDF.load(urdf_file) + def __init__(self, urdf_file: str = abs_urdf_file_path): + self.urdf = urdf_loader.URDF.load(urdf_file) - def get_tri_meshes(self, cfg=None, use_collision=True) -> list: + def get_tri_meshes(self, cfg: dict = None, use_collision: bool = True) -> list: + """ + Get list of trimesh objects, pose and link names of the robot with the given configuration + Args: + cfg: Configuration of the robot + use_collision: Whether to use collision meshes or visual meshes + Returns: + list: List of trimesh objects, pose and link names of the robot with the given configuration + """ if use_collision: fk = self.urdf.collision_trimesh_fk(cfg=cfg) else: @@ -75,7 +84,15 @@ def get_tri_meshes(self, cfg=None, use_collision=True) -> list: ) return t_meshes - def get_combined_robot_mesh(self, cfg=None, use_collision=True): + def get_combined_robot_mesh(self, cfg: dict = None, use_collision: bool = True) -> Trimesh: + """ + Get an fully combined mesh of the robot with the given configuration + Args: + cfg: Configuration of the robot + use_collision: Whether to use collision meshes or visual meshes + Returns: + Trimesh: Fully combined mesh of the robot with the given configuration + """ tm = self.get_tri_meshes(cfg, use_collision) mesh_list = tm["mesh"] pose_list = np.array(tm["pose"]) @@ -88,6 +105,11 @@ def get_combined_robot_mesh(self, cfg=None, use_collision=True): def 
get_transform(self, cfg: dict, link_name: str) -> np.ndarray: """ Get transformation matrix of the link w.r.t. the base_link + Args: + cfg: Configuration of the robot + link_name: Name of the link + Returns: + Transformation matrix of the link w.r.t. the base_link """ lk_cfg = { "joint_wrist_yaw": cfg["wrist_yaw"], From cc90966de8316413d18db1c89a357ed3c9593087 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Mon, 16 Sep 2024 11:20:29 -0700 Subject: [PATCH 171/410] remove mujoco log --- MUJOCO_LOG.TXT | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 MUJOCO_LOG.TXT diff --git a/MUJOCO_LOG.TXT b/MUJOCO_LOG.TXT deleted file mode 100644 index f47575042..000000000 --- a/MUJOCO_LOG.TXT +++ /dev/null @@ -1,3 +0,0 @@ -Mon Aug 26 20:04:52 2024 -WARNING: Nan, Inf or huge value in QACC at DOF 0. The simulation is unstable. Time = 3.2700. - From 54463301647a3405088591776b44021345251f8b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 14:48:20 -0400 Subject: [PATCH 172/410] remove an input --- src/stretch/agent/operations/grasp_object.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index cb6c87c8f..9884b9c59 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -281,7 +281,6 @@ def get_target_mask( target_mask = mask target_mask_pts = num_pts - input("----") if maximum_overlap_pts > self.min_points_to_approach: return maximum_overlap_mask if target_mask is not None: From 23583fd09e8268675a2534697f06d8d81bba3603 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Mon, 16 Sep 2024 11:51:05 -0700 Subject: [PATCH 173/410] improve rerun vis settings --- src/stretch/visualization/rerun.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 4a422fb0b..2cb81b224 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -191,7 +191,6 @@ def __init__(self, display_robot_mesh: bool = True, open_browser: bool = True): def setup_blueprint(self): """Setup the blueprint for the visualizer""" - world_view = rrb.Spatial3DView(origin="world") my_blueprint = rrb.Blueprint( rrb.Horizontal( rrb.Spatial3DView(name="3D View", origin="world"), @@ -199,8 +198,9 @@ def setup_blueprint(self): rrb.Spatial2DView(name="head_rgb", origin="/world/head_camera"), rrb.Spatial2DView(name="ee_rgb", origin="/world/ee_camera"), ), - rrb.TimePanel(expanded=True), - rrb.SelectionPanel(expanded=False), + rrb.TimePanel(state=True), + rrb.SelectionPanel(state=True), + rrb.BlueprintPanel(state=True), ), collapse_panels=True, ) @@ -218,7 +218,7 @@ def log_head_camera(self, obs): rr.Pinhole( resolution=[obs["rgb"].shape[1], obs["rgb"].shape[0]], image_from_camera=obs["camera_K"], - image_plane_distance=0.35, + image_plane_distance=0.25, ), ) @@ -275,7 +275,7 @@ def log_ee_camera(self, servo): rr.Pinhole( resolution=[servo.ee_rgb.shape[1], servo.ee_rgb.shape[0]], image_from_camera=servo.ee_camera_K, - image_plane_distance=0.35, + image_plane_distance=0.25, ), ) From d0184f2b2d00d0b5a5872eb886f2b1c887420cbe Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 15:00:13 -0400 Subject: [PATCH 174/410] update grasp object to make sure we actually move closer when aligned iwth an object --- src/stretch/agent/operations/grasp_object.py | 88 ++++++++++---------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git 
a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 9884b9c59..b11da0b83 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -65,7 +65,7 @@ class GraspObjectOperation(ManagedOperation): align_y_threshold: int = 15 # pregrasp_distance_from_object: float = 0.075 - pregrasp_distance_from_object: float = 0.2 + pregrasp_distance_from_object: float = 0.25 # This is the distance at which we close the gripper when visual servoing # median_distance_when_grasping: float = 0.175 @@ -502,6 +502,7 @@ def visual_servo_to_object( print(" lift =", lift) print(" arm =", arm) print("pitch =", wrist_pitch) + print("Center depth:", center_depth) print(f"base_x={base_x}, wrist_pitch={wrist_pitch}, dx={dx}, dy={dy}") print(f"Median distance to object is {median_object_depth}.") print(f"Center distance to object is {center_depth}.") @@ -521,6 +522,7 @@ def visual_servo_to_object( self.info("Aligned and close enough to grasp.") success = self._grasp() break + # If we are aligned, step the whole thing closer by some amount # This is based on the pitch - basically aligned_once = True @@ -547,53 +549,53 @@ def visual_servo_to_object( elif dy < -1 * self.align_y_threshold: wrist_pitch += self.wrist_pitch_step * py - # Force to reacquire the target mask if we moved the camera too much - prev_target_mask = None - - # safety checks - q = [ - base_x, - 0.0, - 0.0, - lift, - arm, - 0.0, - 0.0, - wrist_pitch, - 0.0, - 0.0, - 0.0, - ] # 11 DOF: see HelloStretchIdx - - q = np.array(q) - + # Force to reacquire the target mask if we moved the camera too much + prev_target_mask = None + + # safety checks + q = [ + base_x, + 0.0, + 0.0, + lift, + arm, + 0.0, + 0.0, + wrist_pitch, + 0.0, + 0.0, + 0.0, + ] # 11 DOF: see HelloStretchIdx + + q = np.array(q) + + ee_pos, ee_quat = self.robot_model.manip_fk(q) + + while ee_pos[2] < 0.03: + lift += 0.01 + q[HelloStretchIdx.LIFT] = lift ee_pos, ee_quat = self.robot_model.manip_fk(q) - while ee_pos[2] < 0.03: - lift += 0.01 - q[HelloStretchIdx.LIFT] = lift - ee_pos, ee_quat = self.robot_model.manip_fk(q) - - print("tgt x =", base_x) - print(" lift =", lift) - print(" arm =", arm) - print("pitch =", wrist_pitch) + print("tgt x =", base_x) + print(" lift =", lift) + print(" arm =", arm) + print("pitch =", wrist_pitch) - self.robot.arm_to( - [base_x, lift, arm, 0, wrist_pitch, 0], - head=constants.look_at_ee, - blocking=False, - ) - prev_lift = lift - time.sleep(self.expected_network_delay) + self.robot.arm_to( + [base_x, lift, arm, 0, wrist_pitch, 0], + head=constants.look_at_ee, + blocking=False, + ) + prev_lift = lift + time.sleep(self.expected_network_delay) - # check not moving - if np.linalg.norm(q - q_last) < 0.05: # TODO: tune - not_moving_count += 1 - else: - not_moving_count = 0 + # check not moving + if np.linalg.norm(q - q_last) < 0.05: # TODO: tune + not_moving_count += 1 + else: + not_moving_count = 0 - q_last = q + q_last = q if random_motion_counter > self.max_random_motions: self.error("Failed to align to object after 10 random motions.") From 711294d42399a649cb0c7e46e3a4bead4f03a073 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 15:18:59 -0400 Subject: [PATCH 175/410] debugging and tweaking various parameters to try to improve performance --- src/stretch/agent/operations/grasp_object.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py 
b/src/stretch/agent/operations/grasp_object.py index b11da0b83..f34228a3b 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -69,7 +69,8 @@ class GraspObjectOperation(ManagedOperation): # This is the distance at which we close the gripper when visual servoing # median_distance_when_grasping: float = 0.175 - median_distance_when_grasping: float = 0.15 + # median_distance_when_grasping: float = 0.15 + median_distance_when_grasping: float = 0.2 # Movement parameters lift_arm_ratio: float = 0.08 @@ -82,6 +83,10 @@ class GraspObjectOperation(ManagedOperation): # Parameters about how to grasp - less important grasp_loose: bool = False reset_observation: bool = False + # Move the arm forward by this amount when grasping + _grasp_arm_offset: float = 0.13 + # Move the arm down by this amount when grasping + _grasp_lift_offset: float = -0.13 # Visual servoing config track_image_center: bool = False @@ -302,7 +307,14 @@ def _grasp(self) -> bool: # Move the arm in closer self.robot.arm_to( - [base_x, lift - 0.1, arm + 0.15, 0, wrist_pitch, 0], + [ + base_x, + lift + self._grasp_lift_offset, + arm + self._grasp_arm_offset, + 0, + wrist_pitch, + 0, + ], head=constants.look_at_ee, blocking=True, ) From 6e4f217bd9d7e74adb019233bc489b90a38418fe Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 15:40:29 -0400 Subject: [PATCH 176/410] updates --- src/stretch/agent/operations/grasp_object.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index f34228a3b..bc47ba4ef 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -86,7 +86,7 @@ class GraspObjectOperation(ManagedOperation): # Move the arm forward by this amount when grasping _grasp_arm_offset: float = 0.13 # Move the arm down by this amount when grasping - _grasp_lift_offset: float = -0.13 + _grasp_lift_offset: float = -0.10 # Visual servoing config track_image_center: bool = False @@ -309,7 +309,7 @@ def _grasp(self) -> bool: self.robot.arm_to( [ base_x, - lift + self._grasp_lift_offset, + np.clip(lift + self._grasp_lift_offset, 0, 0.5), arm + self._grasp_arm_offset, 0, wrist_pitch, From 3358110507b9096cede460b40d123ba2f473a9be Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 16:37:19 -0400 Subject: [PATCH 177/410] more parameter tweaks --- src/stretch/agent/operations/grasp_object.py | 10 ++++++++-- src/stretch/agent/operations/place_object.py | 2 ++ src/stretch/agent/operations/pregrasp.py | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index bc47ba4ef..5a029a4db 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -61,6 +61,8 @@ class GraspObjectOperation(ManagedOperation): # align_x_threshold: int = 10 # align_y_threshold: int = 7 # These are the values used to decide when it's aligned enough to grasp + # align_x_threshold: int = 15 + # align_y_threshold: int = 15 align_x_threshold: int = 15 align_y_threshold: int = 15 @@ -71,6 +73,8 @@ class GraspObjectOperation(ManagedOperation): # median_distance_when_grasping: float = 0.175 # median_distance_when_grasping: float = 0.15 median_distance_when_grasping: float = 0.2 + lift_min_height: float = 0.1 + lift_max_height: float = 0.5 # Movement parameters lift_arm_ratio: float 
= 0.08 @@ -169,6 +173,7 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: """ mask = np.zeros_like(servo.semantic).astype(bool) # type: ignore + print("[GRASP OBJECT] match method =", self.match_method) if self.match_method == "class": # Get the target class @@ -185,7 +190,6 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: name = self.agent.semantic_sensor.get_class_name_for_id(iid) if name is not None and target_class in name: mask = np.bitwise_or(mask, servo.semantic == iid) - elif self.match_method == "feature": if self.target_object is None: raise ValueError( @@ -309,7 +313,9 @@ def _grasp(self) -> bool: self.robot.arm_to( [ base_x, - np.clip(lift + self._grasp_lift_offset, 0, 0.5), + np.clip( + lift + self._grasp_lift_offset, self.lift_min_height, self.lift_max_height + ), arm + self._grasp_arm_offset, 0, wrist_pitch, diff --git a/src/stretch/agent/operations/place_object.py b/src/stretch/agent/operations/place_object.py index b6f19c15c..5bd16d4f0 100644 --- a/src/stretch/agent/operations/place_object.py +++ b/src/stretch/agent/operations/place_object.py @@ -147,6 +147,8 @@ def run(self) -> None: # Switch to place position print(" - Move to manip posture") self.robot.move_to_manip_posture() + self.robot.switch_to_manipulation_mode() + # Get object xyz coords xyt = self.robot.get_base_pose() placement_xyz = self.sample_placement_position(xyt) diff --git a/src/stretch/agent/operations/pregrasp.py b/src/stretch/agent/operations/pregrasp.py index e5d8db7b6..5b2ff0edf 100644 --- a/src/stretch/agent/operations/pregrasp.py +++ b/src/stretch/agent/operations/pregrasp.py @@ -87,6 +87,7 @@ def run(self): # Strip out fields from the full robot state to only get the 6dof manipulator state # TODO: we should probably handle this in the zmq wrapper. # arm_cmd = conversions.config_to_manip_command(joint_state) + self.robot.switch_to_manipulation_mode() self.robot.arm_to(joint_state, blocking=True) def was_successful(self): From efe62729e06676974ea2e885ca5c6b90a72bd577 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 16:38:21 -0400 Subject: [PATCH 178/410] updates to grasping code - new grasp script that uses just head camera --- .../agent/operations/grasp_closed_loop.py | 546 ------------------ .../agent/operations/grasp_open_loop.py | 210 +++++++ 2 files changed, 210 insertions(+), 546 deletions(-) delete mode 100644 src/stretch/agent/operations/grasp_closed_loop.py create mode 100644 src/stretch/agent/operations/grasp_open_loop.py diff --git a/src/stretch/agent/operations/grasp_closed_loop.py b/src/stretch/agent/operations/grasp_closed_loop.py deleted file mode 100644 index 0b98f38da..000000000 --- a/src/stretch/agent/operations/grasp_closed_loop.py +++ /dev/null @@ -1,546 +0,0 @@ -# Copyright (c) Hello Robot, Inc. -# All rights reserved. -# -# This source code is licensed under the license found in the LICENSE file in the root directory -# of this source tree. -# -# Some code may be adapted from other open-source works with their respective licenses. Original -# license information maybe found below, if so. 
- -import time -import timeit -from typing import Optional, Tuple - -import cv2 -import numpy as np - -import stretch.motion.constants as constants -from stretch.agent.base import ManagedOperation -from stretch.core.interfaces import Observations -from stretch.mapping.instance import Instance -from stretch.motion.kinematics import HelloStretchIdx -from stretch.utils.filters import MaskTemporalFilter -from stretch.utils.geometry import point_global_to_base -from stretch.utils.gripper import GripperArucoDetector -from stretch.utils.point_cloud import show_point_cloud - - -class ClosedLoopGraspObjectOperation(ManagedOperation): - """Move the robot to grasp, using the end effector camera.""" - - use_pitch_from_vertical: bool = True - lift_distance: float = 0.2 - servo_to_grasp: bool = False - _success: bool = False - - # Task information - target_object: Optional[str] = None - - # Debugging UI elements - show_object_to_grasp: bool = False - show_servo_gui: bool = True - show_point_cloud: bool = False - - # Thresholds for centering on object - align_x_threshold: int = 10 - align_y_threshold: int = 7 - grasp_loose: bool = False - reset_observation: bool = False - - # Visual servoing config - track_image_center: bool = False - gripper_aruco_detector: GripperArucoDetector = None - min_points_to_approach: int = 100 - detected_center_offset_y: int = -40 - median_distance_when_grasping: float = 0.175 - percentage_of_image_when_grasping: float = 0.2 - open_loop_z_offset: float = -0.1 - open_loop_x_offset: float = -0.1 - max_failed_attempts: int = 10 - - # Movement parameters - lift_arm_ratio: float = 0.08 - base_x_step: float = 0.10 - wrist_pitch_step: float = 0.075 - - # Timing issues - expected_network_delay = 0.4 - open_loop: bool = False - - # Observation memory - observations = MaskTemporalFilter( - observation_history_window_size_secs=5.0, observation_history_window_size_n=3 - ) - - def configure( - self, - target_object: str, - show_object_to_grasp: bool = False, - servo_to_grasp: bool = False, - show_servo_gui: bool = True, - show_point_cloud: bool = False, - reset_observation: bool = False, - grasp_loose: bool = False, - ): - """Configure the operation with the given keyword arguments. - - Args: - show_object_to_grasp (bool, optional): Show the object to grasp. Defaults to False. - servo_to_grasp (bool, optional): Use visual servoing to grasp. Defaults to False. - show_servo_gui (bool, optional): Show the servo GUI. Defaults to True. - show_point_cloud (bool, optional): Show the point cloud. Defaults to False. - reset_observation (bool, optional): Reset the observation. Defaults to False. - grasp_loose (bool, optional): Grasp loosely. Useful for grasping some objects like cups. Defaults to False. - """ - self.target_object = target_object - self.show_object_to_grasp = show_object_to_grasp - self.servo_to_grasp = servo_to_grasp - self.show_servo_gui = show_servo_gui - self.show_point_cloud = show_point_cloud - self.reset_observation = reset_observation - self.grasp_loose = grasp_loose - - def _debug_show_point_cloud(self, servo: Observations, current_xyz: np.ndarray) -> None: - """Show the point cloud for debugging purposes. 
- - Args: - servo (Observations): Servo observation - current_xyz (np.ndarray): Current xyz location - """ - # TODO: remove this, overrides existing servo state - # servo = self.robot.get_servo_observation() - world_xyz = servo.get_ee_xyz_in_world_frame() - world_xyz_head = servo.get_xyz_in_world_frame() - all_xyz = np.concatenate([world_xyz_head.reshape(-1, 3), world_xyz.reshape(-1, 3)], axis=0) - all_rgb = np.concatenate([servo.rgb.reshape(-1, 3), servo.ee_rgb.reshape(-1, 3)], axis=0) - show_point_cloud(all_xyz, all_rgb / 255, orig=current_xyz) - - def can_start(self): - """Grasping can start if we have a target object picked out, and are moving to its instance, and if the robot is ready to begin manipulation.""" - if self.target_object is None: - self.error("No target object set.") - return False - return self.agent.current_object is not None and self.robot.in_manipulation_mode() - - def get_class_mask(self, servo: Observations) -> np.ndarray: - """Get the mask for the class of the object we are trying to grasp. Multiple options might be acceptable. - - Args: - servo (Observations): Servo observation - - Returns: - np.ndarray: Mask for the class of the object we are trying to grasp - """ - mask = np.zeros_like(servo.semantic).astype(bool) # type: ignore - for iid in np.unique(servo.semantic): - name = self.agent.semantic_sensor.get_class_name_for_id(iid) - if name is not None and self.target_object in name: - mask = np.bitwise_or(mask, servo.semantic == iid) - return mask - - def set_target_object_class(self, target_object: str): - """Set the target object class. - - Args: - target_object (str): Target object class - """ - self.target_object = target_object - - def get_target_mask( - self, - servo: Observations, - instance: Instance, - center: Tuple[int, int], - prev_mask: Optional[np.ndarray] = None, - ) -> Optional[np.ndarray]: - """Get target mask to move to. If we do not provide the mask from the previous step, we will simply find the mask with the most points of the correct class. Otherwise, we will try to find the mask that most overlaps with the previous mask. There are two options here: one where we simply find the mask with the most points, and another where we try to find the mask that most overlaps with the previous mask. This is in case we are losing track of particular objects and getting classes mixed up. - - Args: - servo (Observations): Servo observation - instance (Instance): Instance we are trying to grasp - prev_mask (Optional[np.ndarray], optional): Mask from the previous step. Defaults to None. - - Returns: - Optional[np.ndarray]: Target mask to move to - """ - # Find the best masks - class_mask = self.get_class_mask(servo) - instance_mask = servo.instance - if servo.ee_xyz is None: - servo.compute_ee_xyz() - - target_mask = None - target_mask_pts = float("-inf") - maximum_overlap_mask = None - maximum_overlap_pts = float("-inf") - center_x, center_y = center - for iid in np.unique(instance_mask): - current_instance_mask = instance_mask == iid - - # If we are centered on the mask and it's the right class, just go for it - if class_mask[center_y, center_x] > 0 and current_instance_mask[center_y, center_x] > 0: - # This is the correct one - it's centered and the right class. Just go there. - print("!!! 
CENTERED ON THE RIGHT OBJECT !!!") - return current_instance_mask - - # Option 2 - try to find the map that most overlapped with what we were just trying to grasp - # This is in case we are losing track of particular objects and getting classes mixed up - if prev_mask is not None: - # Find the mask with the most points - mask = np.bitwise_and(current_instance_mask, prev_mask) - mask = np.bitwise_and(mask, class_mask) - num_pts = sum(mask.flatten()) - - if num_pts > maximum_overlap_pts: - maximum_overlap_pts = num_pts - maximum_overlap_mask = mask - - # Simply find the mask with the most points - mask = np.bitwise_and(current_instance_mask, class_mask) - num_pts = sum(mask.flatten()) - if num_pts > target_mask_pts: - target_mask = mask - target_mask_pts = num_pts - - if maximum_overlap_pts > self.min_points_to_approach: - return maximum_overlap_mask - if target_mask is not None: - return target_mask - else: - return prev_mask - - def _grasp(self) -> bool: - """Helper function to close gripper around object.""" - self.cheer("Grasping object!") - self.robot.close_gripper(loose=self.grasp_loose, blocking=True) - time.sleep(0.5) - - # Get a joint state for the object - joint_state = self.robot.get_joint_positions() - - # Lifted joint state - lifted_joint_state = joint_state.copy() - lifted_joint_state[HelloStretchIdx.LIFT] += 0.2 - self.robot.arm_to(lifted_joint_state, head=constants.look_at_ee, blocking=True) - return True - - def visual_servo_to_object( - self, instance: Instance, max_duration: float = 120.0, max_not_moving_count: int = 10 - ) -> bool: - """Use visual servoing to grasp the object.""" - - self.intro(f"Visual servoing to grasp object {instance.global_id} {instance.category_id=}.") - if self.show_servo_gui: - self.warn("If you want to stop the visual servoing with the GUI up, press 'q'.") - - t0 = timeit.default_timer() - aligned_once = False - pregrasp_done = False - prev_target_mask = None - success = False - prev_lift = float("Inf") - - # Track the fingertips using aruco markers - if self.gripper_aruco_detector is None: - self.gripper_aruco_detector = GripperArucoDetector() - - # Track the last object location and the number of times we've failed to grasp - current_xyz = None - failed_counter = 0 - not_moving_count = 0 - q_last = np.array([0.0 for _ in range(11)]) # 11 DOF, HelloStretchIdx - - # Main loop - run unless we time out, blocking. 
- while timeit.default_timer() - t0 < max_duration: - - # Get servo observation - servo = self.robot.get_servo_observation() - joint_state = self.robot.get_joint_positions() - world_xyz = servo.get_ee_xyz_in_world_frame() - - if not self.open_loop: - # Now compute what to do - base_x = joint_state[HelloStretchIdx.BASE_X] - wrist_pitch = joint_state[HelloStretchIdx.WRIST_PITCH] - arm = joint_state[HelloStretchIdx.ARM] - lift = joint_state[HelloStretchIdx.LIFT] - - # Compute the center of the image that we will be tracking - if self.track_image_center: - center_x, center_y = servo.ee_rgb.shape[1] // 2, servo.ee_rgb.shape[0] // 2 - else: - center = self.gripper_aruco_detector.detect_center(servo.ee_rgb) - if center is not None: - center_y, center_x = np.round(center).astype(int) - center_y += self.detected_center_offset_y - else: - center_x, center_y = servo.ee_rgb.shape[1] // 2, servo.ee_rgb.shape[0] // 2 - - # add offset to center - center_x -= 10 # move closer to top - - # Run semantic segmentation on it - servo = self.agent.semantic_sensor.predict(servo, ee=True) - latest_mask = self.get_target_mask( - servo, instance, prev_mask=prev_target_mask, center=(center_x, center_y) - ) - - # dilate mask - kernel = np.ones((3, 3), np.uint8) - mask_np = latest_mask.astype(np.uint8) - dilated_mask = cv2.dilate(mask_np, kernel, iterations=1) - latest_mask = dilated_mask.astype(bool) - - # push to history - self.observations.push_mask_to_observation_history( - observation=latest_mask, - timestamp=time.time(), - mask_size_threshold=self.min_points_to_approach, - acquire_lock=True, - ) - - target_mask = self.observations.get_latest_observation() - if target_mask is None: - target_mask = np.zeros([servo.ee_rgb.shape[0], servo.ee_rgb.shape[1]], dtype=bool) - - # Get depth - center_depth = servo.ee_depth[center_y, center_x] / 1000 - - # Compute the center of the mask in image coords - mask_center = self.observations.get_latest_centroid() - if mask_center is None: - # if not aligned_once: - # self.error( - # "Lost track before even seeing object with EE camera. Just try open loop." - # ) - # if self.show_servo_gui: - # cv2.destroyAllWindows() - # return False - if failed_counter < self.max_failed_attempts: - mask_center = np.array([center_y, center_x]) - else: - # If we are aligned, but we lost the object, just try to grasp it - self.error(f"Lost track. Trying to grasp at {current_xyz}.") - if current_xyz is not None: - current_xyz[0] += self.open_loop_x_offset - current_xyz[2] += self.open_loop_z_offset - if self.show_servo_gui: - cv2.destroyAllWindows() - return self.grasp_open_loop(current_xyz) - else: - failed_counter = 0 - mask_center = mask_center.astype(int) - assert ( - world_xyz.shape[0] == servo.semantic.shape[0] - and world_xyz.shape[1] == servo.semantic.shape[1] - ), "World xyz shape does not match semantic shape." 
- current_xyz = world_xyz[int(mask_center[0]), int(mask_center[1])] - if self.show_point_cloud: - self._debug_show_point_cloud(servo, current_xyz) - - # Optionally display which object we are servoing to - if self.show_servo_gui: - servo_ee_rgb = cv2.cvtColor(servo.ee_rgb, cv2.COLOR_RGB2BGR) - mask = target_mask.astype(np.uint8) * 255 - mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR) - mask[:, :, 0] = 0 - servo_ee_rgb = cv2.addWeighted(servo_ee_rgb, 0.5, mask, 0.5, 0, servo_ee_rgb) - # Draw the center of the image - servo_ee_rgb = cv2.circle(servo_ee_rgb, (center_x, center_y), 5, (255, 0, 0), -1) - # Draw the center of the mask - servo_ee_rgb = cv2.circle( - servo_ee_rgb, (int(mask_center[1]), int(mask_center[0])), 5, (0, 255, 0), -1 - ) - cv2.imshow("servo_ee_rgb", servo_ee_rgb) - cv2.waitKey(1) - res = cv2.waitKey(1) & 0xFF # 0xFF is a mask to get the last 8 bits - if res == ord("q"): - break - - if not pregrasp_done and current_xyz is not None: - self.pregrasp_open_loop(current_xyz, distance_from_object=0.075) - pregrasp_done = True - else: - # check not moving threshold - if not_moving_count > max_not_moving_count: - success = self._grasp() - break - # If we have a target mask, compute the median depth of the object - # Otherwise we will just try to grasp if we are close enough - assume we lost track! - if target_mask is not None: - object_depth = servo.ee_depth[target_mask] - median_object_depth = np.median(servo.ee_depth[target_mask]) / 1000 - else: - print("detected classes:", np.unique(servo.ee_semantic)) - if center_depth < self.median_distance_when_grasping: - success = self._grasp() - continue - - dx, dy = mask_center[1] - center_x, mask_center[0] - center_y - - # Is the center of the image part of the target mask or not? - center_in_mask = target_mask[int(center_y), int(center_x)] > 0 - # TODO: add deadband bubble around this? 
- - # Since we were able to detect it, copy over the target mask - prev_target_mask = target_mask - - print() - print("----- STEP VISUAL SERVOING -----") - print("Observed this many target mask points:", np.sum(target_mask.flatten())) - print("failed =", failed_counter, "/", self.max_failed_attempts) - print("cur x =", base_x) - print(" lift =", lift) - print(" arm =", arm) - print("pitch =", wrist_pitch) - print(f"base_x={base_x}, wrist_pitch={wrist_pitch}, dx={dx}, dy={dy}") - print(f"Median distance to object is {median_object_depth}.") - print(f"Center distance to object is {center_depth}.") - print("Center in mask?", center_in_mask) - print("Current XYZ:", current_xyz) - - aligned = ( - np.abs(dx) < self.align_x_threshold and np.abs(dy) < self.align_y_threshold - ) - - # Fix lift to only go down - lift = min(lift, prev_lift) - - if aligned: - # First, check to see if we are close enough to grasp - if center_depth < self.median_distance_when_grasping: - success = self._grasp() - break - # If we are aligned, step the whole thing closer by some amount - # This is based on the pitch - basically - aligned_once = True - arm_component = np.cos(wrist_pitch) * self.lift_arm_ratio - lift_component = np.sin(wrist_pitch) * self.lift_arm_ratio - arm += arm_component - lift += lift_component - else: - # Add these to do some really hacky proportionate control - px = max(0.25, np.abs(2 * dx / target_mask.shape[1])) - py = max(0.25, np.abs(2 * dy / target_mask.shape[0])) - - # Move the base and modify the wrist pitch - # TODO: remove debug code - # print(f"dx={dx}, dy={dy}, px={px}, py={py}") - if dx > self.align_x_threshold: - # Move in x - this means translate the base - base_x += -self.base_x_step * px - elif dx < -1 * self.align_x_threshold: - base_x += self.base_x_step * px - if dy > self.align_y_threshold: - # Move in y - this means translate the base - wrist_pitch += -self.wrist_pitch_step * py - elif dy < -1 * self.align_y_threshold: - wrist_pitch += self.wrist_pitch_step * py - - # Force to reacquire the target mask if we moved the camera too much - prev_target_mask = None - - # safety checks - q = [ - base_x, - 0.0, - 0.0, - lift, - arm, - 0.0, - 0.0, - wrist_pitch, - 0.0, - 0.0, - 0.0, - ] # 11 DOF: see HelloStretchIdx - - q = np.array(q) - - ee_pos, ee_quat = self.robot_model.manip_fk(q) - - while ee_pos[2] < 0.03: - lift += 0.01 - q[HelloStretchIdx.LIFT] = lift - ee_pos, ee_quat = self.robot_model.manip_fk(q) - - print("tgt x =", base_x) - print(" lift =", lift) - print(" arm =", arm) - print("pitch =", wrist_pitch) - - self.robot.arm_to( - [base_x, lift, arm, 0, wrist_pitch, 0], - head=constants.look_at_ee, - blocking=False, - ) - prev_lift = lift - time.sleep(self.expected_network_delay) - - # check not moving - if np.linalg.norm(q - q_last) < 0.05: # TODO: tune - not_moving_count += 1 - else: - not_moving_count = 0 - - q_last = q - - if self.show_servo_gui: - cv2.destroyAllWindows() - return success - - def run(self) -> None: - self.intro("Grasping the object.") - self._success = False - if self.show_object_to_grasp: - self.show_instance(self.agent.current_object) - - assert self.target_object is not None, "Target object must be set before running." 
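For reference, the alignment update that appears both in the closed-loop operation being deleted here and in the retained GraspObjectOperation edits above reads as a small proportional controller on pixel error. A minimal standalone sketch follows; the function name, argument list, and default gains are illustrative assumptions, not the library API:

# Illustrative sketch (not part of the patch): one proportional alignment step of the
# visual-servo grasp loop. Thresholds and step sizes mirror the defaults above but are
# written here as plain arguments.
def servo_alignment_step(
    dx: float,                 # horizontal pixel error: mask center minus image center
    dy: float,                 # vertical pixel error
    image_shape,               # (height, width) of the end-effector image
    base_x: float,             # current base translation along x, in meters
    wrist_pitch: float,        # current wrist pitch, in radians
    align_x_threshold: float = 15.0,   # pixels
    align_y_threshold: float = 15.0,   # pixels
    base_x_step: float = 0.10,         # meters
    wrist_pitch_step: float = 0.075,   # radians
):
    """Return updated (base_x, wrist_pitch, aligned)."""
    aligned = abs(dx) < align_x_threshold and abs(dy) < align_y_threshold
    if aligned:
        # Caller then steps the gripper closer along the pitch direction instead.
        return base_x, wrist_pitch, True
    # Proportional gains: larger pixel error -> larger step, floored at 0.25
    px = max(0.25, abs(2 * dx / image_shape[1]))
    py = max(0.25, abs(2 * dy / image_shape[0]))
    if dx > align_x_threshold:
        base_x -= base_x_step * px            # object off to one side: translate the base
    elif dx < -align_x_threshold:
        base_x += base_x_step * px
    if dy > align_y_threshold:
        wrist_pitch -= wrist_pitch_step * py  # object above/below center: pitch the wrist
    elif dy < -align_y_threshold:
        wrist_pitch += wrist_pitch_step * py
    return base_x, wrist_pitch, False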
- - # Now we should be able to see the object if we orient gripper properly - # Get the end effector pose - obs = self.robot.get_observation() - joint_state = self.robot.get_joint_positions() - model = self.robot.get_robot_model() - - if joint_state[HelloStretchIdx.GRIPPER] < 0.0: - self.robot.open_gripper(blocking=True) - - # Get the current base pose of the robot - xyt = self.robot.get_base_pose() - - # Note that these are in the robot's current coordinate frame; they're not global coordinates, so this is ok to use to compute motions. - object_xyz = self.agent.current_object.get_center() - relative_object_xyz = point_global_to_base(object_xyz, xyt) - - # Compute the angles necessary - if self.use_pitch_from_vertical: - ee_pos, ee_rot = model.manip_fk(joint_state) - dy = np.abs(ee_pos[1] - relative_object_xyz[1]) - dz = np.abs(ee_pos[2] - relative_object_xyz[2]) - pitch_from_vertical = np.arctan2(dy, dz) - else: - pitch_from_vertical = 0.0 - - # Compute final pregrasp joint state goal and send the robot there - joint_state[HelloStretchIdx.WRIST_PITCH] = -np.pi / 2 + pitch_from_vertical - self.robot.arm_to(joint_state, head=constants.look_at_ee, blocking=True) - - if self.servo_to_grasp: - # If we try to servo, then do this - self._success = self.visual_servo_to_object(self.agent.current_object) - - if not self._success: - self.grasp_open_loop(object_xyz) - - # clear observations - if self.reset_observation: - self.observations.clear_history() - self.agent.reset_object_plans() - self.agent.voxel_map.instances.pop_global_instance( - env_id=0, global_instance_id=self.agent.current_object.global_id - ) - - def was_successful(self) -> bool: - """Return true if successful""" - return self._success diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py new file mode 100644 index 000000000..313ab18ec --- /dev/null +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -0,0 +1,210 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
+ +import time + +import numpy as np + +import stretch.motion.constants as constants +from stretch.agent.base import ManagedOperation +from stretch.core.interfaces import Observations + +# from stretch.mapping.instance import Instance +from stretch.motion.kinematics import HelloStretchIdx +from stretch.utils.geometry import point_global_to_base +from stretch.utils.point_cloud import show_point_cloud + + +class OpenLoopGraspObjectOperation(ManagedOperation): + + debug_show_point_cloud: bool = False + match_method: str = "feature" + target_object: str = None + + def configure( + self, + target_object: str, + match_method: str = "feature", + debug_show_point_cloud: bool = False, + ) -> None: + self.target_object = target_object + self.match_method = match_method + self.debug_show_point_cloud = debug_show_point_cloud + + def can_start(self): + """Grasping can start if we have a target object picked out, and are moving to its instance, and if the robot is ready to begin manipulation.""" + if self.target_object is None: + self.error("No target object set.") + return False + return self.agent.current_object is not None and self.robot.in_manipulation_mode() + + def was_successful(self) -> bool: + """Return true if successful""" + return self._success + + def _debug_show_point_cloud(self, servo: Observations, current_xyz: np.ndarray) -> None: + """Show the point cloud for debugging purposes. + + Args: + servo (Observations): Servo observation + current_xyz (np.ndarray): Current xyz location + """ + # TODO: remove this, overrides existing servo state + # servo = self.robot.get_servo_observation() + # world_xyz = servo.get_ee_xyz_in_world_frame() + world_xyz_head = servo.get_xyz_in_world_frame() + # all_xyz = np.concatenate([world_xyz_head.reshape(-1, 3), world_xyz.reshape(-1, 3)], axis=0) + # all_rgb = np.concatenate([servo.rgb.reshape(-1, 3), servo.ee_rgb.reshape(-1, 3)], axis=0) + all_xyz = world_xyz_head.reshape(-1, 3) + all_rgb = servo.rgb.reshape(-1, 3) / 255 + show_point_cloud(all_xyz, all_rgb, orig=current_xyz) + + def get_class_mask(self, servo: Observations) -> np.ndarray: + """Get the mask for the class of the object we are trying to grasp. Multiple options might be acceptable. + + Args: + servo (Observations): Servo observation + + Returns: + np.ndarray: Mask for the class of the object we are trying to grasp + """ + mask = np.zeros_like(servo.semantic).astype(bool) # type: ignore + + print("[GRASP OBJECT] match method =", self.match_method) + if self.match_method == "class": + + # Get the target class + if self.agent.current_object is not None: + target_class_id = self.agent.current_object.category_id + target_class = self.agent.semantic_sensor.get_class_name_for_id(target_class_id) + else: + target_class = self.target_object + + print("[GRASP OBJECT] Detecting objects of class", target_class) + + # Now find the mask with that class + for iid in np.unique(servo.semantic): + name = self.agent.semantic_sensor.get_class_name_for_id(iid) + if name is not None and target_class in name: + mask = np.bitwise_or(mask, servo.semantic == iid) + elif self.match_method == "feature": + if self.target_object is None: + raise ValueError( + f"Target object must be set before running match method {self.match_method}." 
+ ) + print("[GRASP OBJECT] Detecting objects described as", self.target_object) + text_features = self.agent.encode_text(self.target_object) + best_score = float("-inf") + best_iid = None + for iid in np.unique(servo.instance): + + # Ignore the background + if iid < 0: + continue + + rgb = servo.ee_rgb * (servo.instance == iid)[:, :, None].repeat(3, axis=-1) + + # TODO: remove debug code + # import matplotlib.pyplot as plt + # plt.imshow(rgb) + # plt.show() + + features = self.agent.encode_image(rgb) + score = self.agent.compare_features(text_features, features) + print(f" - Score for {iid} is {score}") + if score > best_score: + best_score = score + best_iid = iid + mask = servo.instance == iid + else: + raise ValueError(f"Invalid matching method {self.match_method}.") + + return mask + + def run(self): + """Grasp based on data from the head camera only.""" + self.attempt("Grasping an object") + obs = self.robot.get_observation() + current_xyz = obs.get_xyz_in_world_frame() + current_xyz_base = point_global_to_base(current_xyz, self.robot.get_base_position()) + + # Find the best object mask + mask = self.get_class_mask(obs) + + # If the mask is empty, just use blind grasp based on instance center + if np.sum(mask) == 0: + self.warn("No mask found, using blind grasp.") + object_xyz = self.agent.current_object.get_center() + else: + # Find the object location + object_points = current_xyz[mask] + object_xyz = np.mean(object_points, axis=0) + self.info(f"Object location: {object_xyz}") + + if self.debug_show_point_cloud: + self._debug_show_point_cloud(obs, object_xyz) + + # Grasp the object + self.grasp_open_loop(object_xyz) + + def grasp_open_loop(self, object_xyz: np.ndarray): + """Grasp the object in an open loop manner. We will just move to object_xyz and close the gripper. + + Args: + object_xyz (np.ndarray): Location to grasp + + Returns: + bool: True if successful, False otherwise + """ + + model = self.robot.get_robot_model() + xyt = self.robot.get_base_pose() + relative_object_xyz = point_global_to_base(object_xyz, xyt) + joint_state = self.robot.get_joint_positions() + + # We assume the current end-effector orientation is the correct one, going into this + ee_pos, ee_rot = model.manip_fk(joint_state) + + # If we failed, or if we are not servoing, then just move to the object + target_joint_positions, _, _, success, _ = self.robot_model.manip_ik_for_grasp_frame( + relative_object_xyz, ee_rot, q0=joint_state + ) + target_joint_positions[HelloStretchIdx.BASE_X] -= 0.04 + if not success: + print("Failed to find a valid IK solution.") + self._success = False + return + elif ( + target_joint_positions[HelloStretchIdx.ARM] < 0 + or target_joint_positions[HelloStretchIdx.LIFT] < 0 + ): + print( + f"{self.name}: Target joint state is invalid: {target_joint_positions}. Positions for arm and lift must be positive." 
+ ) + self._success = False + return + + # Lift the arm up a bit + target_joint_positions_lifted = target_joint_positions.copy() + target_joint_positions_lifted[HelloStretchIdx.LIFT] += self.lift_distance + + # Move to the target joint state + print(f"{self.name}: Moving to grasp position.") + self.robot.arm_to(target_joint_positions, head=constants.look_at_ee, blocking=True) + time.sleep(0.5) + print(f"{self.name}: Closing the gripper.") + self.robot.close_gripper(blocking=True) + time.sleep(0.5) + print(f"{self.name}: Lifting the arm up so as not to hit the base.") + self.robot.arm_to(target_joint_positions_lifted, head=constants.look_at_ee, blocking=False) + print(f"{self.name}: Return arm to initial configuration.") + self.robot.arm_to(joint_state, head=constants.look_at_ee, blocking=True) + print(f"{self.name}: Done.") + self._success = True + return From 7359cc1d3a7c782ed6be8546bd0b1eb46f223f05 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 16:46:33 -0400 Subject: [PATCH 179/410] add a flag to try the open-loop grasping code --- src/stretch/agent/operations/__init__.py | 4 ++- src/stretch/agent/operations/grasp_object.py | 2 -- src/stretch/agent/task/pickup/pickup_task.py | 36 ++++++++++++++------ src/stretch/app/ai_pickup.py | 3 ++ src/stretch/app/pickup.py | 8 ++++- 5 files changed, 38 insertions(+), 15 deletions(-) diff --git a/src/stretch/agent/operations/__init__.py b/src/stretch/agent/operations/__init__.py index 71fb89fc2..250b2f9cd 100644 --- a/src/stretch/agent/operations/__init__.py +++ b/src/stretch/agent/operations/__init__.py @@ -16,8 +16,10 @@ WaveOperation, WithdrawOperation, ) -from .grasp_closed_loop import ClosedLoopGraspObjectOperation + +# from .grasp_closed_loop import ClosedLoopGraspObjectOperation from .grasp_object import GraspObjectOperation +from .grasp_open_loop import OpenLoopGraspObjectOperation from .navigate import NavigateToObjectOperation from .place_object import PlaceObjectOperation from .pregrasp import PreGraspObjectOperation diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 5a029a4db..0ef1130f8 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -61,8 +61,6 @@ class GraspObjectOperation(ManagedOperation): # align_x_threshold: int = 10 # align_y_threshold: int = 7 # These are the values used to decide when it's aligned enough to grasp - # align_x_threshold: int = 15 - # align_y_threshold: int = 15 align_x_threshold: int = 15 align_y_threshold: int = 15 diff --git a/src/stretch/agent/task/pickup/pickup_task.py b/src/stretch/agent/task/pickup/pickup_task.py index 7b2049541..c09447e56 100644 --- a/src/stretch/agent/task/pickup/pickup_task.py +++ b/src/stretch/agent/task/pickup/pickup_task.py @@ -14,6 +14,7 @@ GoToNavOperation, GraspObjectOperation, NavigateToObjectOperation, + OpenLoopGraspObjectOperation, PlaceObjectOperation, PreGraspObjectOperation, RotateInPlaceOperation, @@ -161,17 +162,30 @@ def get_one_shot_task(self, add_rotate: bool = False, matching: str = "feature") ) # If we cannot start, we should go back to the search_for_object operation. # To determine if we can start, we look at the end effector camera and see if there's anything detectable. 
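The feature-based matching used above ("looking up mask open-vocab by features instead of by weird class stuff") boils down to: mask out each detected instance, embed the masked image, and keep the instance whose image embedding scores highest against the embedding of the text query. A minimal sketch, assuming a CLIP-like encoder is injected by the caller; encode_image, encode_text, and the cosine score here are assumptions standing in for the agent's actual methods:

import numpy as np

def best_instance_mask(rgb, instance_map, text_query, encode_image, encode_text):
    """Pick the instance whose masked image best matches the text query.

    rgb:          (H, W, 3) image
    instance_map: (H, W) integer instance ids, negative ids treated as background
    encode_image / encode_text: callables returning 1-D feature vectors (assumed CLIP-like)
    """
    text_feat = encode_text(text_query)
    text_feat = text_feat / (np.linalg.norm(text_feat) + 1e-8)

    best_score, best_mask = float("-inf"), None
    for iid in np.unique(instance_map):
        if iid < 0:  # skip background
            continue
        mask = instance_map == iid
        masked_rgb = rgb * mask[:, :, None]        # zero out everything but this instance
        img_feat = encode_image(masked_rgb)
        img_feat = img_feat / (np.linalg.norm(img_feat) + 1e-8)
        score = float(np.dot(text_feat, img_feat))  # cosine similarity
        if score > best_score:
            best_score, best_mask = score, mask
    return best_mask, best_score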
- grasp_object = GraspObjectOperation( - "grasp_the_object", - self.agent, - parent=pregrasp_object, - on_failure=pregrasp_object, - on_cannot_start=go_to_object, - retry_on_failure=False, - ) - grasp_object.set_target_object_class(self.agent.target_object) - grasp_object.servo_to_grasp = self.use_visual_servoing_for_grasp - grasp_object.match_method = matching + if self.use_visual_servoing_for_grasp: + grasp_object = GraspObjectOperation( + f"grasp_the_{self.target_object}", + self.agent, + parent=pregrasp_object, + on_failure=pregrasp_object, + on_cannot_start=go_to_object, + retry_on_failure=False, + ) + grasp_object.set_target_object_class(self.agent.target_object) + grasp_object.servo_to_grasp = True + grasp_object.match_method = matching + else: + grasp_object = OpenLoopGraspObjectOperation( + f"grasp_the_{self.target_object}", + self.agent, + parent=pregrasp_object, + on_failure=pregrasp_object, + on_cannot_start=go_to_object, + retry_on_failure=False, + ) + grasp_object.set_target_object_class(self.agent.target_object) + grasp_object.match_method = matching + place_object_on_receptacle = PlaceObjectOperation( "place_object_on_receptacle", self.agent, on_cannot_start=go_to_receptacle ) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 383768ac6..ed308bd61 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -88,6 +88,7 @@ is_flag=True, help="Set to use voice input", ) +@click.option("--open_loop", "--open-loop", is_flag=True, help="Use open loop grasping") def main( robot_ip: str = "192.168.1.15", local: bool = False, @@ -103,6 +104,7 @@ def main( llm: str = "gemma", use_llm: bool = False, use_voice: bool = False, + open_loop: bool = False, ): """Set up the robot, create a task plan, and execute it.""" # Create robot @@ -168,6 +170,7 @@ def main( target_object=target_object, target_receptacle=receptacle, matching=match_method, + use_visual_servoing_for_grasp=not open_loop, ) task = pickup_task.get_task(add_rotate=True, mode=mode) except Exception as e: diff --git a/src/stretch/app/pickup.py b/src/stretch/app/pickup.py index a88884fb4..369c12e29 100644 --- a/src/stretch/app/pickup.py +++ b/src/stretch/app/pickup.py @@ -59,9 +59,11 @@ help="Mode of operation for the robot.", type=click.Choice(["one_shot", "all"]), ) +@click.option("--open_loop", is_flag=True, help="Use open loop grasping") def main( robot_ip: str = "192.168.1.15", local: bool = False, + open_loop: bool = False, parameter_file: str = "config/default_planner.yaml", device_id: int = 0, verbose: bool = False, @@ -99,7 +101,11 @@ def main( # After the robot has started... 
try: pickup_task = PickupTask( - agent, target_object=target_object, target_receptacle=receptacle, matching=match_method + agent, + target_object=target_object, + target_receptacle=receptacle, + matching=match_method, + use_visual_servoing_for_grasp=not open_loop, ) task = pickup_task.get_task(add_rotate=force_rotate, mode=mode) except Exception as e: From 369e5f06bf5860b4ba7340a81e1f57d00602ab84 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 16:47:32 -0400 Subject: [PATCH 180/410] helper function(s) for setting target object class --- src/stretch/agent/operations/grasp_open_loop.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py index 313ab18ec..021a465ea 100644 --- a/src/stretch/agent/operations/grasp_open_loop.py +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -37,6 +37,15 @@ def configure( self.match_method = match_method self.debug_show_point_cloud = debug_show_point_cloud + def set_target_object_class(self, target_object: str) -> None: + self.target_object = target_object + + def set_target_object_instance(self, target_object: str) -> None: + self.target_object = target_object + + def set_target_object(self, target_object: str) -> None: + self.target_object = target_object + def can_start(self): """Grasping can start if we have a target object picked out, and are moving to its instance, and if the robot is ready to begin manipulation.""" if self.target_object is None: From ba8e3bdc4da5471a94f1bd6858bcdc0306e839ba Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 16:48:01 -0400 Subject: [PATCH 181/410] update grasp open loop --- src/stretch/agent/operations/grasp_open_loop.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py index 021a465ea..d594e7ed4 100644 --- a/src/stretch/agent/operations/grasp_open_loop.py +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -40,9 +40,6 @@ def configure( def set_target_object_class(self, target_object: str) -> None: self.target_object = target_object - def set_target_object_instance(self, target_object: str) -> None: - self.target_object = target_object - def set_target_object(self, target_object: str) -> None: self.target_object = target_object From dde7c5fa328bc65cedc9d904e731a78e7ca67657 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 16:49:15 -0400 Subject: [PATCH 182/410] update --- src/stretch/agent/operations/grasp_open_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py index d594e7ed4..a0008299f 100644 --- a/src/stretch/agent/operations/grasp_open_loop.py +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -138,7 +138,7 @@ def run(self): self.attempt("Grasping an object") obs = self.robot.get_observation() current_xyz = obs.get_xyz_in_world_frame() - current_xyz_base = point_global_to_base(current_xyz, self.robot.get_base_position()) + current_xyz_base = point_global_to_base(current_xyz, self.robot.get_base_pose()) # Find the best object mask mask = self.get_class_mask(obs) From a1cbc54d742f326d6cca97d303519707b5dddcdb Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 16:55:04 -0400 Subject: [PATCH 183/410] various fixes --- src/stretch/agent/operations/grasp_open_loop.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) 
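Commit 182 above passes the full base pose into point_global_to_base. As a reminder of what that transform does, here is a minimal sketch of expressing a world-frame point in the robot's base frame given an (x, y, theta) base pose; this is a standalone re-implementation for illustration, not the stretch.utils.geometry code itself:

import numpy as np

def point_global_to_base_sketch(point_xyz, base_xyt):
    """Express a world-frame 3D point in the robot base frame.

    point_xyz: (3,) point in the world/map frame
    base_xyt:  (3,) robot base pose as (x, y, theta) in the same frame
    """
    x, y, theta = base_xyt
    dx = point_xyz[0] - x
    dy = point_xyz[1] - y
    # Rotate the planar offset by -theta to move it into the base frame
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    bx = cos_t * dx + sin_t * dy
    by = -sin_t * dx + cos_t * dy
    return np.array([bx, by, point_xyz[2]])

# Example: a point 1 m ahead of a robot facing +y lands on the base frame's +x axis:
# point_global_to_base_sketch(np.array([0.0, 1.0, 0.5]), np.array([0.0, 0.0, np.pi / 2]))
# -> array([1., 0., 0.5])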
diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py index a0008299f..3b5123cc9 100644 --- a/src/stretch/agent/operations/grasp_open_loop.py +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -138,7 +138,6 @@ def run(self): self.attempt("Grasping an object") obs = self.robot.get_observation() current_xyz = obs.get_xyz_in_world_frame() - current_xyz_base = point_global_to_base(current_xyz, self.robot.get_base_pose()) # Find the best object mask mask = self.get_class_mask(obs) @@ -149,7 +148,7 @@ def run(self): object_xyz = self.agent.current_object.get_center() else: # Find the object location - object_points = current_xyz[mask] + object_points = current_xyz[mask, :] object_xyz = np.mean(object_points, axis=0) self.info(f"Object location: {object_xyz}") From 3e33b13925c1ddaef417083facbc735e3c1ef300 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 16:57:29 -0400 Subject: [PATCH 184/410] Make ee semantic prediction --- src/stretch/agent/operations/grasp_open_loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py index 3b5123cc9..766251abf 100644 --- a/src/stretch/agent/operations/grasp_open_loop.py +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -137,6 +137,7 @@ def run(self): """Grasp based on data from the head camera only.""" self.attempt("Grasping an object") obs = self.robot.get_observation() + obs = self.agent.semantic_sensor.predict(obs, ee=False) current_xyz = obs.get_xyz_in_world_frame() # Find the best object mask From 24debcd971301792abc36b9835a1619ea8c6f61c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 17:00:47 -0400 Subject: [PATCH 185/410] update and remove a breakpoint --- src/stretch/agent/operations/grasp_open_loop.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py index 766251abf..31e33b8e5 100644 --- a/src/stretch/agent/operations/grasp_open_loop.py +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -61,12 +61,7 @@ def _debug_show_point_cloud(self, servo: Observations, current_xyz: np.ndarray) servo (Observations): Servo observation current_xyz (np.ndarray): Current xyz location """ - # TODO: remove this, overrides existing servo state - # servo = self.robot.get_servo_observation() - # world_xyz = servo.get_ee_xyz_in_world_frame() world_xyz_head = servo.get_xyz_in_world_frame() - # all_xyz = np.concatenate([world_xyz_head.reshape(-1, 3), world_xyz.reshape(-1, 3)], axis=0) - # all_rgb = np.concatenate([servo.rgb.reshape(-1, 3), servo.ee_rgb.reshape(-1, 3)], axis=0) all_xyz = world_xyz_head.reshape(-1, 3) all_rgb = servo.rgb.reshape(-1, 3) / 255 show_point_cloud(all_xyz, all_rgb, orig=current_xyz) @@ -108,13 +103,15 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: text_features = self.agent.encode_text(self.target_object) best_score = float("-inf") best_iid = None + + breakpoint() for iid in np.unique(servo.instance): # Ignore the background if iid < 0: continue - rgb = servo.ee_rgb * (servo.instance == iid)[:, :, None].repeat(3, axis=-1) + rgb = servo.rgb * (servo.instance == iid)[:, :, None].repeat(3, axis=-1) # TODO: remove debug code # import matplotlib.pyplot as plt From b5d00bbaa66556e047cbddba96c1990f8239604b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 17:37:30 -0400 Subject: [PATCH 186/410] no 
breakpoint. no tts --- src/stretch/agent/operations/grasp_open_loop.py | 1 - src/stretch/app/ai_pickup.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py index 31e33b8e5..33611879e 100644 --- a/src/stretch/agent/operations/grasp_open_loop.py +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -104,7 +104,6 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: best_score = float("-inf") best_iid = None - breakpoint() for iid in np.unique(servo.instance): # Ignore the background diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index ed308bd61..f389af185 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -152,12 +152,12 @@ def main( # Call the LLM client and parse llm_response = chat_wrapper.query() target_object = prompt.get_object(llm_response) - receptacle = prompt.get_receptacle(llm_response) + qreceptacle = prompt.get_receptacle(llm_response) say_this = prompt.get_say_this(llm_response) if say_this is not None: chat_wrapper.say(say_this) - agent.say(say_this) + # agent.say(say_this) if len(target_object) == 0 or len(receptacle) == 0: logger.error("You need to enter a target object and receptacle") From 75c025cee90098dfc78bb33b91b255f95fda395e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 17:43:05 -0400 Subject: [PATCH 187/410] extra LLM debugging outputs --- src/stretch/app/ai_pickup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index f389af185..abefbb2b6 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -152,8 +152,12 @@ def main( # Call the LLM client and parse llm_response = chat_wrapper.query() target_object = prompt.get_object(llm_response) - qreceptacle = prompt.get_receptacle(llm_response) + receptacle = prompt.get_receptacle(llm_response) say_this = prompt.get_say_this(llm_response) + print("LLM response:", llm_response) + print("Target object:", target_object) + print("Receptacle:", receptacle) + print("Say this:", say_this) if say_this is not None: chat_wrapper.say(say_this) From 70557b0f757f5450aca610d8e60ae8cb22004812 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 17:47:03 -0400 Subject: [PATCH 188/410] updates to code --- src/stretch/agent/operations/grasp_open_loop.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py index 33611879e..ec4c1ba40 100644 --- a/src/stretch/agent/operations/grasp_open_loop.py +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -26,16 +26,19 @@ class OpenLoopGraspObjectOperation(ManagedOperation): debug_show_point_cloud: bool = False match_method: str = "feature" target_object: str = None + lift_distance: float = 0.2 def configure( self, target_object: str, match_method: str = "feature", debug_show_point_cloud: bool = False, + lift_distance: float = 0.2, ) -> None: self.target_object = target_object self.match_method = match_method self.debug_show_point_cloud = debug_show_point_cloud + self.lift_distance = lift_distance def set_target_object_class(self, target_object: str) -> None: self.target_object = target_object From 4e4c3d58467a3aff33b3b318ebf41c53bcb8800c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 18:00:55 -0400 Subject: [PATCH 189/410] update and capture --- 
.../agent/operations/grasp_open_loop.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py index ec4c1ba40..2863af24a 100644 --- a/src/stretch/agent/operations/grasp_open_loop.py +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -23,7 +23,7 @@ class OpenLoopGraspObjectOperation(ManagedOperation): - debug_show_point_cloud: bool = False + debug_show_point_cloud: bool = True match_method: str = "feature" target_object: str = None lift_distance: float = 0.2 @@ -135,6 +135,23 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: def run(self): """Grasp based on data from the head camera only.""" self.attempt("Grasping an object") + + # Get the arm out of the way so we can look for the object + q = self.robot.get_joint_positions() + base_x = q[HelloStretchIdx.BASE_X] + lift = q[HelloStretchIdx.LIFT] + arm = q[HelloStretchIdx.ARM] + wrist_pitch = q[HelloStretchIdx.WRIST_PITCH] + wrist_yaw = q[HelloStretchIdx.WRIST_YAW] + wrist_roll = q[HelloStretchIdx.WRIST_ROLL] + self.robot.arm_to( + [base_x, 0.3, arm, wrist_roll, wrist_pitch, wrist_yaw], + head=[-np.pi / 2, -3 * np.pi / 8], + blocking=True, + ) + time.sleep(0.25) + + # Capture an observation obs = self.robot.get_observation() obs = self.agent.semantic_sensor.predict(obs, ee=False) current_xyz = obs.get_xyz_in_world_frame() @@ -155,6 +172,8 @@ def run(self): if self.debug_show_point_cloud: self._debug_show_point_cloud(obs, object_xyz) + breakpoint() + # Grasp the object self.grasp_open_loop(object_xyz) @@ -171,6 +190,9 @@ def grasp_open_loop(self, object_xyz: np.ndarray): model = self.robot.get_robot_model() xyt = self.robot.get_base_pose() relative_object_xyz = point_global_to_base(object_xyz, xyt) + + breakpoint() + joint_state = self.robot.get_joint_positions() # We assume the current end-effector orientation is the correct one, going into this From f2180d0f69d2ebc916a472ce312f169238b12533 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 16 Sep 2024 18:16:19 -0400 Subject: [PATCH 190/410] open loop grasping --- src/stretch/agent/operations/grasp_open_loop.py | 10 +++++----- src/stretch/app/ai_pickup.py | 14 +++++++------- src/stretch/llms/chat_wrapper.py | 6 ++++++ 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/stretch/agent/operations/grasp_open_loop.py b/src/stretch/agent/operations/grasp_open_loop.py index 2863af24a..b11c77ff6 100644 --- a/src/stretch/agent/operations/grasp_open_loop.py +++ b/src/stretch/agent/operations/grasp_open_loop.py @@ -23,7 +23,7 @@ class OpenLoopGraspObjectOperation(ManagedOperation): - debug_show_point_cloud: bool = True + debug_show_point_cloud: bool = False match_method: str = "feature" target_object: str = None lift_distance: float = 0.2 @@ -172,8 +172,6 @@ def run(self): if self.debug_show_point_cloud: self._debug_show_point_cloud(obs, object_xyz) - breakpoint() - # Grasp the object self.grasp_open_loop(object_xyz) @@ -191,8 +189,6 @@ def grasp_open_loop(self, object_xyz: np.ndarray): xyt = self.robot.get_base_pose() relative_object_xyz = point_global_to_base(object_xyz, xyt) - breakpoint() - joint_state = self.robot.get_joint_positions() # We assume the current end-effector orientation is the correct one, going into this @@ -221,6 +217,10 @@ def grasp_open_loop(self, object_xyz: np.ndarray): target_joint_positions_lifted = target_joint_positions.copy() target_joint_positions_lifted[HelloStretchIdx.LIFT] += self.lift_distance + 
print("Go to lifted target position") + self.robot.arm_to(target_joint_positions_lifted, head=constants.look_at_ee, blocking=True) + time.sleep(0.5) + # Move to the target joint state print(f"{self.name}: Moving to grasp position.") self.robot.arm_to(target_joint_positions, head=constants.look_at_ee, blocking=True) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index abefbb2b6..1f6c57ff6 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -11,7 +11,7 @@ import click -import stretch.utils.logger as logger +# import stretch.utils.logger as logger from stretch.agent.robot_agent import RobotAgent from stretch.agent.task.pickup import PickupTask from stretch.agent.zmq_client import HomeRobotZmqClient @@ -154,18 +154,18 @@ def main( target_object = prompt.get_object(llm_response) receptacle = prompt.get_receptacle(llm_response) say_this = prompt.get_say_this(llm_response) - print("LLM response:", llm_response) - print("Target object:", target_object) - print("Receptacle:", receptacle) - print("Say this:", say_this) + # print("LLM response:", llm_response) + # print("Target object:", target_object) + # print("Receptacle:", receptacle) + # print("Say this:", say_this) if say_this is not None: chat_wrapper.say(say_this) # agent.say(say_this) if len(target_object) == 0 or len(receptacle) == 0: - logger.error("You need to enter a target object and receptacle") - break + # logger.error("You need to enter a target object and receptacle") + continue # After the robot has started... try: diff --git a/src/stretch/llms/chat_wrapper.py b/src/stretch/llms/chat_wrapper.py index 872c41261..af0b46787 100644 --- a/src/stretch/llms/chat_wrapper.py +++ b/src/stretch/llms/chat_wrapper.py @@ -46,7 +46,9 @@ def __init__( def query(self, verbose: bool = False) -> Any: if self.voice: + print("-" * 80) input(colored("Press enter to speak or ctrl+c to exit.", "yellow")) + print("-" * 80) # Create a temporary file with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file: @@ -65,7 +67,9 @@ def query(self, verbose: bool = False) -> Any: print(colored("I heard:", "green"), input_text) else: + print("-" * 80) input_text = input(colored("You: ", "green")) + print("-" * 80) if len(input_text) == 0: return None @@ -90,4 +94,6 @@ def say(self, text: str) -> None: if self.voice: raise NotImplementedError("Voice is not supported yet.") else: + print("-" * 80) print(colored("Robot:", "blue"), text) + print("-" * 80) From dd099b81e19a97689343cc731bad97c16acbed6f Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Wed, 18 Sep 2024 19:30:09 -0400 Subject: [PATCH 191/410] change --- src/stretch/dynav/mapping_utils/voxel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/dynav/mapping_utils/voxel.py b/src/stretch/dynav/mapping_utils/voxel.py index 2d97b3cc8..22ea48d36 100644 --- a/src/stretch/dynav/mapping_utils/voxel.py +++ b/src/stretch/dynav/mapping_utils/voxel.py @@ -667,7 +667,7 @@ def get_2d_map( obstacles_soft = torch.sum(obstacle_voxels, dim=-1) obstacles = obstacles_soft > self.obs_min_density - history_ids = history_ids[:, :, min_height:max_height] + # history_ids = history_ids[:, :, min_height:max_height] history_soft = torch.max(history_ids, dim=-1).values history_soft = torch.from_numpy(maximum_filter(history_soft.float().numpy(), size = 5)) From b94df65030c15f2c5091b9bc9481050e8e213bd8 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 19 Sep 2024 15:15:06 -0700 Subject: [PATCH 192/410] Actual robot mesh realtime 
rerun working --- src/stretch/visualization/rerun.py | 54 +++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 2cb81b224..2402dd88f 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -99,6 +99,56 @@ def occupancy_map_to_3d_points( class StretchURDFLogger(urdf_visualizer.URDFVisualizer): + link_names = [] + link_poses = [] + + def load_robot_mesh(self, cfg: dict = None, use_collision: bool = True, debug: bool = False): + trimesh_list = self.get_tri_meshes(cfg=cfg, use_collision=use_collision) + self.link_names = trimesh_list["link"] + self.link_poses = trimesh_list["pose"] + for i in range(len(trimesh_list["link"])): + rr.log( + f"world/robot/mesh/{trimesh_list['link'][i]}", + rr.Mesh3D( + vertex_positions=trimesh_list["mesh"][i].vertices, + triangle_indices=trimesh_list["mesh"][i].faces, + vertex_normals=trimesh_list["mesh"][i].vertex_normals, + ), + static=True, + ) + + def log_transforms(self, cfg: dict, debug: bool = False): + lk_cfg = { + "joint_wrist_yaw": cfg["wrist_yaw"], + "joint_wrist_pitch": cfg["wrist_pitch"], + "joint_wrist_roll": cfg["wrist_roll"], + "joint_lift": cfg["lift"], + "joint_arm_l0": cfg["arm"] / 4, + "joint_arm_l1": cfg["arm"] / 4, + "joint_arm_l2": cfg["arm"] / 4, + "joint_arm_l3": cfg["arm"] / 4, + "joint_head_pan": cfg["head_pan"], + "joint_head_tilt": cfg["head_tilt"], + } + if "gripper" in cfg.keys(): + lk_cfg["joint_gripper_finger_left"] = cfg["gripper"] + lk_cfg["joint_gripper_finger_right"] = cfg["gripper"] + tms = self.get_tri_meshes(cfg=lk_cfg, use_collision=False) + self.link_poses = tms["pose"] + self.link_names = tms["link"] + for link in self.link_names: + idx = self.link_names.index(link) + rr.set_time_seconds("realtime", time.time()) + rr.log( + f"world/robot/mesh/{link}", + rr.Transform3D( + translation=self.link_poses[idx][:3, 3], + mat3x3=self.link_poses[idx][:3, :3], + axis_length=0.3, + ), + static=False, + ) + def log_robot_mesh(self, cfg: dict, use_collision: bool = True, debug: bool = False): """ Log robot mesh using joint configuration data @@ -157,7 +207,8 @@ def log(self, obs, use_collision: bool = True, debug: bool = False): for k in HelloStretchIdx.name_to_idx: cfg[k] = state[HelloStretchIdx.name_to_idx[k]] # breakpoint() - self.log_robot_mesh(cfg=cfg, use_collision=use_collision) + # self.log_robot_mesh(cfg=cfg, use_collision=use_collision) + self.log_transforms(cfg=cfg, debug=debug) class RerunVsualizer: @@ -169,6 +220,7 @@ def __init__(self, display_robot_mesh: bool = True, open_browser: bool = True): if self.display_robot_mesh: self.urdf_logger = StretchURDFLogger() + self.urdf_logger.load_robot_mesh(use_collision=False) # Create environment Box place holder rr.log( From 1e70832a7617e61e2d6792be62a651a6e5ec970a Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Thu, 19 Sep 2024 20:54:09 -0400 Subject: [PATCH 193/410] save --- src/stretch/dynav/llm_localizer.py | 16 ++++++++++----- src/stretch/dynav/llm_server.py | 23 ++++++++++------------ src/stretch/dynav/robot_agent_manip_mdp.py | 7 ++----- src/stretch/dynav/voxel_map_localizer.py | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py index cf9c15e2b..a0d273e28 100644 --- a/src/stretch/dynav/llm_localizer.py +++ b/src/stretch/dynav/llm_localizer.py @@ -146,7 +146,7 @@ def add(self, weights = weights, obs_count = obs_count) - def 
compute_coord(self, text, image_info, threshold = 0.25): + def compute_coord(self, text, image_info, threshold = 0.2): rgb = image_info['image'] inputs = self.exist_processor(text=[['a photo of a ' + text]], images=rgb, return_tensors="pt") for input in inputs: @@ -175,7 +175,7 @@ def owl_locater(self, A, encoded_image, timestamps_lst): if i not in encoded_image: continue image_info = encoded_image[i][-1] - res = self.compute_coord(A, image_info, threshold=0.15) + res = self.compute_coord(A, image_info, threshold=0.2) if res is not None: debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' return res, debug_text, i, None @@ -184,7 +184,7 @@ def owl_locater(self, A, encoded_image, timestamps_lst): return None, debug_text, None, None def gpt_chunk(self, chunk, sys_prompt, user_prompt): - for i in range(10): + for i in range(3): try: response = self.gpt_client.chat.completions.create( model=self.existence_checking_model, @@ -264,7 +264,9 @@ def llm_locator(self, A, encoded_image, process_chunk, context_length = 30): # print(A, timestamps_lst) return self.owl_locater(A, encoded_image, timestamps_lst) - def localize_A(self, A, debug = True, return_debug = False, count_threshold = 3): + def localize_A(self, A, debug = True, return_debug = False, count_threshold = -1): + start_time = time.time() + encoded_image = OrderedDict() counts = torch.bincount(self.voxel_pcd._obs_counts) @@ -278,7 +280,7 @@ def localize_A(self, A, debug = True, return_debug = False, count_threshold = 3) if 'gemini' in self.existence_checking_model: process_chunk = self.gemini_chunk - context_length = 100 + context_length = 60 elif 'gpt' in self.existence_checking_model: process_chunk = self.gpt_chunk context_length = 30 @@ -312,6 +314,10 @@ def localize_A(self, A, debug = True, return_debug = False, count_threshold = 3) "url": f"data:image/png;base64,{base64_encoded}"} }, {'image':image, 'xyz':xyz, 'depth':depth}] target_point, debug_text, obs, point = self.llm_locator(A, encoded_image, process_chunk, context_length) + + end_time = time.time() + print('visual grounding takes', end_time - start_time, 'seconds') + if not debug: return target_point elif not return_debug: diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py index 8d0ab64bd..85c2b01ac 100644 --- a/src/stretch/dynav/llm_server.py +++ b/src/stretch/dynav/llm_server.py @@ -101,7 +101,7 @@ def get_xyz(depth, pose, intrinsics): class ImageProcessor: def __init__(self, - vision_method = 'flash_owl', + vision_method = 'pro_owl', siglip = True, device = 'cuda', min_depth = 0.25, @@ -234,9 +234,6 @@ def process_text(self, text, start_pose): if len(localized_point) == 2: localized_point = np.array([localized_point[0], localized_point[1], 0]) - - if torch.linalg.norm((torch.tensor(localized_point) - torch.tensor(start_pose))[:2]) < 0.7: - return [[np.nan, np.nan, np.nan], localized_point] point = self.sample_navigation(start_pose, localized_point) @@ -263,14 +260,14 @@ def process_text(self, text, start_pose): debug_text = '# Robot\'s monologue: \n' + debug_text rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) - if obs is not None and mode == 'navigation': - rgb = self.voxel_map.observations[obs - 1].rgb - if not self.rerun: - if isinstance(rgb, torch.Tensor): - rgb = np.array(rgb) - cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) - else: - 
rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) + # if obs is not None and mode == 'navigation': + # rgb = self.voxel_map.observations[obs - 1].rgb + # if not self.rerun: + # if isinstance(rgb, torch.Tensor): + # rgb = np.array(rgb) + # cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) + # else: + # rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) traj = [] waypoints = None @@ -283,7 +280,7 @@ def process_text(self, text, start_pose): waypoints = [pt.state for pt in res.trajectory] # If we are navigating to some object of interst, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation - finished = len(waypoints) <= 6 and mode == 'navigation' + finished = len(waypoints) <= 10 and mode == 'navigation' if not finished: waypoints = waypoints[:8] traj = self.planner.clean_path_for_xy(waypoints) diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index c3805c127..36a2c3af0 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -25,8 +25,8 @@ from stretch.dynav.ok_robot_hw.camera import RealSenseCamera from stretch.dynav.ok_robot_hw.utils.grasper_utils import pickup, move_to_point, capture_and_process_image from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array -from stretch.dynav.voxel_map_server import ImageProcessor -# from stretch.dynav.llm_server import ImageProcessor +# from stretch.dynav.voxel_map_server import ImageProcessor +from stretch.dynav.llm_server import ImageProcessor import cv2 @@ -90,10 +90,7 @@ def look_around(self): print("*" * 10, "Look around to check", "*" * 10) for pan in [0.4, -0.4, -1.2]: for tilt in [-0.6]: - start_time = time.time() self.robot.head_to(pan, tilt, blocking = True) - end_time = time.time() - print('moving head takes ', end_time - start_time, 'seconds.') self.update() def rotate_in_place(self): diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index d43868fbd..987ffac15 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -163,7 +163,7 @@ def find_obs_id_for_A(self, A): alignments = self.find_alignment_over_model(A).cpu() return obs_counts[alignments.argmax(dim = -1)].detach().cpu() - def compute_coord(self, text, obs_id, threshold = 0.25): + def compute_coord(self, text, obs_id, threshold = 0.2): # print(obs_id, len(self.voxel_map_wrapper.observations)) if obs_id <= 0 or obs_id > len(self.voxel_map_wrapper.observations): return None From 96ba8e80fdd18be839c27c25a52870c88c829ba3 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 19 Sep 2024 18:19:15 -0700 Subject: [PATCH 194/410] rerun cleanup --- src/stretch/agent/robot_agent.py | 2 +- src/stretch/agent/zmq_client.py | 1 - src/stretch/visualization/rerun.py | 136 ++++++++----------- src/stretch/visualization/urdf_visualizer.py | 11 +- 4 files changed, 64 insertions(+), 86 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 5b2665b55..a1d04a520 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -395,7 +395,7 @@ def rotate_in_place( self.voxel_map.show( orig=np.zeros(3), xyt=self.robot.get_base_pose(), - footprint=self.robot.get_robot_model().get_footprint(), + footprint=self.robot.get_robot_model().get_(), instances=self.semantic_sensor is not 
None, ) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 7b57225bd..22a8c0031 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -1135,7 +1135,6 @@ def blocking_spin_rerun(self) -> None: """Use the rerun server so that we can visualize what is going on as the robot takes actions in the world.""" while not self._finish: self._rerun.step(self._obs, self._servo) - time.sleep(0.3) @property def is_homed(self) -> bool: diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 2402dd88f..b219fdcd7 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -117,7 +117,17 @@ def load_robot_mesh(self, cfg: dict = None, use_collision: bool = True, debug: b static=True, ) - def log_transforms(self, cfg: dict, debug: bool = False): + def log_transforms(self, obs, debug: bool = False): + """ + Log robot mesh using urdf visualizer to rerun + Args: + obs (dict): Observation dataclass + use_collision (bool): use collision mesh + """ + state = obs["joint"] + cfg = {} + for k in HelloStretchIdx.name_to_idx: + cfg[k] = state[HelloStretchIdx.name_to_idx[k]] lk_cfg = { "joint_wrist_yaw": cfg["wrist_yaw"], "joint_wrist_pitch": cfg["wrist_pitch"], @@ -133,7 +143,9 @@ def log_transforms(self, cfg: dict, debug: bool = False): if "gripper" in cfg.keys(): lk_cfg["joint_gripper_finger_left"] = cfg["gripper"] lk_cfg["joint_gripper_finger_right"] = cfg["gripper"] + t0 = timeit.default_timer() tms = self.get_tri_meshes(cfg=lk_cfg, use_collision=False) + t1 = timeit.default_timer() self.link_poses = tms["pose"] self.link_names = tms["link"] for link in self.link_names: @@ -148,73 +160,23 @@ def log_transforms(self, cfg: dict, debug: bool = False): ), static=False, ) - - def log_robot_mesh(self, cfg: dict, use_collision: bool = True, debug: bool = False): - """ - Log robot mesh using joint configuration data - Args: - cfg (dict): Joint configuration - use_collision (bool): use collision mesh - """ - lk_cfg = { - "joint_wrist_yaw": cfg["wrist_yaw"], - "joint_wrist_pitch": cfg["wrist_pitch"], - "joint_wrist_roll": cfg["wrist_roll"], - "joint_lift": cfg["lift"], - "joint_arm_l0": cfg["arm"] / 4, - "joint_arm_l1": cfg["arm"] / 4, - "joint_arm_l2": cfg["arm"] / 4, - "joint_arm_l3": cfg["arm"] / 4, - "joint_head_pan": cfg["head_pan"], - "joint_head_tilt": cfg["head_tilt"], - } - if "gripper" in cfg.keys(): - lk_cfg["joint_gripper_finger_left"] = cfg["gripper"] - lk_cfg["joint_gripper_finger_right"] = cfg["gripper"] - t0 = timeit.default_timer() - mesh = self.get_combined_robot_mesh(cfg=lk_cfg, use_collision=use_collision) - t1 = timeit.default_timer() - if debug: - print(f"Time to get mesh: {1000*(t1 - t1)} ms") - # breakpoint() - rr.set_time_seconds("realtime", time.time()) - rr.log( - "world/robot/mesh", - rr.Mesh3D( - vertex_positions=mesh.vertices, - triangle_indices=mesh.faces, - vertex_normals=mesh.vertex_normals, - # vertex_colors=vertex_colors, - # albedo_texture=albedo_texture, - # vertex_texcoords=vertex_texcoords, - ), - timeless=True, - ) - t2 = 1000 * (timeit.default_timer() - t1) + t2 = timeit.default_timer() if debug: - print(f"Time to rr log mesh: {t2}") - print(f"Total time to log mesh: {1000*(timeit.default_timer() - t0)} ms") - - def log(self, obs, use_collision: bool = True, debug: bool = False): - """ - Log robot mesh using urdf visualizer to rerun - Args: - obs (dict): Observation dataclass - use_collision (bool): use collision mesh - """ - state = obs["joint"] - cfg = {} 
- for k in HelloStretchIdx.name_to_idx: - cfg[k] = state[HelloStretchIdx.name_to_idx[k]] - # breakpoint() - # self.log_robot_mesh(cfg=cfg, use_collision=use_collision) - self.log_transforms(cfg=cfg, debug=debug) + print("Time to get tri meshes (ms): ", 1000 * (t1 - t0)) + print("Time to log robot transforms (ms): ", 1000 * (t2 - t1)) + print("Total time to log robot transforms (ms): ", 1000 * (t2 - t0)) class RerunVsualizer: - def __init__(self, display_robot_mesh: bool = True, open_browser: bool = True): + def __init__( + self, + display_robot_mesh: bool = True, + open_browser: bool = True, + server_memory_limit: str = "4GB", + collapse_panels: bool = True, + ): rr.init("Stretch_robot", spawn=False) - rr.serve(open_browser=open_browser, server_memory_limit="2GB") + rr.serve(open_browser=open_browser, server_memory_limit=server_memory_limit) self.display_robot_mesh = display_robot_mesh @@ -238,23 +200,26 @@ def __init__(self, display_robot_mesh: bool = True, open_browser: bool = True): ) self.bbox_colors_memory = {} - self.step_delay_s = 1 / 15 - self.setup_blueprint() + self.step_delay_s = 0.3 + self.setup_blueprint(collapse_panels) - def setup_blueprint(self): - """Setup the blueprint for the visualizer""" - my_blueprint = rrb.Blueprint( - rrb.Horizontal( - rrb.Spatial3DView(name="3D View", origin="world"), - rrb.Vertical( - rrb.Spatial2DView(name="head_rgb", origin="/world/head_camera"), - rrb.Spatial2DView(name="ee_rgb", origin="/world/ee_camera"), - ), - rrb.TimePanel(state=True), - rrb.SelectionPanel(state=True), - rrb.BlueprintPanel(state=True), + def setup_blueprint(self, collapse_panels: bool): + """Setup the blueprint for the visualizer + Args: + collapse_panels (bool): fully hides the blueprint/selection panels, + and shows the simplified time panel + """ + main = rrb.Horizontal( + rrb.Spatial3DView(name="3D View", origin="world"), + rrb.Vertical( + rrb.Spatial2DView(name="head_rgb", origin="/world/head_camera"), + rrb.Spatial2DView(name="ee_rgb", origin="/world/ee_camera"), ), - collapse_panels=True, + column_shares=[3, 1], + ) + my_blueprint = rrb.Blueprint( + rrb.Vertical(main, rrb.TimePanel(state=True)), + collapse_panels=collapse_panels, ) rr.send_blueprint(my_blueprint) @@ -338,10 +303,10 @@ def log_robot_state(self, obs): for k in HelloStretchIdx.name_to_idx: rr.log(f"robot_state/joint_pose/{k}", rr.Scalar(state[HelloStretchIdx.name_to_idx[k]])) - def log_robot_mesh(self, obs, use_collision=True): + def log_robot_transforms(self, obs): """ - Log robot mesh using urdf visualizer""" - self.urdf_logger.log(obs, use_collision=use_collision) + Log robot mesh transforms using urdf visualizer""" + self.urdf_logger.log_transforms(obs) def update_voxel_map( self, @@ -467,7 +432,7 @@ def update_nav_goal(self, goal, timeout=10): rr.set_time_seconds("realtime", ts) rr.log("world/xyt_goal/blob", rr.Points3D([0, 0, 0], colors=[0, 255, 0, 50], radii=0.1)) rr.log( - "world/xyt_goal", + "world/xyt_goal/blob", rr.Transform3D( translation=[goal[0], goal[1], 0], rotation=rr.RotationAxisAngle(axis=[0, 0, 1], radians=goal[2]), @@ -483,6 +448,7 @@ def step(self, obs, servo): if obs and servo: rr.set_time_seconds("realtime", time.time()) try: + t0 = timeit.default_timer() self.log_robot_xyt(obs) self.log_head_camera(obs) self.log_ee_frame(obs) @@ -490,7 +456,11 @@ def step(self, obs, servo): self.log_robot_state(obs) if self.display_robot_mesh: - self.log_robot_mesh(obs) + self.log_robot_transforms(obs) + t1 = timeit.default_timer() + sleep_time = self.step_delay_s - (t1 - t0) + if sleep_time > 
0: + time.sleep(sleep_time) except Exception as e: print(e) diff --git a/src/stretch/visualization/urdf_visualizer.py b/src/stretch/visualization/urdf_visualizer.py index d2ea0e85e..30a5c243b 100644 --- a/src/stretch/visualization/urdf_visualizer.py +++ b/src/stretch/visualization/urdf_visualizer.py @@ -11,6 +11,7 @@ import importlib.resources as importlib_resources import re +import timeit import numpy as np import urchin as urdf_loader @@ -59,7 +60,9 @@ class URDFVisualizer: def __init__(self, urdf_file: str = abs_urdf_file_path): self.urdf = urdf_loader.URDF.load(urdf_file) - def get_tri_meshes(self, cfg: dict = None, use_collision: bool = True) -> list: + def get_tri_meshes( + self, cfg: dict = None, use_collision: bool = True, debug: bool = False + ) -> list: """ Get list of trimesh objects, pose and link names of the robot with the given configuration Args: @@ -68,10 +71,12 @@ def get_tri_meshes(self, cfg: dict = None, use_collision: bool = True) -> list: Returns: list: List of trimesh objects, pose and link names of the robot with the given configuration """ + t0 = timeit.default_timer() if use_collision: fk = self.urdf.collision_trimesh_fk(cfg=cfg) else: fk = self.urdf.visual_trimesh_fk(cfg=cfg) + t1 = timeit.default_timer() t_meshes = {"mesh": [], "pose": [], "link": []} for tm in fk: pose = fk[tm] @@ -82,6 +87,10 @@ def get_tri_meshes(self, cfg: dict = None, use_collision: bool = True) -> list: t_meshes["link"].append( tm.metadata["file_name"][:-4] if "file_name" in tm.metadata.keys() else None ) + t2 = timeit.default_timer() + if debug: + print(f"[get_trimeshes method] Time to compute FK (ms): {1000 * (t1 - t0)}") + print(f"[get_trimeshes method] Time to get meshes list (ms): {1000 * (t2 - t1)}") return t_meshes def get_combined_robot_mesh(self, cfg: dict = None, use_collision: bool = True) -> Trimesh: From f0cc726dbcd492790aa02532f4f290a2b9797d6e Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 19 Sep 2024 19:40:03 -0700 Subject: [PATCH 195/410] remove robot frames vis and fix xyt goal vis --- src/stretch/visualization/rerun.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index b219fdcd7..c650096d8 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -156,7 +156,7 @@ def log_transforms(self, obs, debug: bool = False): rr.Transform3D( translation=self.link_poses[idx][:3, 3], mat3x3=self.link_poses[idx][:3, :3], - axis_length=0.3, + axis_length=0.0, ), static=False, ) @@ -235,7 +235,7 @@ def log_head_camera(self, obs): rr.Pinhole( resolution=[obs["rgb"].shape[1], obs["rgb"].shape[0]], image_from_camera=obs["camera_K"], - image_plane_distance=0.25, + image_plane_distance=0.15, ), ) @@ -246,13 +246,12 @@ def log_robot_xyt(self, obs): theta = obs["compass"] rb_arrow = rr.Arrows3D( origins=[0, 0, 0], - vectors=[0.5, 0, 0], + vectors=[0.4, 0, 0], radii=0.02, labels="robot", colors=[255, 0, 0, 255], ) rr.log("world/robot/arrow", rb_arrow) - rr.log("world/robot/blob", rr.Points3D([0, 0, 0], colors=[255, 0, 0, 255], radii=0.13)) rr.log( "world/robot", rr.Transform3D( @@ -292,7 +291,7 @@ def log_ee_camera(self, servo): rr.Pinhole( resolution=[servo.ee_rgb.shape[1], servo.ee_rgb.shape[0]], image_from_camera=servo.ee_camera_K, - image_plane_distance=0.25, + image_plane_distance=0.15, ), ) @@ -430,18 +429,18 @@ def update_nav_goal(self, goal, timeout=10): """ ts = time.time() rr.set_time_seconds("realtime", ts) - 
rr.log("world/xyt_goal/blob", rr.Points3D([0, 0, 0], colors=[0, 255, 0, 50], radii=0.1)) + rr.log("world/xyt_goal", rr.Points3D([0, 0, 0], colors=[0, 255, 0, 50], radii=0.1)) rr.log( - "world/xyt_goal/blob", + "world/xyt_goal", rr.Transform3D( translation=[goal[0], goal[1], 0], rotation=rr.RotationAxisAngle(axis=[0, 0, 1], radians=goal[2]), axis_length=0.5, ), ) - rr.set_time_seconds("realtime", ts + timeout) - rr.log("world/xyt_goal", rr.Clear(recursive=True)) - rr.set_time_seconds("realtime", ts) + # rr.set_time_seconds("realtime", ts + timeout) + # rr.log("world/xyt_goal", rr.Clear(recursive=True)) + # rr.set_time_seconds("realtime", ts) def step(self, obs, servo): """Log all the data""" From efa588ec74a8e31854c3b4d793ec54a975789f4b Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 19 Sep 2024 19:42:31 -0700 Subject: [PATCH 196/410] Fix typo --- src/stretch/agent/robot_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index a1d04a520..5b2665b55 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -395,7 +395,7 @@ def rotate_in_place( self.voxel_map.show( orig=np.zeros(3), xyt=self.robot.get_base_pose(), - footprint=self.robot.get_robot_model().get_(), + footprint=self.robot.get_robot_model().get_footprint(), instances=self.semantic_sensor is not None, ) From 875c713cea343fdc4e5efda6bcd53c5638c8dba7 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 19 Sep 2024 19:51:29 -0700 Subject: [PATCH 197/410] Add missing docstrings --- src/stretch/visualization/rerun.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index c650096d8..950f3677d 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -102,7 +102,14 @@ class StretchURDFLogger(urdf_visualizer.URDFVisualizer): link_names = [] link_poses = [] - def load_robot_mesh(self, cfg: dict = None, use_collision: bool = True, debug: bool = False): + def load_robot_mesh(self, cfg: dict = None, use_collision: bool = False): + """ + Load robot mesh using urdf visualizer to rerun + This is to be run once at the beginning of the rerun + Args: + cfg (dict): Configuration of the robot + use_collision (bool): use collision mesh + """ trimesh_list = self.get_tri_meshes(cfg=cfg, use_collision=use_collision) self.link_names = trimesh_list["link"] self.link_poses = trimesh_list["pose"] From e5384693c7b61ac3cf30e91e789e230eb0b1e0ad Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 19 Sep 2024 21:24:57 -0700 Subject: [PATCH 198/410] wip --- .../agent/manipulation/dinobot_prototype.py | 53 ++++++++++--------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index b5c405326..94ba0c4a3 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -29,7 +29,6 @@ extract_3d_coordinates, find_transformation, ) -from stretch.motion import HelloStretchIdx from stretch.perception.detection.detic import DeticPerception DEBUG_VISUALIZATION = False @@ -157,28 +156,26 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: """ Move the robot to the bottleneck pose """ - ee_transform = np.eye(4) - ee_transform[:3, :3] = R - ee_transform[:3, 3] = t - T_d405_to_wrist_yaw = 
self.urdf_model.get_transform( - "gripper_camera_color_optical_frame", "link_wrist_yaw" - ) - ee_transform_wrist_yaw_T = T_d405_to_wrist_yaw @ ee_transform - translation_vector = ee_transform_wrist_yaw_T[:3, 3] - rotation_mat = ee_transform_wrist_yaw_T[:3, :3] - robot.switch_to_manipulation_mode() + model = robot.get_robot_model() joint_state = robot.get_joint_positions().copy() - print(f"TRanslation: {translation_vector}") - breakpoint() - joint_state[HelloStretchIdx.ARM] = joint_state[HelloStretchIdx.ARM] - translation_vector[1] - # joint_state[HelloStretchIdx.LIFT] = joint_state[HelloStretchIdx.LIFT] + ee_transform_wrist_yaw_T[:3][2] - joint_state[HelloStretchIdx.BASE_X] = ( - joint_state[HelloStretchIdx.BASE_X] - translation_vector[0] - ) - # joint_state[HelloStretchIdx.WRIST_YAW] = joint_state[HelloStretchIdx.WRIST_YAW] + R[0] - # joint_state[HelloStretchIdx.WRIST_PITCH] = joint_state[HelloStretchIdx.WRIST_PITCH] + R[1] - # joint_state[HelloStretchIdx.WRIST_ROLL] = joint_state[HelloStretchIdx.WRIST_ROLL] + R[2] - robot.arm_to(joint_state, blocking=True) + ee_pos, ee_rot = model.manip_fk(joint_state) + T_ee = self.urdf_model.get_transform("gripper_camera_color_optical_frame", "base_link") + # Translate and rotate the t and R with T_ee + t_transformed = T_ee[:3, :3] @ t + T_ee[:3, 3] + R_transformed = T_ee[:3, :3] @ R @ np.linalg.inv(T_ee[:3, :3]) + + print(f"EE Pos: {ee_pos}, EE Rot: {ee_rot}", "R: ", R, "t: ", t) + rot = Rotation.from_matrix(R) + + target_joint_state, success, info = model.manip_ik((ee_pos - t, rot.as_quat())) + + # # Create a 4x4 transformation matrix from R and t + # transformation_matrix = np.eye(4) + # transformation_matrix[:3, :3] = R + # transformation_matrix[:3, 3] = t + + robot.switch_to_manipulation_mode() + robot.arm_to(target_joint_state, blocking=True) def move_to_bottleneck( self, @@ -227,12 +224,18 @@ def move_to_bottleneck( print(f"Error: {error}") # Debug 3D Visualization + DEBUG_VISUALIZATION = True if DEBUG_VISUALIZATION: unique_colors = generate_unique_colors(len(points1)) origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( size=0.03, origin=[0, 0, 0] ) + transformed_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( + size=0.03, origin=t + ) + transformed_frame.rotate(R, center=(0, 0, 0)) + # Bottleneck frame bottleneck_frame = o3d.geometry.PointCloud() bottleneck_frame.points = o3d.utility.Vector3dVector( @@ -247,7 +250,7 @@ def move_to_bottleneck( sphere.paint_uniform_color(color) sphere.translate(point) spheres.append(sphere) - components = [origin_frame, bottleneck_frame] + components = [origin_frame, bottleneck_frame, transformed_frame] components.extend(spheres) o3d.visualization.draw_geometries(components) @@ -267,7 +270,7 @@ def move_to_bottleneck( # components.extend(spheres) # o3d.visualization.draw_geometries(components) - self.move_robot(robot, angles, t) + # self.move_robot(robot, R, t) return error @@ -299,7 +302,7 @@ def generate_unique_colors(n: int) -> List[Tuple[int, int, int]]: if __name__ == "__main__": - robot = RobotClient(robot_ip="10.0.0.14") + robot = RobotClient(robot_ip="10.0.0.2") dinobot = Dinobot() detic = DeticPerception() track_object_id = 41 # detic object id for cup From 851412896daca0b2665eb9763c91eaa6d0fbab52 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Thu, 19 Sep 2024 23:15:16 -0700 Subject: [PATCH 199/410] wip --- .../agent/manipulation/dinobot_prototype.py | 103 ++++++++++++++---- 1 file changed, 84 insertions(+), 19 deletions(-) diff --git 
a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 94ba0c4a3..7fbb5f5e4 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -10,7 +10,6 @@ import sys sys.path.append("/home/hello-robot/repos/dino-vit-features") -sys.path.append("/home/hello-robot/repos/gripper_grasp_space") import time from typing import List, Tuple @@ -21,8 +20,8 @@ from correspondences import find_correspondences, visualize_correspondences from extractor import ViTExtractor from scipy.spatial.transform import Rotation -from urdf_utils import get_stretch_3_urdf +import stretch.motion.constants as constants from stretch.agent import RobotClient from stretch.agent.manipulation.dinobot import ( compute_error, @@ -30,6 +29,7 @@ find_transformation, ) from stretch.perception.detection.detic import DeticPerception +from stretch.visualization.urdf_visualizer import URDFVisualizer DEBUG_VISUALIZATION = False @@ -81,7 +81,7 @@ def __init__(self, model_type: str = "dino_vits8", stride: int = 4): self.feature_extractor = ViTExtractor( model_type=model_type, stride=stride, device=self.device ) - self.urdf_model = get_stretch_3_urdf() + self.urdf = URDFVisualizer() def get_correspondences( self, @@ -157,25 +157,91 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: Move the robot to the bottleneck pose """ model = robot.get_robot_model() - joint_state = robot.get_joint_positions().copy() - ee_pos, ee_rot = model.manip_fk(joint_state) - T_ee = self.urdf_model.get_transform("gripper_camera_color_optical_frame", "base_link") - # Translate and rotate the t and R with T_ee - t_transformed = T_ee[:3, :3] @ t + T_ee[:3, 3] - R_transformed = T_ee[:3, :3] @ R @ np.linalg.inv(T_ee[:3, :3]) + T_d405 = self.urdf.get_transform_fk( + robot.get_joint_positions(), "gripper_camera_color_optical_frame" + ) + T_ee = self.urdf.get_transform_fk(robot.get_joint_positions(), "link_grasp_center") + D405_target = np.eye(4) + D405_target[:3, :3] = R + D405_target[:3, 3] = t + T_d405_target = np.matmul(T_d405, D405_target) + + T_d405_ee = np.matmul(np.linalg.inv(T_d405), T_ee) + T_ee_target = np.matmul(T_d405_target, np.linalg.inv(T_d405_ee)) - print(f"EE Pos: {ee_pos}, EE Rot: {ee_rot}", "R: ", R, "t: ", t) - rot = Rotation.from_matrix(R) + DEBUG = True + if DEBUG: - target_joint_state, success, info = model.manip_ik((ee_pos - t, rot.as_quat())) + origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( + size=0.1, origin=[0, 0, 0] + ) - # # Create a 4x4 transformation matrix from R and t - # transformation_matrix = np.eye(4) - # transformation_matrix[:3, :3] = R - # transformation_matrix[:3, 3] = t + T_d405_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) + T_d405_frame.transform(T_d405) + T_ee_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) + T_ee_frame.transform(T_ee) + + T_target_d405_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) + T_target_d405_frame.transform(T_d405_target) + T_d405_target_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) + T_d405_target_blob.paint_uniform_color([0.15, 0.7, 0.15]) # Green for T_d405 + T_d405_target_blob.translate(T_d405_target[:3, 3]) + + # T_ee_target_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) + # T_ee_target_frame.transform(T_ee_target) + # T_ee_target_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) + # 
T_ee_target_blob.paint_uniform_color([0.15, 0.15, 0.7]) # Blue for T_ee + # T_ee_target_blob.translate(T_ee_target[:3, 3]) + + # Create blobs at the origins of the coordinate frames + origin_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) + origin_blob.paint_uniform_color([1, 0, 0]) # Red for origin + + T_d405_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) + T_d405_blob.paint_uniform_color([0, 1, 0]) # Green for T_d405 + + T_ee_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) + T_ee_blob.paint_uniform_color([0, 0, 1]) # Blue for T_ee + + # Translate blobs to the origins of the coordinate frames + T_d405_blob.translate(T_d405[:3, 3]) + T_ee_blob.translate(T_ee[:3, 3]) + + # Add blobs to the visualization + o3d.visualization.draw_geometries( + [ + origin_frame, + T_d405_frame, + T_ee_frame, + origin_blob, + T_d405_blob, + T_ee_blob, + T_target_d405_frame, + T_d405_target_blob, + ] + # T_ee_target_frame,T_ee_target_blob] + ) + # # Visualize the frames with labels + # o3d.visualization.draw_geometries( + # [origin_frame, T_d405_frame, T_ee_frame] + # ) + return + + T_ee_d405 = self.urdf_model.get_transform( + "link_grasp_center", "gripper_camera_color_optical_frame" + ) + + T_ee_target = np.matmul(T_d405_target, np.linalg.inv(T_ee_d405)) + target_ee_pos = T_ee_target[:3, 3] + target_ee_rot = T_ee_target[:3, :3] + rot = Rotation.from_matrix(target_ee_rot) + joint_state = robot.get_joint_positions().copy() + target_joint_positions, _, _, success, _ = model.manip_ik_for_grasp_frame( + target_ee_pos, rot.as_quat(), q0=joint_state + ) robot.switch_to_manipulation_mode() - robot.arm_to(target_joint_state, blocking=True) + robot.arm_to(target_joint_positions, blocking=True, head=constants.look_at_ee) def move_to_bottleneck( self, @@ -224,7 +290,6 @@ def move_to_bottleneck( print(f"Error: {error}") # Debug 3D Visualization - DEBUG_VISUALIZATION = True if DEBUG_VISUALIZATION: unique_colors = generate_unique_colors(len(points1)) origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( @@ -270,7 +335,7 @@ def move_to_bottleneck( # components.extend(spheres) # o3d.visualization.draw_geometries(components) - # self.move_robot(robot, R, t) + self.move_robot(robot, R, t) return error From dfbe95c839a962cf1a764c708c78ec95c9e38206 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Fri, 20 Sep 2024 00:52:14 -0700 Subject: [PATCH 200/410] wip --- .../agent/manipulation/dinobot_prototype.py | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 7fbb5f5e4..270fcec78 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -160,14 +160,23 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: T_d405 = self.urdf.get_transform_fk( robot.get_joint_positions(), "gripper_camera_color_optical_frame" ) + + # Get the transformation from the bottleneck d405 frame to the target d405 frame T_ee = self.urdf.get_transform_fk(robot.get_joint_positions(), "link_grasp_center") D405_target = np.eye(4) D405_target[:3, :3] = R D405_target[:3, 3] = t T_d405_target = np.matmul(T_d405, D405_target) - T_d405_ee = np.matmul(np.linalg.inv(T_d405), T_ee) - T_ee_target = np.matmul(T_d405_target, np.linalg.inv(T_d405_ee)) + # Get the transformation from the target d405 frame to the target ee frame + T_d405_ee = np.dot(np.linalg.inv(T_d405), T_ee) + T_ee_target 
= T_d405_target.copy() + + # Extract the transformation to target ee frame + R_d405_ee = T_d405_ee[:3, :3] + t_d405_ee = T_d405_ee[:3, 3] + T_ee_target[:3, 3] = T_ee_target[:3, 3] - t_d405_ee + T_ee_target[:3, :3] = np.matmul(T_ee_target[:3, :3], R_d405_ee) DEBUG = True if DEBUG: @@ -176,32 +185,30 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: size=0.1, origin=[0, 0, 0] ) + # Current frames T_d405_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) T_d405_frame.transform(T_d405) + origin_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) + origin_blob.paint_uniform_color([1, 0, 0]) # Red for origin + T_d405_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) + T_d405_blob.paint_uniform_color([0, 1, 0]) # Green for T_d405 T_ee_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) T_ee_frame.transform(T_ee) + T_ee_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) + T_ee_blob.paint_uniform_color([0, 0, 1]) # Blue for T_ee + # Targets T_target_d405_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) T_target_d405_frame.transform(T_d405_target) T_d405_target_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) - T_d405_target_blob.paint_uniform_color([0.15, 0.7, 0.15]) # Green for T_d405 + T_d405_target_blob.paint_uniform_color([0.5, 0.7, 0.5]) # Green for T_d405 T_d405_target_blob.translate(T_d405_target[:3, 3]) - # T_ee_target_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) - # T_ee_target_frame.transform(T_ee_target) - # T_ee_target_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) - # T_ee_target_blob.paint_uniform_color([0.15, 0.15, 0.7]) # Blue for T_ee - # T_ee_target_blob.translate(T_ee_target[:3, 3]) - - # Create blobs at the origins of the coordinate frames - origin_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) - origin_blob.paint_uniform_color([1, 0, 0]) # Red for origin - - T_d405_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) - T_d405_blob.paint_uniform_color([0, 1, 0]) # Green for T_d405 - - T_ee_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) - T_ee_blob.paint_uniform_color([0, 0, 1]) # Blue for T_ee + T_ee_target_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) + T_ee_target_frame.transform(T_ee_target) + T_ee_target_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) + T_ee_target_blob.paint_uniform_color([0.5, 0.5, 0.7]) # Blue for T_ee + T_ee_target_blob.translate(T_ee_target[:3, 3]) # Translate blobs to the origins of the coordinate frames T_d405_blob.translate(T_d405[:3, 3]) @@ -218,8 +225,10 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: T_ee_blob, T_target_d405_frame, T_d405_target_blob, + # ] + T_ee_target_frame, + T_ee_target_blob, ] - # T_ee_target_frame,T_ee_target_blob] ) # # Visualize the frames with labels From b99cebbfc31fe0aeb6363768d995cef62097b828 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Fri, 20 Sep 2024 00:55:52 -0700 Subject: [PATCH 201/410] Ik to bottle neck working --- .../agent/manipulation/dinobot_prototype.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 270fcec78..64fcf4d34 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -178,7 +178,7 @@ def move_robot(self, robot: 
RobotClient, R: np.ndarray, t: np.ndarray) -> float: T_ee_target[:3, 3] = T_ee_target[:3, 3] - t_d405_ee T_ee_target[:3, :3] = np.matmul(T_ee_target[:3, :3], R_d405_ee) - DEBUG = True + DEBUG = False if DEBUG: origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( @@ -225,26 +225,14 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: T_ee_blob, T_target_d405_frame, T_d405_target_blob, - # ] T_ee_target_frame, T_ee_target_blob, ] ) - # # Visualize the frames with labels - # o3d.visualization.draw_geometries( - # [origin_frame, T_d405_frame, T_ee_frame] - # ) - return - - T_ee_d405 = self.urdf_model.get_transform( - "link_grasp_center", "gripper_camera_color_optical_frame" - ) - - T_ee_target = np.matmul(T_d405_target, np.linalg.inv(T_ee_d405)) + # Extract the target end-effector position and rotation target_ee_pos = T_ee_target[:3, 3] - target_ee_rot = T_ee_target[:3, :3] - rot = Rotation.from_matrix(target_ee_rot) + rot = Rotation.from_matrix(T_ee_target[:3, :3]) joint_state = robot.get_joint_positions().copy() target_joint_positions, _, _, success, _ = model.manip_ik_for_grasp_frame( target_ee_pos, rot.as_quat(), q0=joint_state From e618ba7c91049d9450d8a2bc8308c6231bf9fd09 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Fri, 20 Sep 2024 01:14:49 -0700 Subject: [PATCH 202/410] Add fk method to urdf visualizer --- src/stretch/agent/manipulation/dinobot_prototype.py | 5 ++--- src/stretch/visualization/urdf_visualizer.py | 9 +++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 64fcf4d34..153452be2 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -176,10 +176,9 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: R_d405_ee = T_d405_ee[:3, :3] t_d405_ee = T_d405_ee[:3, 3] T_ee_target[:3, 3] = T_ee_target[:3, 3] - t_d405_ee - T_ee_target[:3, :3] = np.matmul(T_ee_target[:3, :3], R_d405_ee) + T_ee_target[:3, :3] = np.dot(T_ee_target[:3, :3], R_d405_ee) - DEBUG = False - if DEBUG: + if DEBUG_VISUALIZATION: origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( size=0.1, origin=[0, 0, 0] diff --git a/src/stretch/visualization/urdf_visualizer.py b/src/stretch/visualization/urdf_visualizer.py index 30a5c243b..430d8f8b1 100644 --- a/src/stretch/visualization/urdf_visualizer.py +++ b/src/stretch/visualization/urdf_visualizer.py @@ -17,6 +17,8 @@ import urchin as urdf_loader from trimesh import Trimesh +from stretch.motion import HelloStretchIdx + pkg_path = str(importlib_resources.files("stretch_urdf")) model_name = "SE3" # RE1V0, RE2V0, SE3 tool_name = "eoa_wrist_dw3_tool_sg3" # eoa_wrist_dw3_tool_sg3, tool_stretch_gripper, etc @@ -136,3 +138,10 @@ def get_transform(self, cfg: dict, link_name: str) -> np.ndarray: lk_cfg["joint_gripper_finger_left"] = cfg["gripper"] lk_cfg["joint_gripper_finger_right"] = cfg["gripper"] return self.urdf.link_fk(lk_cfg, link=link_name) + + def get_transform_fk(self, q, link_name): + state = q.copy() + cfg = {} + for k in HelloStretchIdx.name_to_idx: + cfg[k] = state[HelloStretchIdx.name_to_idx[k]] + return self.get_transform(cfg, link_name) From 562a0ca7cb850296045f2ba8ece855af9e8d62ec Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Fri, 20 Sep 2024 11:13:00 -0700 Subject: [PATCH 203/410] Add comments --- src/stretch/agent/manipulation/dinobot_prototype.py | 11 +++++++++++ 
src/stretch/visualization/rerun.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 153452be2..8e5399c8c 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -155,6 +155,10 @@ def run( def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: """ Move the robot to the bottleneck pose + Args: + robot (RobotClient): The robot client to use + R (np.ndarray): The rotation matrix in D405 frame needed to be applied + t (np.ndarray): The translation vector in D405 frame needed to be applied """ model = robot.get_robot_model() T_d405 = self.urdf.get_transform_fk( @@ -233,9 +237,13 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: target_ee_pos = T_ee_target[:3, 3] rot = Rotation.from_matrix(T_ee_target[:3, :3]) joint_state = robot.get_joint_positions().copy() + + # Compute the IK solution for the target end-effector position and rotation target_joint_positions, _, _, success, _ = model.manip_ik_for_grasp_frame( target_ee_pos, rot.as_quat(), q0=joint_state ) + + # Move the robot to the target joint positions robot.switch_to_manipulation_mode() robot.arm_to(target_joint_positions, blocking=True, head=constants.look_at_ee) @@ -305,6 +313,7 @@ def move_to_bottleneck( points1_frame = o3d.geometry.PointCloud() points1_frame.points = o3d.utility.Vector3dVector(points1) + # Visualize the unique correspondences between bottleneck and live frame spheres = [] for point, color in zip(points1, unique_colors): sphere = o3d.geometry.TriangleMesh.create_sphere(radius=0.01) @@ -375,6 +384,8 @@ def generate_unique_colors(n: int) -> List[Tuple[int, int, int]]: semantic, instance, task_observations = detic.predict(bottleneck_image_rgb) def apply_mask_callback(image: np.ndarray) -> np.ndarray: + # Not applying mask to live image seems to work better + return image semantic, instance, task_observations = detic.predict(image) if track_object_id in task_observations["instance_classes"]: object_mask = semantic == track_object_id diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 950f3677d..9b2a8eb67 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -279,7 +279,7 @@ def log_ee_frame(self, obs): ee_arrow = rr.Arrows3D( origins=[0, 0, 0], vectors=[0.2, 0, 0], radii=0.02, labels="ee", colors=[0, 255, 0, 255] ) - rr.log("world/ee/arrow", ee_arrow) + # rr.log("world/ee/arrow", ee_arrow) rr.log("world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3)) def log_ee_camera(self, servo): From 73b9c39eeae2ec8a54547a6d73b50cf0f9d444d4 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Fri, 20 Sep 2024 11:24:48 -0700 Subject: [PATCH 204/410] Add docstrings --- src/stretch/visualization/rerun.py | 7 +++++++ src/stretch/visualization/urdf_visualizer.py | 5 +++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 950f3677d..9a84804f0 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -182,6 +182,13 @@ def __init__( server_memory_limit: str = "4GB", collapse_panels: bool = True, ): + """Rerun visualizer class + Args: + display_robot_mesh (bool): Display robot mesh + open_browser (bool): Open browser at start + server_memory_limit (str): Server memory limit 
E.g. 2GB or 20% + collapse_panels (bool): Set to false to have customizable rerun panels + """ rr.init("Stretch_robot", spawn=False) rr.serve(open_browser=open_browser, server_memory_limit=server_memory_limit) diff --git a/src/stretch/visualization/urdf_visualizer.py b/src/stretch/visualization/urdf_visualizer.py index 30a5c243b..23a231b7d 100644 --- a/src/stretch/visualization/urdf_visualizer.py +++ b/src/stretch/visualization/urdf_visualizer.py @@ -51,8 +51,9 @@ def get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) -> class URDFVisualizer: - """The `show` method in this class is modified from the - original implementation of `urdf_loader.URDF.show`. + """ + URDF wrapper class to get trimesh objects, link poses and FK transformations + using urchin.urdf_loader """ abs_urdf_file_path = get_absolute_path_stretch_urdf(urdf_file_path, mesh_files_directory_path) From e2af36246d59b01f90a8202265e92c101916349c Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Fri, 20 Sep 2024 18:06:48 -0700 Subject: [PATCH 205/410] Added rerun logging --- .../agent/manipulation/dinobot_prototype.py | 141 ++++++++++-------- 1 file changed, 80 insertions(+), 61 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 8e5399c8c..306f8d538 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -10,12 +10,14 @@ import sys sys.path.append("/home/hello-robot/repos/dino-vit-features") +import math import time from typing import List, Tuple import matplotlib.pyplot as plt import numpy as np import open3d as o3d +import rerun as rr import torch from correspondences import find_correspondences, visualize_correspondences from extractor import ViTExtractor @@ -64,6 +66,15 @@ def load_demo(self, path_to_demo_folder=None): raise NotImplementedError +def Rz(theta): + """ + Rotation matrix about z-axis + """ + return np.matrix( + [[math.cos(theta), -math.sin(theta), 0], [math.sin(theta), math.cos(theta), 0], [0, 0, 1]] + ) + + class Dinobot: """ Dinobot is a class that uses DINO correspondences to move the robot to the bottleneck pose. 
@@ -87,7 +98,7 @@ def get_correspondences( self, image1: np.ndarray, image2: np.ndarray, - num_pairs: int = 20, + num_pairs: int = 10, load_size: int = 224, layer: int = 9, facet: str = "key", @@ -108,7 +119,7 @@ def run( demo: Demo, visualize: bool = False, apply_mask_callback=None, - error_threshold: float = 0.01, + error_threshold: float = 0.03, ): """ Run the Dinobot algorithm @@ -182,70 +193,79 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: T_ee_target[:3, 3] = T_ee_target[:3, 3] - t_d405_ee T_ee_target[:3, :3] = np.dot(T_ee_target[:3, :3], R_d405_ee) - if DEBUG_VISUALIZATION: + base_xyt = robot.get_base_pose() + base_4x4 = np.eye(4) + base_4x4[:3, :3] = Rz(base_xyt[2]) + base_4x4[:2, 3] = base_xyt[:2] + + # Rerun logging + # Log the computed target bottleneck d405 and end-effector frames in the world frame + rr.set_time_seconds("realtime", time.time()) + T_d405_target_world = np.matmul(base_4x4, T_d405_target) + rr.log( + "world/d405_bottleneck_frame", + rr.Points3D( + [0, 0, 0], + colors=[255, 255, 0, 255], + labels="target_d405_bottleneck_frame", + radii=0.01, + ), + static=True, + ) + rr.log( + "world/d405_bottleneck_frame", + rr.Transform3D( + translation=T_d405_target_world[:3, 3], + mat3x3=T_d405_target_world[:3, :3], + axis_length=0.3, + ), + static=True, + ) - origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( - size=0.1, origin=[0, 0, 0] - ) + T_ee_target_world = np.matmul(base_4x4, T_ee_target) + rr.log( + "world/ee_bottleneck_frame", + rr.Points3D( + [0, 0, 0], + colors=[0, 255, 255, 255], + labels="target_ee_bottleneck_frame", + radii=0.01, + ), + static=True, + ) + rr.log( + "world/ee_bottleneck_frame", + rr.Transform3D( + translation=T_ee_target_world[:3, 3], + mat3x3=T_ee_target_world[:3, :3], + axis_length=0.3, + ), + static=True, + ) + rr.log( + "world/target_ee_d405_line", + rr.LineStrips3D([T_ee_target_world[:3, 3], T_d405_target_world[:3, 3]], radii=0.005), + static=True, + ) - # Current frames - T_d405_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) - T_d405_frame.transform(T_d405) - origin_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) - origin_blob.paint_uniform_color([1, 0, 0]) # Red for origin - T_d405_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) - T_d405_blob.paint_uniform_color([0, 1, 0]) # Green for T_d405 - T_ee_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) - T_ee_frame.transform(T_ee) - T_ee_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) - T_ee_blob.paint_uniform_color([0, 0, 1]) # Blue for T_ee - - # Targets - T_target_d405_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) - T_target_d405_frame.transform(T_d405_target) - T_d405_target_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) - T_d405_target_blob.paint_uniform_color([0.5, 0.7, 0.5]) # Green for T_d405 - T_d405_target_blob.translate(T_d405_target[:3, 3]) - - T_ee_target_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) - T_ee_target_frame.transform(T_ee_target) - T_ee_target_blob = o3d.geometry.TriangleMesh.create_sphere(radius=0.02) - T_ee_target_blob.paint_uniform_color([0.5, 0.5, 0.7]) # Blue for T_ee - T_ee_target_blob.translate(T_ee_target[:3, 3]) - - # Translate blobs to the origins of the coordinate frames - T_d405_blob.translate(T_d405[:3, 3]) - T_ee_blob.translate(T_ee[:3, 3]) - - # Add blobs to the visualization - o3d.visualization.draw_geometries( - [ - origin_frame, - T_d405_frame, - T_ee_frame, - 
origin_blob, - T_d405_blob, - T_ee_blob, - T_target_d405_frame, - T_d405_target_blob, - T_ee_target_frame, - T_ee_target_blob, - ] - ) + input("Press Enter to move the robot to the target pose") # Extract the target end-effector position and rotation target_ee_pos = T_ee_target[:3, 3] rot = Rotation.from_matrix(T_ee_target[:3, :3]) joint_state = robot.get_joint_positions().copy() - # Compute the IK solution for the target end-effector position and rotation - target_joint_positions, _, _, success, _ = model.manip_ik_for_grasp_frame( - target_ee_pos, rot.as_quat(), q0=joint_state - ) + try: + # Compute the IK solution for the target end-effector position and rotation + target_joint_positions, _, _, success, _ = model.manip_ik_for_grasp_frame( + target_ee_pos, rot.as_quat(), q0=joint_state + ) - # Move the robot to the target joint positions - robot.switch_to_manipulation_mode() - robot.arm_to(target_joint_positions, blocking=True, head=constants.look_at_ee) + # Move the robot to the target joint positions + robot.switch_to_manipulation_mode() + robot.arm_to(target_joint_positions, blocking=True, head=constants.look_at_ee) + except Exception as e: + print(f"Failed to move the robot to the target pose: {e}") def move_to_bottleneck( self, @@ -288,7 +308,6 @@ def move_to_bottleneck( # Find rigid translation and rotation that aligns the points by minimising error, using SVD. R, t = find_transformation(points1, points2) r = Rotation.from_matrix(R) - angles = r.as_euler("xyz") print(f"Camera frame to Rot: {R}, Trans: {t}") error = compute_error(points1, points2) print(f"Error: {error}") @@ -384,8 +403,6 @@ def generate_unique_colors(n: int) -> List[Tuple[int, int, int]]: semantic, instance, task_observations = detic.predict(bottleneck_image_rgb) def apply_mask_callback(image: np.ndarray) -> np.ndarray: - # Not applying mask to live image seems to work better - return image semantic, instance, task_observations = detic.predict(image) if track_object_id in task_observations["instance_classes"]: object_mask = semantic == track_object_id @@ -400,7 +417,9 @@ def apply_mask_callback(image: np.ndarray) -> np.ndarray: demo = Demo( bottleneck_image_rgb, bottleneck_image_depth, bottleneck_image_camera_K, object_mask ) - input("Displace the object") + print("\nFirst frame is the bottleneck image\n") + print("=================================================") + input("Displace the object and press Enter:") dinobot.run(robot, demo, visualize=True, apply_mask_callback=apply_mask_callback) else: print(f"Object ID: {track_object_id} not found in the image") From a7e03f391a0054ea449f0ff9b17c22e1cc616b28 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Fri, 20 Sep 2024 20:14:49 -0700 Subject: [PATCH 206/410] more dinobot viz fixes --- .../agent/manipulation/dinobot_prototype.py | 145 +++++++++++------- 1 file changed, 87 insertions(+), 58 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 306f8d538..0c61b78d6 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -14,9 +14,9 @@ import time from typing import List, Tuple +import cv2 import matplotlib.pyplot as plt import numpy as np -import open3d as o3d import rerun as rr import torch from correspondences import find_correspondences, visualize_correspondences @@ -33,8 +33,6 @@ from stretch.perception.detection.detic import DeticPerception from stretch.visualization.urdf_visualizer import URDFVisualizer 
-DEBUG_VISUALIZATION = False - class Demo: """ @@ -147,14 +145,38 @@ def run( im1, im2 = visualize_correspondences( points1, points2, demo.bottleneck_image_rgb, ee_rgb ) - fig, axes = plt.subplots(1, 2, figsize=(10, 5)) - axes[0].imshow(im1) - axes[0].set_title("Bottleneck Image") - axes[0].axis("off") - axes[1].imshow(im2) - axes[1].set_title("Live Image") - axes[1].axis("off") - plt.show() + im1 = cv2.putText( + im1, + "Bottleneck Image", + (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1, + (255, 255, 255), + 2, + cv2.LINE_AA, + ) + im2 = cv2.putText( + im2, + "Live Image", + (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1, + (255, 255, 255), + 2, + cv2.LINE_AA, + ) + stacked_images = np.hstack((im1, im2)) + # Enlarge the image for better visualization + stacked_images = cv2.resize(stacked_images, (0, 0), fx=2, fy=2) + cv2.imshow( + "Correspondences", cv2.cvtColor(stacked_images, cv2.COLOR_RGB2BGR) + ) + while True: + if cv2.waitKey(1) & 0xFF == ord("q"): + break + if cv2.getWindowProperty("Correspondences", cv2.WND_PROP_VISIBLE) < 1: + break + cv2.destroyAllWindows() else: print("No correspondences found") # Move the robot to the bottleneck pose @@ -244,10 +266,40 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: ) rr.log( "world/target_ee_d405_line", - rr.LineStrips3D([T_ee_target_world[:3, 3], T_d405_target_world[:3, 3]], radii=0.005), + rr.LineStrips3D( + [T_ee_target_world[:3, 3], T_d405_target_world[:3, 3]], + colors=[255, 165, 0, 255], + radii=0.005, + ), + static=True, + ) + + rr.log( + "world/robot/current_ee_d405_line", + rr.LineStrips3D([T_ee[:3, 3], T_d405[:3, 3]], colors=[128, 0, 128, 255], radii=0.005), static=True, ) + rr.log( + "world/ee_camera", + rr.Points3D( + [0, 0, 0], + colors=[255, 255, 0, 255], + labels="current_d405_frame", + radii=0.01, + ), + static=True, + ) + rr.log( + "world/ee", + rr.Points3D( + [0, 0, 0], + colors=[0, 255, 255, 255], + labels="current_ee_frame", + radii=0.01, + ), + static=True, + ) input("Press Enter to move the robot to the target pose") # Extract the target end-effector position and rotation @@ -312,52 +364,29 @@ def move_to_bottleneck( error = compute_error(points1, points2) print(f"Error: {error}") - # Debug 3D Visualization - if DEBUG_VISUALIZATION: - unique_colors = generate_unique_colors(len(points1)) - origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( - size=0.03, origin=[0, 0, 0] - ) - - transformed_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( - size=0.03, origin=t - ) - transformed_frame.rotate(R, center=(0, 0, 0)) - - # Bottleneck frame - bottleneck_frame = o3d.geometry.PointCloud() - bottleneck_frame.points = o3d.utility.Vector3dVector( - bottleneck_xyz.reshape((bottleneck_xyz.shape[0] * bottleneck_xyz.shape[1], 3)) - ) - points1_frame = o3d.geometry.PointCloud() - points1_frame.points = o3d.utility.Vector3dVector(points1) - - # Visualize the unique correspondences between bottleneck and live frame - spheres = [] - for point, color in zip(points1, unique_colors): - sphere = o3d.geometry.TriangleMesh.create_sphere(radius=0.01) - sphere.paint_uniform_color(color) - sphere.translate(point) - spheres.append(sphere) - components = [origin_frame, bottleneck_frame, transformed_frame] - components.extend(spheres) - o3d.visualization.draw_geometries(components) - - # Live frame - # live_frame = o3d.geometry.PointCloud() - # live_frame.points = o3d.utility.Vector3dVector(live_xyz.reshape((live_xyz.shape[0]*live_xyz.shape[1],3))) - # points2_frame = o3d.geometry.PointCloud() - # 
points2_frame.points = o3d.utility.Vector3dVector(points2) - - # spheres = [] - # for point,color in zip(points2,unique_colors): - # sphere = o3d.geometry.TriangleMesh.create_sphere(radius=0.01) - # sphere.paint_uniform_color(color) - # sphere.translate(point) - # spheres.append(sphere) - # components = [origin_frame,live_frame] - # components.extend(spheres) - # o3d.visualization.draw_geometries(components) + # Rerun logging + # Log correspondences 3d points from live image frame + unique_colors = generate_unique_colors(len(points2)) + colors = np.array(unique_colors) * 255 + T_d405 = self.urdf.get_transform_fk( + robot.get_joint_positions(), "gripper_camera_color_optical_frame" + ) + base_xyt = robot.get_base_pose() + base_4x4 = np.eye(4) + base_4x4[:3, :3] = Rz(base_xyt[2]) + base_4x4[:2, 3] = base_xyt[:2] + T_d405 = np.matmul(base_4x4, T_d405) + rr.set_time_seconds("realtime", time.time()) + rr.log( + "world/perceived_correspondences", + rr.Points3D(points2, colors=colors.astype(np.uint8), radii=0.007), + static=True, + ) + rr.log( + "world/perceived_correspondences", + rr.Transform3D(translation=T_d405[:3, 3], mat3x3=T_d405[:3, :3], axis_length=0.00), + static=True, + ) self.move_robot(robot, R, t) return error From 83fb51bf9cd258b35e1e331a251c9073e3a1e31f Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Fri, 20 Sep 2024 21:54:43 -0700 Subject: [PATCH 207/410] Add custom end effector IK solution --- .../agent/manipulation/dinobot_prototype.py | 37 ++- .../config/urdf/stretch_manip_mode.urdf | 234 ++++++++++++------ src/stretch/motion/base/ik_solver_base.py | 3 +- src/stretch/motion/kinematics.py | 8 +- src/stretch/motion/pinocchio_ik_solver.py | 10 +- 5 files changed, 202 insertions(+), 90 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 0c61b78d6..e45fe6e33 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -136,10 +136,11 @@ def run( ee_rgb = apply_mask_callback(ee_rgb) if not isinstance(demo.bottleneck_image_rgb, type(None)): start = time.perf_counter() + print("Extracting correspondences...") with torch.no_grad(): points1, points2 = self.get_correspondences(demo.bottleneck_image_rgb, ee_rgb) inf_ts = (time.perf_counter() - start) * 1000 - print(f"\n current: {inf_ts} ms") + print(f"\n Total correspondence extraction time: {inf_ts} ms") if visualize: if len(points1) == len(points2): im1, im2 = visualize_correspondences( @@ -185,7 +186,7 @@ def run( print("No bottleneck image found") print("Dinobot finished") - def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: + def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray): """ Move the robot to the bottleneck pose Args: @@ -307,17 +308,27 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray) -> float: rot = Rotation.from_matrix(T_ee_target[:3, :3]) joint_state = robot.get_joint_positions().copy() - try: - # Compute the IK solution for the target end-effector position and rotation - target_joint_positions, _, _, success, _ = model.manip_ik_for_grasp_frame( - target_ee_pos, rot.as_quat(), q0=joint_state - ) - - # Move the robot to the target joint positions - robot.switch_to_manipulation_mode() - robot.arm_to(target_joint_positions, blocking=True, head=constants.look_at_ee) - except Exception as e: - print(f"Failed to move the robot to the target pose: {e}") + # try: + # Compute the IK solution for the target 
end-effector position and rotation + # target_joint_positions, _, _, success, _ = model.manip_ik_for_grasp_frame( + # target_ee_pos, rot.as_quat(), q0=joint_state + # ) + + # Compute the IK solution for the target D405 + target_d405_pos = T_d405_target[:3, 3] + rot = Rotation.from_matrix(T_d405_target[:3, :3]) + joint_state = model._to_manip_format(joint_state) + target_joint_positions, success, _ = model.manip_ik( + pose_query=(target_d405_pos, rot.as_quat()), + q0=joint_state, + custom_ee_frame="gripper_camera_color_optical_frame", + ) + + # Move the robot to the target joint positions + robot.switch_to_manipulation_mode() + robot.arm_to(target_joint_positions, blocking=True, head=constants.look_at_ee) + # except Exception as e: + # print(f"Failed to move the robot to the target pose: {e}") def move_to_bottleneck( self, diff --git a/src/stretch/config/urdf/stretch_manip_mode.urdf b/src/stretch/config/urdf/stretch_manip_mode.urdf index 6f1e01073..e39495907 100644 --- a/src/stretch/config/urdf/stretch_manip_mode.urdf +++ b/src/stretch/config/urdf/stretch_manip_mode.urdf @@ -1,6 +1,6 @@ - + @@ -13,7 +13,7 @@ - + @@ -22,7 +22,7 @@ - + @@ -35,7 +35,7 @@ - + @@ -44,7 +44,7 @@ - + @@ -64,7 +64,7 @@ - + @@ -73,7 +73,7 @@ - + @@ -104,7 +104,7 @@ - + @@ -131,7 +131,7 @@ - + @@ -140,7 +140,7 @@ - + @@ -159,7 +159,7 @@ - + @@ -168,7 +168,7 @@ - + @@ -191,7 +191,7 @@ - + @@ -200,7 +200,7 @@ - + @@ -219,7 +219,7 @@ - + @@ -228,7 +228,7 @@ - + @@ -249,7 +249,7 @@ - + @@ -258,7 +258,7 @@ - + @@ -279,7 +279,7 @@ - + @@ -288,7 +288,7 @@ - + @@ -309,7 +309,7 @@ - + @@ -318,7 +318,7 @@ - + @@ -339,7 +339,7 @@ - + @@ -348,7 +348,7 @@ - + @@ -374,7 +374,7 @@ - + @@ -383,7 +383,7 @@ - + @@ -402,7 +402,7 @@ - + @@ -411,7 +411,7 @@ - + @@ -432,7 +432,7 @@ - + @@ -441,7 +441,7 @@ - + @@ -462,7 +462,7 @@ - + @@ -471,7 +471,7 @@ - + @@ -490,7 +490,7 @@ - + @@ -499,7 +499,7 @@ - + @@ -518,7 +518,7 @@ - + @@ -527,7 +527,7 @@ - + @@ -546,7 +546,7 @@ - + @@ -555,7 +555,7 @@ - + @@ -574,7 +574,7 @@ - + @@ -583,7 +583,7 @@ - + @@ -616,7 +616,7 @@ - + @@ -708,6 +708,98 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -717,7 +809,7 @@ - + @@ -726,7 +818,7 @@ - + @@ -745,7 +837,7 @@ - + @@ -754,7 +846,7 @@ - + @@ -773,7 +865,7 @@ - + @@ -782,7 +874,7 @@ - + @@ -801,7 +893,7 @@ - + @@ -810,7 +902,7 @@ - + @@ -829,7 +921,7 @@ - + @@ -838,7 +930,7 @@ - + @@ -858,7 +950,7 @@ - + @@ -867,7 +959,7 @@ - + @@ -895,7 +987,7 @@ - + @@ -904,7 +996,7 @@ - + @@ -923,7 +1015,7 @@ - + @@ -932,7 +1024,7 @@ - + @@ -952,7 +1044,7 @@ - + @@ -961,7 +1053,7 @@ - + @@ -980,7 +1072,7 @@ - + @@ -989,7 +1081,7 @@ - + @@ -1008,7 +1100,7 @@ - + @@ -1017,7 +1109,7 @@ - + @@ -1037,7 +1129,7 @@ - + @@ -1046,7 +1138,7 @@ - + @@ -1065,7 +1157,7 @@ - + @@ -1074,7 +1166,7 @@ - + @@ -1093,7 +1185,7 @@ - + @@ -1102,7 +1194,7 @@ - + @@ -1128,7 +1220,7 @@ - + @@ -1137,7 +1229,7 @@ - + diff --git a/src/stretch/motion/base/ik_solver_base.py b/src/stretch/motion/base/ik_solver_base.py index b25285429..cea88080e 100644 --- a/src/stretch/motion/base/ik_solver_base.py +++ b/src/stretch/motion/base/ik_solver_base.py @@ -13,7 +13,7 @@ # LICENSE file in the root directory of this source tree. 
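# A minimal usage sketch of the custom_ee_frame argument introduced in this
# patch, mirroring the call made from dinobot_prototype.py above; target_pos
# and target_quat are placeholders for the desired pose of the D405 optical
# frame on the gripper:
#
#     q0 = model._to_manip_format(robot.get_joint_positions())
#     q, success, info = model.manip_ik(
#         pose_query=(target_pos, target_quat),
#         q0=q0,
#         custom_ee_frame="gripper_camera_color_optical_frame",
#     )
#
# When custom_ee_frame is None, the solver falls back to its default
# end-effector frame, so existing callers are unaffected.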
-from typing import Tuple +from typing import Optional, Tuple import numpy as np @@ -44,6 +44,7 @@ def compute_ik( num_attempts: int = 1, verbose: bool = False, ignore_missing_joints: bool = False, + custom_ee_frame: Optional[str] = None, ) -> Tuple[np.ndarray, bool, dict]: """ diff --git a/src/stretch/motion/kinematics.py b/src/stretch/motion/kinematics.py index df976b05f..fdd560e1f 100644 --- a/src/stretch/motion/kinematics.py +++ b/src/stretch/motion/kinematics.py @@ -566,6 +566,7 @@ def manip_ik( update_pb: bool = True, num_attempts: int = 1, verbose: bool = False, + custom_ee_frame: Optional[str] = None, ): """IK in manipulation mode. Takes in a 4x4 pose_query matrix in se(3) and initial configuration of the robot. @@ -590,7 +591,12 @@ def manip_ik( raise NotImplementedError() q, success, debug_info = self.manip_ik_solver.compute_ik( - pos, quat, q0, num_attempts=num_attempts, verbose=verbose + pos, + quat, + q0, + num_attempts=num_attempts, + verbose=verbose, + custom_ee_frame=custom_ee_frame, ) if q is not None and success: diff --git a/src/stretch/motion/pinocchio_ik_solver.py b/src/stretch/motion/pinocchio_ik_solver.py index 81ec1c43c..69dada759 100644 --- a/src/stretch/motion/pinocchio_ik_solver.py +++ b/src/stretch/motion/pinocchio_ik_solver.py @@ -171,6 +171,7 @@ def compute_ik( num_attempts: int = 1, verbose: bool = False, ignore_missing_joints: bool = False, + custom_ee_frame: Optional[str] = None, ) -> Tuple[np.ndarray, bool, dict]: """given end-effector position and quaternion, return joint values. @@ -181,7 +182,8 @@ def compute_ik( max iterations: time budget in number of steps; included for compatibility with pb """ i = 0 - + _ee_frame = self.ee_frame_idx if custom_ee_frame is None else custom_ee_frame + _ee_frame_idx = [f.name for f in self.model.frames].index(_ee_frame) if q_init is None: q = self.q_neutral.copy() if num_attempts > 1: @@ -196,9 +198,9 @@ def compute_ik( desired_ee_pose = pinocchio.SE3(R.from_quat(quat_desired).as_matrix(), pos_desired) while True: pinocchio.forwardKinematics(self.model, self.data, q) - pinocchio.updateFramePlacement(self.model, self.data, self.ee_frame_idx) + pinocchio.updateFramePlacement(self.model, self.data, _ee_frame_idx) - dMi = desired_ee_pose.actInv(self.data.oMf[self.ee_frame_idx]) + dMi = desired_ee_pose.actInv(self.data.oMf[_ee_frame_idx]) err = pinocchio.log(dMi).vector if verbose: print(f"[pinocchio_ik_solver] iter={i}; error={err}") @@ -212,7 +214,7 @@ def compute_ik( self.model, self.data, q, - self.ee_frame_idx, + _ee_frame_idx, pinocchio.ReferenceFrame.LOCAL, ) v = -J.T.dot(np.linalg.solve(J.dot(J.T) + self.DAMP * np.eye(6), err)) From 3aafdc64193c79d14f918bc8fa77b51bb95708d7 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Fri, 20 Sep 2024 22:57:49 -0700 Subject: [PATCH 208/410] Servo to bottleneck working --- .../agent/manipulation/dinobot_prototype.py | 106 ++++++++---------- 1 file changed, 46 insertions(+), 60 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index e45fe6e33..71a59343c 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -33,6 +33,8 @@ from stretch.perception.detection.detic import DeticPerception from stretch.visualization.urdf_visualizer import URDFVisualizer +DEBUG_VERIFICATION = False + class Demo: """ @@ -226,7 +228,7 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray): rr.set_time_seconds("realtime", time.time()) 
T_d405_target_world = np.matmul(base_4x4, T_d405_target) rr.log( - "world/d405_bottleneck_frame", + "world/d405_bottleneck_frame/blob", rr.Points3D( [0, 0, 0], colors=[255, 255, 0, 255], @@ -245,90 +247,74 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray): static=True, ) - T_ee_target_world = np.matmul(base_4x4, T_ee_target) - rr.log( - "world/ee_bottleneck_frame", - rr.Points3D( - [0, 0, 0], - colors=[0, 255, 255, 255], - labels="target_ee_bottleneck_frame", - radii=0.01, - ), - static=True, - ) - rr.log( - "world/ee_bottleneck_frame", - rr.Transform3D( - translation=T_ee_target_world[:3, 3], - mat3x3=T_ee_target_world[:3, :3], - axis_length=0.3, - ), - static=True, - ) - rr.log( - "world/target_ee_d405_line", - rr.LineStrips3D( - [T_ee_target_world[:3, 3], T_d405_target_world[:3, 3]], - colors=[255, 165, 0, 255], - radii=0.005, - ), - static=True, - ) - - rr.log( - "world/robot/current_ee_d405_line", - rr.LineStrips3D([T_ee[:3, 3], T_d405[:3, 3]], colors=[128, 0, 128, 255], radii=0.005), - static=True, + d405_bottleneck_arrow = rr.Arrows3D( + origins=[0, 0, 0], + vectors=[0, 0, 0.2], + radii=0.005, + labels="d405_bottleneck_frame", + colors=[128, 0, 128, 255], ) + rr.log("world/d405_bottleneck_frame/arrow", d405_bottleneck_arrow) rr.log( "world/ee_camera", rr.Points3D( [0, 0, 0], colors=[255, 255, 0, 255], - labels="current_d405_frame", + labels="ee_camera_frame", radii=0.01, ), static=True, ) + + current_d405_arrow = rr.Arrows3D( + origins=[0, 0, 0], + vectors=[0, 0, 0.5], + radii=0.005, + labels="d405_frame", + colors=[255, 105, 180, 255], + ) + rr.log("world/ee_camera/arrow", current_d405_arrow) + rr.log( "world/ee", rr.Points3D( [0, 0, 0], colors=[0, 255, 255, 255], - labels="current_ee_frame", + labels="ee_frame", radii=0.01, ), static=True, ) - input("Press Enter to move the robot to the target pose") + if DEBUG_VERIFICATION: + input("Press Enter to move the robot to the target pose") # Extract the target end-effector position and rotation target_ee_pos = T_ee_target[:3, 3] rot = Rotation.from_matrix(T_ee_target[:3, :3]) joint_state = robot.get_joint_positions().copy() - # try: - # Compute the IK solution for the target end-effector position and rotation - # target_joint_positions, _, _, success, _ = model.manip_ik_for_grasp_frame( - # target_ee_pos, rot.as_quat(), q0=joint_state - # ) - - # Compute the IK solution for the target D405 - target_d405_pos = T_d405_target[:3, 3] - rot = Rotation.from_matrix(T_d405_target[:3, :3]) - joint_state = model._to_manip_format(joint_state) - target_joint_positions, success, _ = model.manip_ik( - pose_query=(target_d405_pos, rot.as_quat()), - q0=joint_state, - custom_ee_frame="gripper_camera_color_optical_frame", - ) - - # Move the robot to the target joint positions - robot.switch_to_manipulation_mode() - robot.arm_to(target_joint_positions, blocking=True, head=constants.look_at_ee) - # except Exception as e: - # print(f"Failed to move the robot to the target pose: {e}") + try: + # Compute the IK solution for the target end-effector position and rotation + # target_joint_positions, _, _, success, _ = model.manip_ik_for_grasp_frame( + # target_ee_pos, rot.as_quat(), q0=joint_state + # ) + + # Compute the IK solution for the target D405 + target_d405_pos = T_d405_target[:3, 3] + rot = Rotation.from_matrix(T_d405_target[:3, :3]) + joint_state = model._to_manip_format(joint_state) + target_joint_positions, success, _ = model.manip_ik( + pose_query=(target_d405_pos, rot.as_quat()), + q0=joint_state, + 
custom_ee_frame="gripper_camera_color_optical_frame", + ) + + # Move the robot to the target joint positions + robot.switch_to_manipulation_mode() + robot.arm_to(target_joint_positions, blocking=True, head=constants.look_at_ee) + except Exception as e: + print(f"Failed to move the robot to the target pose: {e}") def move_to_bottleneck( self, @@ -460,6 +446,6 @@ def apply_mask_callback(image: np.ndarray) -> np.ndarray: print("\nFirst frame is the bottleneck image\n") print("=================================================") input("Displace the object and press Enter:") - dinobot.run(robot, demo, visualize=True, apply_mask_callback=apply_mask_callback) + dinobot.run(robot, demo, visualize=False, apply_mask_callback=apply_mask_callback) else: print(f"Object ID: {track_object_id} not found in the image") From 0e674e52787f071002e48c88a63d115ca71d12b1 Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Fri, 20 Sep 2024 23:08:42 -0700 Subject: [PATCH 209/410] Add debug var --- src/stretch/agent/manipulation/dinobot_prototype.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 71a59343c..080749ad9 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -33,7 +33,7 @@ from stretch.perception.detection.detic import DeticPerception from stretch.visualization.urdf_visualizer import URDFVisualizer -DEBUG_VERIFICATION = False +DEBUG = False class Demo: @@ -286,7 +286,7 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray): ), static=True, ) - if DEBUG_VERIFICATION: + if DEBUG: input("Press Enter to move the robot to the target pose") # Extract the target end-effector position and rotation @@ -417,6 +417,7 @@ def generate_unique_colors(n: int) -> List[Tuple[int, int, int]]: if __name__ == "__main__": + DEBUG = True robot = RobotClient(robot_ip="10.0.0.2") dinobot = Dinobot() detic = DeticPerception() @@ -446,6 +447,6 @@ def apply_mask_callback(image: np.ndarray) -> np.ndarray: print("\nFirst frame is the bottleneck image\n") print("=================================================") input("Displace the object and press Enter:") - dinobot.run(robot, demo, visualize=False, apply_mask_callback=apply_mask_callback) + dinobot.run(robot, demo, visualize=DEBUG, apply_mask_callback=apply_mask_callback) else: print(f"Object ID: {track_object_id} not found in the image") From 7333da3af3a69ed1fd92a708ce9901bf149b409c Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sat, 21 Sep 2024 01:34:33 -0700 Subject: [PATCH 210/410] Data recording and replay working --- .../agent/manipulation/dinobot_prototype.py | 282 +++++++++++++----- 1 file changed, 204 insertions(+), 78 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 080749ad9..7f19c30b9 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -11,9 +11,12 @@ sys.path.append("/home/hello-robot/repos/dino-vit-features") import math +import os +import pickle import time from typing import List, Tuple +import click import cv2 import matplotlib.pyplot as plt import numpy as np @@ -119,7 +122,7 @@ def run( demo: Demo, visualize: bool = False, apply_mask_callback=None, - error_threshold: float = 0.03, + error_threshold: float = 0.17, ): """ Run the Dinobot algorithm @@ -218,81 +221,15 @@ def 
move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray): T_ee_target[:3, 3] = T_ee_target[:3, 3] - t_d405_ee T_ee_target[:3, :3] = np.dot(T_ee_target[:3, :3], R_d405_ee) - base_xyt = robot.get_base_pose() - base_4x4 = np.eye(4) - base_4x4[:3, :3] = Rz(base_xyt[2]) - base_4x4[:2, 3] = base_xyt[:2] - - # Rerun logging - # Log the computed target bottleneck d405 and end-effector frames in the world frame - rr.set_time_seconds("realtime", time.time()) - T_d405_target_world = np.matmul(base_4x4, T_d405_target) - rr.log( - "world/d405_bottleneck_frame/blob", - rr.Points3D( - [0, 0, 0], - colors=[255, 255, 0, 255], - labels="target_d405_bottleneck_frame", - radii=0.01, - ), - static=True, - ) - rr.log( - "world/d405_bottleneck_frame", - rr.Transform3D( - translation=T_d405_target_world[:3, 3], - mat3x3=T_d405_target_world[:3, :3], - axis_length=0.3, - ), - static=True, - ) - - d405_bottleneck_arrow = rr.Arrows3D( - origins=[0, 0, 0], - vectors=[0, 0, 0.2], - radii=0.005, - labels="d405_bottleneck_frame", - colors=[128, 0, 128, 255], - ) - rr.log("world/d405_bottleneck_frame/arrow", d405_bottleneck_arrow) - - rr.log( - "world/ee_camera", - rr.Points3D( - [0, 0, 0], - colors=[255, 255, 0, 255], - labels="ee_camera_frame", - radii=0.01, - ), - static=True, - ) - - current_d405_arrow = rr.Arrows3D( - origins=[0, 0, 0], - vectors=[0, 0, 0.5], - radii=0.005, - labels="d405_frame", - colors=[255, 105, 180, 255], - ) - rr.log("world/ee_camera/arrow", current_d405_arrow) + rerun_log(robot, T_d405_target) + joint_state = robot.get_joint_positions().copy() - rr.log( - "world/ee", - rr.Points3D( - [0, 0, 0], - colors=[0, 255, 255, 255], - labels="ee_frame", - radii=0.01, - ), - static=True, - ) if DEBUG: input("Press Enter to move the robot to the target pose") # Extract the target end-effector position and rotation - target_ee_pos = T_ee_target[:3, 3] - rot = Rotation.from_matrix(T_ee_target[:3, :3]) - joint_state = robot.get_joint_positions().copy() + # target_ee_pos = T_ee_target[:3, 3] + # rot = Rotation.from_matrix(T_ee_target[:3, :3]) try: # Compute the IK solution for the target end-effector position and rotation @@ -300,7 +237,7 @@ def move_robot(self, robot: RobotClient, R: np.ndarray, t: np.ndarray): # target_ee_pos, rot.as_quat(), q0=joint_state # ) - # Compute the IK solution for the target D405 + # Compute the IK solution for the target D405 as custom ee frame target_d405_pos = T_d405_target[:3, 3] rot = Rotation.from_matrix(T_d405_target[:3, :3]) joint_state = model._to_manip_format(joint_state) @@ -377,12 +314,10 @@ def move_to_bottleneck( rr.log( "world/perceived_correspondences", rr.Points3D(points2, colors=colors.astype(np.uint8), radii=0.007), - static=True, ) rr.log( "world/perceived_correspondences", rr.Transform3D(translation=T_d405[:3, 3], mat3x3=T_d405[:3, :3], axis_length=0.00), - static=True, ) self.move_robot(robot, R, t) @@ -416,12 +351,181 @@ def generate_unique_colors(n: int) -> List[Tuple[int, int, int]]: return [tuple(c for c in colors(i)[:3]) for i in range(n)] +class DemoRecorder(Demo): + def __init__( + self, image: np.ndarray, depth: np.ndarray, camera_K: np.ndarray, mask: np.ndarray + ): + super().__init__(image, depth, camera_K, mask) + self.__first_frame = False + self.trajectories = {} + self.start_ts = None + self.__id = 0 + self._delay = 0.5 + self.urdf = URDFVisualizer() + + def _step(self, ee_rgb: np.ndarray, ee_depth: np.ndarray, d405_frame: np.ndarray): + if not self.__first_frame: + self.__first_frame = True + self.start_ts = time.time() + 
self.trajectories[self.__id] = { + "ee_rgb": ee_rgb, + "ee_depth": ee_depth, + "d405_frame": d405_frame, + } + self.__id += 1 + + def save_demo(self, filepath: str = f"./demo_{time.strftime('%Y%m%d-%H%M%S')}"): + demo_data = { + "bottleneck_image_rgb": self.bottleneck_image_rgb, + "bottleneck_image_depth": self.bottleneck_image_depth, + "bottleneck_image_camera_K": self.bottleneck_image_camera_K, + "object_mask": self.object_mask, + "trajectories": self.trajectories, + } + + with open(os.path.join(f"{filepath}.pkl"), "wb") as f: + pickle.dump(demo_data, f) + + def collect_demo( + self, robot: RobotClient, filepath: str = f"./demo_{time.strftime('%Y%m%d-%H%M%S')}" + ): + print("=================================================") + print("Collect Demonstration through back driving on run stop") + print("-------------------------------------------------") + input( + click.style( + "Press Enter to start recording the demonstration frame by frame...", + fg="yellow", + bold=True, + ) + ) + click.secho("Recording demonstration...", fg="green", bold=True) + while True: + servo = robot.get_servo_observation() + ee_rgb = servo.ee_rgb.copy() + ee_depth = servo.ee_depth.copy() + d405_frame = self.urdf.get_transform_fk( + robot.get_joint_positions(), "gripper_camera_color_optical_frame" + ) + self._step(ee_rgb, ee_depth, d405_frame) + if click.confirm("Record Next Frame?"): + continue + else: + self.save_demo(filepath) + click.secho( + f"Demonstration recording finished. N_Frames: {self.__id+1}", + fg="green", + bold=True, + ) + break + + def replay_demo(self, robot: RobotClient): + print("=================================================") + print(" Replay Demonstration") + print("-------------------------------------------------") + for id in self.trajectories: + print(f"Frame ID: {id}") + self.move_to_d405_frame(robot, self.trajectories[id]["d405_frame"]) + time.sleep(self._delay) + + def move_to_d405_frame(self, robot: RobotClient, T_d405_target: np.ndarray): + model = robot.get_robot_model() + # Compute the IK solution for the target D405 as custom ee frame + target_d405_pos = T_d405_target[:3, 3] + rerun_log(robot, T_d405_target) + rot = Rotation.from_matrix(T_d405_target[:3, :3]) + joint_state = robot.get_joint_positions().copy() + joint_state = model._to_manip_format(joint_state) + target_joint_positions, success, _ = model.manip_ik( + pose_query=(target_d405_pos, rot.as_quat()), + q0=joint_state, + custom_ee_frame="gripper_camera_color_optical_frame", + ) + + # Move the robot to the target joint positions + robot.switch_to_manipulation_mode() + robot.arm_to(target_joint_positions, blocking=True, head=constants.look_at_ee) + + +def rerun_log(robot: RobotClient, T_d405_target: np.ndarray): + """ + Rerun logging method + """ + base_xyt = robot.get_base_pose() + base_4x4 = np.eye(4) + base_4x4[:3, :3] = Rz(base_xyt[2]) + base_4x4[:2, 3] = base_xyt[:2] + + # Rerun logging + # Log the computed target bottleneck d405 and end-effector frames in the world frame + rr.set_time_seconds("realtime", time.time()) + T_d405_target_world = np.matmul(base_4x4, T_d405_target) + rr.log( + "world/d405_bottleneck_frame/blob", + rr.Points3D( + [0, 0, 0], + colors=[255, 255, 0, 255], + labels="target_d405_bottleneck_frame", + radii=0.01, + ), + ) + rr.log( + "world/d405_bottleneck_frame", + rr.Transform3D( + translation=T_d405_target_world[:3, 3], + mat3x3=T_d405_target_world[:3, :3], + axis_length=0.3, + ), + ) + + d405_bottleneck_arrow = rr.Arrows3D( + origins=[0, 0, 0], + vectors=[0, 0, 0.2], + radii=0.0025, + 
labels="d405_bottleneck_frame", + colors=[128, 0, 128, 255], + ) + rr.log("world/d405_bottleneck_frame/arrow", d405_bottleneck_arrow) + + rr.log( + "world/ee_camera", + rr.Points3D( + [0, 0, 0], + colors=[255, 255, 0, 255], + labels="ee_camera_frame", + radii=0.01, + ), + ) + + current_d405_arrow = rr.Arrows3D( + origins=[0, 0, 0], + vectors=[0, 0, 0.5], + radii=0.005, + labels="d405_frame", + colors=[255, 105, 180, 255], + ) + rr.log("world/ee_camera/arrow", current_d405_arrow) + + rr.log( + "world/ee", + rr.Points3D( + [0, 0, 0], + colors=[0, 255, 255, 255], + labels="ee_frame", + radii=0.01, + ), + ) + + if __name__ == "__main__": - DEBUG = True + DEBUG = False robot = RobotClient(robot_ip="10.0.0.2") dinobot = Dinobot() + error_threshold = 0.17 detic = DeticPerception() track_object_id = 41 # detic object id for cup + demo = DemoRecorder + # demo = None # First frame is the bottleneck image for now bottleneck_image_rgb = robot.get_servo_observation().ee_rgb @@ -441,12 +545,34 @@ def apply_mask_callback(image: np.ndarray) -> np.ndarray: if track_object_id in task_observations["instance_classes"]: object_mask = semantic == track_object_id - demo = Demo( + + # Collect demonstration + demo = DemoRecorder( bottleneck_image_rgb, bottleneck_image_depth, bottleneck_image_camera_K, object_mask ) + demo.collect_demo(robot) + print("\nFirst frame is the bottleneck image\n") print("=================================================") - input("Displace the object and press Enter:") - dinobot.run(robot, demo, visualize=DEBUG, apply_mask_callback=apply_mask_callback) + input( + click.style( + "Displace the object and backdrive robot to desired location and press Enter to replay trajectory:", + fg="yellow", + bold=True, + ) + ) + + print("Visually servoing to the bottleneck pose...") + # Visual Servo to bottleneck pose + dinobot.run( + robot, + demo, + visualize=DEBUG, + apply_mask_callback=apply_mask_callback, + error_threshold=error_threshold, + ) + print("Replaying 3D trajectories...") + # Replay demonstration + demo.replay_demo(robot) else: print(f"Object ID: {track_object_id} not found in the image") From a840a38794a8d5774a9117f7dac64a3de35865fc Mon Sep 17 00:00:00 2001 From: Mohamed Fazil Date: Sat, 21 Sep 2024 02:27:13 -0700 Subject: [PATCH 211/410] Instruction clean up --- .../agent/manipulation/dinobot_prototype.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/stretch/agent/manipulation/dinobot_prototype.py b/src/stretch/agent/manipulation/dinobot_prototype.py index 7f19c30b9..c0c1414d9 100644 --- a/src/stretch/agent/manipulation/dinobot_prototype.py +++ b/src/stretch/agent/manipulation/dinobot_prototype.py @@ -360,7 +360,7 @@ def __init__( self.trajectories = {} self.start_ts = None self.__id = 0 - self._delay = 0.5 + self._delay = 2 self.urdf = URDFVisualizer() def _step(self, ee_rgb: np.ndarray, ee_depth: np.ndarray, d405_frame: np.ndarray): @@ -524,8 +524,16 @@ def rerun_log(robot: RobotClient, T_d405_target: np.ndarray): error_threshold = 0.17 detic = DeticPerception() track_object_id = 41 # detic object id for cup - demo = DemoRecorder - # demo = None + + print("\nFirst frame is the bottleneck image\n") + print("=================================================") + input( + click.style( + "Displace the object and backdrive robot to desired location and press Enter to collect demo:", + fg="yellow", + bold=True, + ) + ) # First frame is the bottleneck image for now bottleneck_image_rgb = robot.get_servo_observation().ee_rgb @@ -551,17 +559,13 @@ 
def apply_mask_callback(image: np.ndarray) -> np.ndarray: bottleneck_image_rgb, bottleneck_image_depth, bottleneck_image_camera_K, object_mask ) demo.collect_demo(robot) - - print("\nFirst frame is the bottleneck image\n") - print("=================================================") input( click.style( - "Displace the object and backdrive robot to desired location and press Enter to replay trajectory:", + "Displace the object and backdrive robot to another location and press Enter to try a grasp:", fg="yellow", bold=True, ) ) - print("Visually servoing to the bottleneck pose...") # Visual Servo to bottleneck pose dinobot.run( From 524d436a96843dabe0dd205b3027d5627b365ddd Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Mon, 23 Sep 2024 09:07:19 -0400 Subject: [PATCH 212/410] update --- src/stretch/config/default_planner.yaml | 2 +- src/stretch/config/dynav_config.yaml | 4 +- src/stretch/dynav/llm_localizer.py | 236 ++++++++++-------- src/stretch/dynav/llm_server.py | 64 +++-- src/stretch/dynav/mapping_utils/a_star.py | 3 +- src/stretch/dynav/mapping_utils/voxel.py | 2 +- src/stretch/dynav/mapping_utils/voxel_map.py | 4 +- .../dynav/ok_robot_hw/utils/grasper_utils.py | 2 +- src/stretch/dynav/robot_agent_manip_mdp.py | 19 +- src/stretch/dynav/run_manip.py | 16 +- src/stretch/dynav/run_ok_nav.py | 20 +- src/stretch/dynav/voxel_map_localizer.py | 3 + src/stretch/dynav/voxel_map_server.py | 59 +++-- 13 files changed, 252 insertions(+), 182 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 2586e368c..e42dfce02 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -12,7 +12,7 @@ voxel_size: 0.05 # 0.05 seems too low obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) -obs_min_density: 10 # This many points makes it an obstacle +obs_min_density: 5 # This many points makes it an obstacle # Padding # pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles diff --git a/src/stretch/config/dynav_config.yaml b/src/stretch/config/dynav_config.yaml index e5955a1a7..95c1d8e73 100644 --- a/src/stretch/config/dynav_config.yaml +++ b/src/stretch/config/dynav_config.yaml @@ -8,7 +8,7 @@ open_vocab_cat_map_file: projects/real_world_ovmm/configs/example_cat_map.json voxel_size: 0.1 obs_min_height: 0.2 # Ignore things less than this high when planning motions obs_max_height: 1.5 # Ignore things over this height (eg ceilings) -obs_min_density: 15 # This many points makes it an obstacle +obs_min_density: 5 # This many points makes it an obstacle exp_min_density: 1 # Padding @@ -26,7 +26,7 @@ filters: use_median_filter: True median_filter_size: 2 median_filter_max_error: 0.01 - use_derivative_filter: True + use_derivative_filter: False derivative_filter_threshold: 0.1 # use_voxel_filter: True diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py index a0d273e28..4dd767e20 100644 --- a/src/stretch/dynav/llm_localizer.py +++ b/src/stretch/dynav/llm_localizer.py @@ -101,13 +101,28 @@ def get_xyz(depth, pose, intrinsics): return xyz + class LLM_Localizer(): - def __init__(self, voxel_map_wrapper = None, exist_model = 'gpt-4o', loc_model = 'owlv2', device = 'cuda'): + def __init__(self, voxel_map_wrapper = None, exist_model = 'gemini-1.5-pro', loc_model = 'owlv2', device = 'cuda'): self.voxel_map_wrapper = voxel_map_wrapper self.device = device self.voxel_pcd = 
VoxelizedPointcloud(voxel_size=0.2).to(self.device) - # self.exist_model = YOLOWorld("yolov8l-worldv2.pt") self.existence_checking_model = exist_model + + self.api_key_1 = os.getenv('GOOGLE_API_KEY') + self.api_key_2 = os.getenv('GOOGLE_API_KEY_2') + self.api_key_3 = os.getenv('GOOGLE_API_KEY_3') + self.api_key = self.api_key_1 + + + self.context_length = 60 + self.count_threshold = 3 + if 'gpt' in self.existence_checking_model: + self.max_img_per_request = 30 + else: + self.max_img_per_request = 200 + + if exist_model == 'gpt-4o': print('WE ARE USING OPENAI GPT4o') self.gpt_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY')) @@ -148,7 +163,7 @@ def add(self, def compute_coord(self, text, image_info, threshold = 0.2): rgb = image_info['image'] - inputs = self.exist_processor(text=[['a photo of a ' + text]], images=rgb, return_tensors="pt") + inputs = self.exist_processor(text=text, images=rgb, return_tensors="pt") for input in inputs: inputs[input] = inputs[input].to('cuda') @@ -161,162 +176,185 @@ def compute_coord(self, text, image_info, threshold = 0.2): )[0] depth = image_info['depth'] xyzs = image_info['xyz'] + temp_lst = [] for idx, (score, bbox) in enumerate(sorted(zip(results['scores'], results['boxes']), key=lambda x: x[0], reverse=True)): tl_x, tl_y, br_x, br_y = bbox w, h = depth.shape tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) if np.median(depth[tl_y: br_y, tl_x: br_x].reshape(-1)) < 3: - return torch.from_numpy(np.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), axis = 0)) + coordinate = torch.from_numpy(np.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), axis = 0)) + # temp_lst.append(coordinate) + return coordinate return None + # return temp_lst def owl_locater(self, A, encoded_image, timestamps_lst): + # query_coord_lst = [] + # # dbscan = DBSCAN(eps=1.5, min_samples=1) + # centroid = None for i in timestamps_lst: - if i not in encoded_image: - continue - image_info = encoded_image[i][-1] - res = self.compute_coord(A, image_info, threshold=0.2) - if res is not None: - debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' - return res, debug_text, i, None - + if i in encoded_image: + image_info = encoded_image[i][-1] + res = self.compute_coord(A, image_info, threshold=0.2) + if res is not None: + debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' + return res, debug_text, i, None debug_text = '#### - All instances are not the target! Maybe target object has not been observed yet. 
**😭**\n' + + # if query_coord_lst != []: + # query_coord_lst = np.array(query_coord_lst) + # dbscan.fit(query_coord_lst) + # labels = dbscan.labels_ + # unique_labels = set(labels) - {-1} + # largest_cluster_label = None + # largest_cluster_size = 0 + # for label in unique_labels: + # cluster_size = np.sum(labels == label) + # if cluster_size > largest_cluster_size: + # largest_cluster_size = cluster_size + # largest_cluster_label = label + # largest_cluster_points = query_coord_lst[labels == largest_cluster_label] + # centroid = largest_cluster_points.mean(axis=0) + # return centroid, debug_text, None, None return None, debug_text, None, None - def gpt_chunk(self, chunk, sys_prompt, user_prompt): - for i in range(3): + def process_chunk(self, chunk, sys_prompt, user_prompt): + for i in range(50): try: - response = self.gpt_client.chat.completions.create( - model=self.existence_checking_model, - messages=[ - {"role": "system", "content": sys_prompt}, - {"role": "user", "content": user_prompt}, - {"role": "user", "content": chunk} - ], - temperature=0.0, - ) - timestamps = response.choices[0].message.content - if 'None' in timestamps: - return None + if 'gpt' in self.existence_checking_model: + start_time = time.time() + response = self.gpt_client.chat.completions.create( + model=self.existence_checking_model, + messages=[ + {"role": "system", "content": sys_prompt}, + {"role": "user", "content": user_prompt}, + {"role": "user", "content": chunk} + ], + temperature=0.0, + ).choices[0].message.content + + end_time = time.time() + print('GPT request cost:', end_time-start_time) else: - return list(map(int, timestamps.replace(' ', '').split(':')[1].split(','))) + model = genai.GenerativeModel(model_name=f"models/{self.existence_checking_model}-exp-0827", system_instruction=sys_prompt) + # "models/{self.existence_checking_model}-exp-0827" + response = model.generate_content(chunk + [user_prompt], generation_config=generation_config, safety_settings=safety_settings).text + # print("Assistant: ", response) + return response except Exception as e: + if self.api_key == self.api_key_1: + self.api_key = self.api_key_2 + elif self.api_key == self.api_key_2: + self.api_key = self.api_key_3 + elif self.api_key == self.api_key_3: + self.api_key = self.api_key_1 + genai.configure(api_key=self.api_key) print(f"Error: {e}") - time.sleep(10) + # time.sleep(10) + print("Execution Failed") return "Execution Failed" - def gemini_chunk(self, chunk, sys_prompt, user_prompt): - if self.existence_checking_model == 'gemini-1.5-pro': - model_name="models/gemini-1.5-pro-exp-0827" - elif self.existence_checking_model == 'gemini-1.5-flash': - model_name="models/gemini-1.5-flash-exp-0827" - - for i in range(3): - try: - model = genai.GenerativeModel(model_name=model_name, system_instruction=sys_prompt) - timestamps = model.generate_content(chunk + [user_prompt], generation_config=generation_config, safety_settings=safety_settings).text - timestamps = timestamps.split('\n')[0] - if 'None' in timestamps: - return None - else: - return list(map(int, timestamps.replace(' ', '').split(':')[1].split(','))) - except Exception as e: - print(f"Error: {e}") - time.sleep(10) - return "Execution Failed" - - - - def llm_locator(self, A, encoded_image, process_chunk, context_length = 30): - timestamps_lst = [] - + def update_dict(self, A, timestamps_dict, content): + timestamps_lst = content.split('\n') + for item in timestamps_lst: + if len(item) < 3 or ':' not in item: + continue + key, value_str = item.split(':') + if A not in key: 
+ continue + if 'None' in value_str: + value = None + else: + value = list(map(int, value_str.replace(' ', '').split(','))) + if key in timestamps_dict: + if timestamps_dict[key] is None: + timestamps_dict[key] = value + elif value is not None: + timestamps_dict[key].extend(value) + else: + timestamps_dict[key] = value + + def llm_locator(self, A, encoded_image): + timestamps_dict = {} + sys_prompt = f""" - For each object query provided, list at most 10 timestamps that the object is most clearly shown. If the object does not appear, simply output the object name and the word "None" for the timestamp. Avoid any unnecessary explanations or additional formatting. - + For object query I give, you need to find timestamps of images that the object is shown, without any unnecessary explanation or space. If the object never exist, please output the object name and the word "None" for timestamps. + Example: Input: - The object you need to find are cat, car + The object you need to find is blue bottle, orange chair, white box Output: - cat: 1,4,6,9 - car: None - """ + blue bottle: 3,6,9 + orange chair: None + white box: 2,4,10""" - user_prompt = f"""The object you need to find are {A}, car, bottle, shoes, watch, clothes, desk, chair, shoes, cup""" + user_prompt = f"""The object you need to find is {A}""" if 'gpt' in self.existence_checking_model: - content = [item for sublist in list(encoded_image.values()) for item in sublist[:2]][-120:] # adjust to [-60:] for taking only the last 30 and have faster speed - elif 'gemini' in self.existence_checking_model: - content = [item for sublist in list(encoded_image.values()) for item in sublist[0]][-120:] - content_chunks = [content[i:i + 2 * context_length] for i in range(0, len(content), 2 * context_length)] + content = [item for sublist in list(encoded_image.values()) for item in sublist[:2]][-self.context_length*2:] + else: + content = [item for sublist in list(encoded_image.values()) for item in sublist[0]][-self.context_length*2:] + + content_chunks = [content[i:i + 2* self.max_img_per_request] for i in range(0, len(content), 2* self.max_img_per_request)] - with ThreadPoolExecutor(max_workers=2) as executor: - future_to_chunk = {executor.submit(process_chunk, chunk, sys_prompt, user_prompt): chunk for chunk in content_chunks} + with ThreadPoolExecutor(max_workers=5) as executor: + future_to_chunk = {executor.submit(self.process_chunk, chunk, sys_prompt, user_prompt): chunk for chunk in content_chunks} for future in as_completed(future_to_chunk): chunk = future_to_chunk[future] try: result = future.result() if result: - timestamps_lst.extend(result) + self.update_dict(A, timestamps_dict, result) except Exception as e: print(f"Exception occurred: {e}") + if A not in timestamps_dict: + return None, 'debug_text', None, None + + timestamps_lst = timestamps_dict[A] + if timestamps_lst is None: + return None, 'debug_text', None, None timestamps_lst = sorted(timestamps_lst, reverse=True) - # print(A, timestamps_lst) + # return None return self.owl_locater(A, encoded_image, timestamps_lst) + - def localize_A(self, A, debug = True, return_debug = False, count_threshold = -1): - start_time = time.time() - + def localize_A(self, A, debug = True, return_debug = False): encoded_image = OrderedDict() - counts = torch.bincount(self.voxel_pcd._obs_counts) - cur_obs = max(self.voxel_pcd._obs_counts) - filtered_obs = (counts > count_threshold).nonzero(as_tuple=True)[0].tolist() - # filtered_obs = sorted(set(filtered_obs + [i for i in range(cur_obs-10, cur_obs+1)])) + 
filtered_obs = (counts > self.count_threshold).nonzero(as_tuple=True)[0].tolist() filtered_obs = sorted(filtered_obs) - # filtered_obs = (counts <= count_threshold).nonzero(as_tuple=True)[0].tolist() - # filtered_obs = [obs for obs in filtered_obs if (cur_obs - obs) >= 10] - - if 'gemini' in self.existence_checking_model: - process_chunk = self.gemini_chunk - context_length = 60 - elif 'gpt' in self.existence_checking_model: - process_chunk = self.gpt_chunk - context_length = 30 - for obs_id in filtered_obs: obs_id -= 1 - rgb = self.voxel_map_wrapper.observations[obs_id].rgb.numpy() + rgb = np.copy(self.voxel_map_wrapper.observations[obs_id].rgb.numpy()) depth = self.voxel_map_wrapper.observations[obs_id].depth camera_pose = self.voxel_map_wrapper.observations[obs_id].camera_pose camera_K = self.voxel_map_wrapper.observations[obs_id].camera_K - - # full_world_xyz = unproject_masked_depth_to_xyz_coordinates( # Batchable! - # depth=depth.unsqueeze(0).unsqueeze(1), - # pose=camera_pose.unsqueeze(0), - # inv_intrinsics=torch.linalg.inv(camera_K[:3, :3]).unsqueeze(0), - # ) xyz = get_xyz(depth, camera_pose, camera_K)[0] - # print(full_world_xyz.shape) depth = depth.numpy() - # rgb[depth > 2.5] = [0, 0, 0] + + rgb[depth > 2.5] = [0, 0, 0] + image = Image.fromarray(rgb.astype(np.uint8), mode='RGB') if 'gemini' in self.existence_checking_model: - encoded_image[obs_id] = [[f"Following is the image took on timestep {obs_id}: ", image], {'image':image, 'xyz': xyz, 'depth':depth}] + encoded_image[obs_id] = [[f"Following is the image took on timestamp {obs_id}: ", image], {'image':image, 'xyz': xyz, 'depth':depth}] elif 'gpt' in self.existence_checking_model: buffered = BytesIO() image.save(buffered, format="PNG") img_bytes = buffered.getvalue() base64_encoded = base64.b64encode(img_bytes).decode('utf-8') - encoded_image[obs_id] = [{"type": "text", "text": f"Following is the image took on timestep {obs_id}"}, + encoded_image[obs_id] = [{"type": "text", "text": f"Following is the image took on timestamp {obs_id}: "}, {"type": "image_url", "image_url": { "url": f"data:image/png;base64,{base64_encoded}"} }, {'image':image, 'xyz':xyz, 'depth':depth}] - target_point, debug_text, obs, point = self.llm_locator(A, encoded_image, process_chunk, context_length) - + # print(obs_id) + + start_time = time.time() + target_point, debug_text, obs, point = self.llm_locator(A, encoded_image) end_time = time.time() - print('visual grounding takes', end_time - start_time, 'seconds') + print('It takes', end_time - start_time) if not debug: return target_point diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py index 85c2b01ac..de83a4a6d 100644 --- a/src/stretch/dynav/llm_server.py +++ b/src/stretch/dynav/llm_server.py @@ -110,12 +110,16 @@ def __init__(self, text_port = 5556, open_communication = True, rerun = True, - static = True + static = True, + log = None ): self.static = static self.siglip = siglip current_datetime = datetime.datetime.now() - self.log = 'debug_' + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + if log is None: + self.log = 'debug_' + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + else: + self.log = log self.rerun = rerun if self.rerun: rr.init(self.log, spawn = True) @@ -260,14 +264,14 @@ def process_text(self, text, start_pose): debug_text = '# Robot\'s monologue: \n' + debug_text rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) - # if obs is not None and mode == 'navigation': - # rgb = 
self.voxel_map.observations[obs - 1].rgb - # if not self.rerun: - # if isinstance(rgb, torch.Tensor): - # rgb = np.array(rgb) - # cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) - # else: - # rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) + if obs is not None and mode == 'navigation': + rgb = self.voxel_map.observations[obs].rgb + if not self.rerun: + if isinstance(rgb, torch.Tensor): + rgb = np.array(rgb) + cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) + else: + rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) traj = [] waypoints = None @@ -422,6 +426,8 @@ def run_llm_owl(self, rgb, mask, world_xyz): self.add_to_voxel_pcd(valid_xyz, None, valid_rgb) def read_from_pickle(self, pickle_file_name, num_frames: int = -1): + print('Reading from ', pickle_file_name) + rr.init('Debug', spawn = True) if isinstance(pickle_file_name, str): pickle_file_name = Path(pickle_file_name) assert pickle_file_name.exists(), f"No file found at {pickle_file_name}" @@ -452,16 +458,15 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): if num_frames > 0 and i >= num_frames: break - if rgb is None: - continue - camera_pose = self.voxel_map.fix_data_type(camera_pose) xyz = self.voxel_map.fix_data_type(xyz) rgb = self.voxel_map.fix_data_type(rgb) depth = self.voxel_map.fix_data_type(depth) + intrinsics = self.voxel_map.fix_data_type(K) if feats is not None: feats = self.voxel_map.fix_data_type(feats) base_pose = self.voxel_map.fix_data_type(base_pose) + self.voxel_map.voxel_pcd.clear_points(depth, intrinsics, camera_pose) self.voxel_map.add( camera_pose=camera_pose, xyz=xyz, @@ -481,6 +486,7 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): self.voxel_map_localizer.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() self.voxel_map.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() + def write_to_pickle(self): """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" if not os.path.exists('debug'): @@ -495,31 +501,17 @@ def write_to_pickle(self): data["rgb"] = [] data["depth"] = [] data["feats"] = [] - - image_ids = torch.unique(self.voxel_map.voxel_pcd._obs_counts).tolist() - - for idx, frame in enumerate(self.voxel_map.observations): + for frame in self.voxel_map.observations: # add it to pickle # TODO: switch to using just Obs struct? 
- if idx in image_ids: - data["camera_poses"].append(frame.camera_pose) - data["camera_K"].append(frame.camera_K) - data["rgb"].append(frame.rgb) - data["depth"].append(frame.depth) - else: - data["camera_poses"].append(None) - data["camera_K"].append(None) - data["rgb"].append(None) - data["depth"].append(None) - # We might not need this - # data["xyz"].append(frame.xyz) - # data["world_xyz"].append(frame.full_world_xyz) - # data["feats"].append(frame.feats) - # data["base_poses"].append(frame.base_pose) - data["xyz"].append(None) - data["world_xyz"].append(None) - data["feats"].append(None) - data["base_poses"].append(None) + data["camera_poses"].append(frame.camera_pose) + data["base_poses"].append(frame.base_pose) + data["camera_K"].append(frame.camera_K) + data["xyz"].append(frame.xyz) + data["world_xyz"].append(frame.full_world_xyz) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + data["feats"].append(frame.feats) for k, v in frame.info.items(): if k not in data: data[k] = [] diff --git a/src/stretch/dynav/mapping_utils/a_star.py b/src/stretch/dynav/mapping_utils/a_star.py index 4fa4317cd..d7172df71 100644 --- a/src/stretch/dynav/mapping_utils/a_star.py +++ b/src/stretch/dynav/mapping_utils/a_star.py @@ -55,6 +55,7 @@ def reset(self): obs, exp = self.space.voxel_map.get_2d_map() # print('up to date navigable map loaded') self._navigable = ~obs & exp + # self._navigable = ~obs self.start_time = time.time() def verify_path(self, path) -> bool: @@ -95,7 +96,7 @@ def compute_obstacle_punishment(self, a: Tuple[int, int], weight: int, avoid: in return obstacle_punishment # A* heuristic - def compute_heuristic(self, a: Tuple[int, int], b: Tuple[int, int], weight = 12, avoid = 3) -> float: + def compute_heuristic(self, a: Tuple[int, int], b: Tuple[int, int], weight = 6, avoid = 3) -> float: return self.compute_dis(a, b) + weight * self.compute_obstacle_punishment(a, weight, avoid)\ + self.compute_obstacle_punishment(b, weight, avoid) diff --git a/src/stretch/dynav/mapping_utils/voxel.py b/src/stretch/dynav/mapping_utils/voxel.py index 22ea48d36..9fdbd6334 100644 --- a/src/stretch/dynav/mapping_utils/voxel.py +++ b/src/stretch/dynav/mapping_utils/voxel.py @@ -54,7 +54,7 @@ VALID_FRAMES = ["camera", "world"] -DEFAULT_GRID_SIZE = [200, 200] +DEFAULT_GRID_SIZE = [180, 180] logger = logging.getLogger(__name__) diff --git a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py index dff9f70fe..ed834a8de 100644 --- a/src/stretch/dynav/mapping_utils/voxel_map.py +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -660,7 +660,7 @@ def sample_exploration( plt.show() return index, time_heuristics, alignments_heuristics, total_heuristics - def _alignment_heuristic(self, alignments, outside_frontier, alignment_smooth = 20, alignment_threshold = 0.13, debug = False): + def _alignment_heuristic(self, alignments, outside_frontier, alignment_smooth = 50, alignment_threshold = 0.12, debug = False): alignments = np.ma.masked_array(alignments, ~outside_frontier) alignment_heuristics = 1 / (1 + np.exp(-alignment_smooth * (alignments - alignment_threshold))) index = np.unravel_index(np.argmax(alignment_heuristics), alignments.shape) @@ -672,7 +672,7 @@ def _alignment_heuristic(self, alignments, outside_frontier, alignment_smooth = plt.show() return alignment_heuristics - def _time_heuristic(self, history_soft, outside_frontier, time_smooth = 0.1, time_threshold = 24, debug = False): + def _time_heuristic(self, history_soft, outside_frontier, time_smooth 
= 0.1, time_threshold = 15, debug = False): history_soft = np.ma.masked_array(history_soft, ~outside_frontier) time_heuristics = history_soft.max() - history_soft time_heuristics[history_soft < 1] = float('inf') diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index 3bc571037..f27de8c02 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -167,7 +167,7 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height pin_base_point = apply_se3_transform(pin_cam2base_transform, pin_point) # print(f"pin base point {pin_base_point}") - diff_value = (0.227 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip + diff_value = (0.226 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip pin_transformed_point1[2] -= (diff_value) ref_diff = (diff_value) diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 36a2c3af0..858b1c83e 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -25,8 +25,8 @@ from stretch.dynav.ok_robot_hw.camera import RealSenseCamera from stretch.dynav.ok_robot_hw.utils.grasper_utils import pickup, move_to_point, capture_and_process_image from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array -# from stretch.dynav.voxel_map_server import ImageProcessor -from stretch.dynav.llm_server import ImageProcessor + +# from stretch.dynav.llm_server import ImageProcessor import cv2 @@ -43,10 +43,13 @@ def __init__( robot: RobotClient, parameters: Dict[str, Any], ip: str, - image_port: int = 5555, + image_port: int = 5558, text_port: int = 5556, manip_port: int = 5557, re: int = 1, + env_num: int = 1, + test_num: int = 1, + method: str = 'dynamem' ): print('------------------------YOU ARE NOW RUNNING PEIQI CODES V3-----------------') self.re = re @@ -77,7 +80,12 @@ def __init__( ) self.image_sender = ImageSender(ip = ip, image_port = image_port, text_port = text_port, manip_port = manip_port) - self.image_processor = ImageProcessor(rerun = True, static = False) + if method == 'dynamem': + from stretch.dynav.voxel_map_server import ImageProcessor + self.image_processor = ImageProcessor(rerun = True, static = False, log = 'env' + str(env_num) + '_' + str(test_num)) + elif method == 'mllm': + from stretch.dynav.llm_server import ImageProcessor + self.image_processor = ImageProcessor(rerun = True, static = False, log = 'env' + str(env_num) + '_' + str(test_num)) self.look_around_times = [] self.execute_times = [] @@ -90,7 +98,10 @@ def look_around(self): print("*" * 10, "Look around to check", "*" * 10) for pan in [0.4, -0.4, -1.2]: for tilt in [-0.6]: + start_time = time.time() self.robot.head_to(pan, tilt, blocking = True) + end_time = time.time() + print('moving head takes ', end_time - start_time, 'seconds.') self.update() def rotate_in_place(self): diff --git a/src/stretch/dynav/run_manip.py b/src/stretch/dynav/run_manip.py index 8d4f18e79..3f63d4a37 100644 --- a/src/stretch/dynav/run_manip.py +++ b/src/stretch/dynav/run_manip.py @@ -63,15 +63,15 @@ def main( ) while input('STOP? 
Y/N') != 'Y': - text = input('Enter object name: ') - theta = -0.6 - demo.manipulate(text, theta) + if input('You want to run manipulation: y/n') == 'y': + text = input('Enter object name: ') + theta = -0.6 + demo.manipulate(text, theta) - if input('You want to run placing: y/n') == 'n': - continue - text = input('Enter receptacle name: ') - theta = -0.6 - demo.place(text, theta) + if input('You want to run placing: y/n') == 'y': + text = input('Enter receptacle name: ') + theta = -0.6 + demo.place(text, theta) if __name__ == "__main__": diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index ae936cb89..5417a6a73 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -38,6 +38,9 @@ def compute_tilt(camera_xyz, target_xyz): @click.option("--random-goals", default=False, is_flag=True) @click.option("--explore-iter", default=-1) @click.option("--re", default=1, type=int) +@click.option("--method", default='dynamem', type=str) +@click.option("--env", default=1, type=int) +@click.option("--test", default=1, type=int) @click.option( "--input-path", type=click.Path(), @@ -50,6 +53,9 @@ def main( navigate_home: bool = False, explore_iter: int = 5, re: int = 1, + method: str = 'dynamem', + env: int = 1, + test: int = 1, input_path: str = None, **kwargs, ): @@ -73,7 +79,7 @@ def main( print("- Start robot agent with data collection") demo = RobotAgentMDP( - robot, parameters, ip = ip, re = re + robot, parameters, ip = ip, re = re, env_num = env, test_num = test, method = method ) if input_path is None: @@ -117,13 +123,14 @@ def main( point = demo.navigate(text) if point is None: print('Navigation Failure!') + cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) + + if input('You want to run manipulation: y/n') != 'n': robot.switch_to_navigation_mode() xyt = robot.get_base_pose() xyt[2] = xyt[2] + np.pi / 2 robot.navigate_to(xyt, blocking = True) - cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) - if input('You want to run manipulation: y/n') != 'n': robot.switch_to_manipulation_mode() if text is None: text = input('Enter object name: ') @@ -143,13 +150,14 @@ def main( point = demo.navigate(text) if point is None: print('Navigation Failure') + cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) + + if input('You want to run placing: y/n') != 'n': robot.switch_to_navigation_mode() xyt = robot.get_base_pose() xyt[2] = xyt[2] + np.pi / 2 robot.navigate_to(xyt, blocking = True) - cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) - - if input('You want to run placing: y/n') != 'n': + robot.switch_to_manipulation_mode() if text is None: text = input('Enter receptacle name: ') diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index 987ffac15..e35a3cbe9 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -112,6 +112,7 @@ def add(self, rgb: Optional[Tensor], weights: Optional[Tensor] = None, obs_count: Optional[Tensor] = None, + # weight_decay: float = 0.95 ): points = points.to(self.device) if features is not None: @@ -120,6 +121,8 @@ def add(self, rgb = rgb.to(self.device) if weights is not None: weights = weights.to(self.device) + # if weight_decay is not None and self.voxel_pcd._weights is not None: + # self.voxel_pcd._weights *= weight_decay self.voxel_pcd.add(points = points, features = features, rgb = rgb, diff --git a/src/stretch/dynav/voxel_map_server.py 
b/src/stretch/dynav/voxel_map_server.py index 1ebb17e59..6ce7dbb1b 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -49,6 +49,7 @@ from stretch.dynav.communication_util import load_socket, send_array, recv_array, send_rgb_img, recv_rgb_img, send_depth_img, recv_depth_img, send_everything, recv_everything # from stretch.utils.morphology import get_edges +import torch.nn.functional as F def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) @@ -107,21 +108,26 @@ def get_xyz(depth, pose, intrinsics): class ImageProcessor: def __init__(self, - vision_method = 'mask&*lip', + vision_method = 'mask*lip', siglip = True, device = 'cuda', min_depth = 0.25, max_depth = 2.5, - img_port = 5555, + img_port = 5558, text_port = 5556, open_communication = True, rerun = True, - static = True + static = True, + log = None, + image_shape = (450, 350) ): self.static = static self.siglip = siglip current_datetime = datetime.datetime.now() - self.log = 'debug_' + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + if log is None: + self.log = 'debug_' + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + else: + self.log = log self.rerun = rerun if self.rerun: if self.static: @@ -147,6 +153,7 @@ def __init__(self, self.voxel_map_lock = threading.Lock() # Create a lock for synchronizing access to `self.voxel_map_localizer` self.traj = None + self.image_shape = image_shape if open_communication: self.img_socket = load_socket(img_port) @@ -244,9 +251,9 @@ def process_text(self, text, start_pose): if self.voxel_map_localizer.verify_point(text, traj_target_point): localized_point = traj_target_point debug_text += '## Last visual grounding results looks fine so directly use it.\n' - if self.planner.verify_path(self.traj[:-2]): - waypoints = self.traj[:-2] - debug_text += '## Last path planning results looks fine so directly use it.\n' + # if self.planner.verify_path(self.traj[:-2]): + # waypoints = self.traj[:-2] + # debug_text += '## Last path planning results looks fine so directly use it.\n' if waypoints is None: # Do visual grounding @@ -340,7 +347,7 @@ def process_text(self, text, start_pose): if idx != len(traj) - 1: origins.append([traj[idx][0], traj[idx][1], 1.5]) vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) - rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) + rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.1), static = self.static) rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) # self.write_to_pickle() @@ -568,7 +575,7 @@ def extract_mask_clip_features(self, x, image_shape): def run_mask_clip(self, rgb, mask, world_xyz): with torch.no_grad(): - results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.05, verbose=False) + results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False) xyxy_tensor = results[0].boxes.xyxy if len(xyxy_tensor) == 0: return @@ -600,7 +607,7 @@ def run_mask_clip(self, rgb, mask, world_xyz): def run_owl_sam_clip(self, rgb, mask, world_xyz): with torch.no_grad(): - results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.05, verbose=False) + results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.1, 
verbose=False) xyxy_tensor = results[0].boxes.xyxy if len(xyxy_tensor) == 0: return @@ -690,7 +697,6 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): if not os.path.exists(self.log): os.mkdir(self.log) self.obs_count += 1 - world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) cv2.imwrite(self.log + '/rgb' + str(self.obs_count) + '.jpg', rgb[:, :, [2, 1, 0]]) np.save(self.log + '/rgb' + str(self.obs_count) + '.npy', rgb) @@ -700,7 +706,18 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): rgb, depth = torch.from_numpy(rgb), torch.from_numpy(depth) rgb = rgb.permute(2, 0, 1).to(torch.uint8) + + h, w = self.image_shape + h_image, w_image = depth.shape + rgb = F.interpolate(rgb.unsqueeze(0), size=self.image_shape, mode='bilinear', align_corners=False).squeeze(0) + depth = F.interpolate(depth.unsqueeze(0).unsqueeze(0), size=self.image_shape, mode='bilinear', align_corners=False).squeeze(0).squeeze(0) + intrinsics[0, 0] *= w / w_image + intrinsics[1, 1] *= h / h_image + intrinsics[0, 2] *= w / w_image + intrinsics[1, 2] *= h / h_image + world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) + median_depth = torch.from_numpy( scipy.ndimage.median_filter(depth, size=5) ) @@ -873,15 +890,15 @@ def write_to_pickle(self): # def main(cfg): # torch.manual_seed(1) # imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) -# # imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) -# # if not cfg.pickle_file_name is None: -# # imageProcessor.read_from_pickle(cfg.pickle_file_name) -# # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) -# # if cfg.open_communication: -# # while True: -# # imageProcessor.recv_text() -# imageProcessor.read_from_pickle('debug/debug_2024-09-09_07-50-41.pkl', -1) -# imageProcessor.write_to_pickle() + # imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) + # if not cfg.pickle_file_name is None: + # imageProcessor.read_from_pickle(cfg.pickle_file_name) + # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) + # if cfg.open_communication: + # while True: + # imageProcessor.recv_text() + # imageProcessor.read_from_pickle('env.pkl', -1) + # imageProcessor.write_to_pickle() if __name__ == "__main__": - main(None) \ No newline at end of file + main(None) From 37639fa1f05449c63175e247ddd071ce83d09bfc Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 11:22:35 -0400 Subject: [PATCH 213/410] Run pre-commit hooks and fix formatting; many errors remain --- src/stretch/agent/zmq_client.py | 35 +- src/stretch/dynav/__init__.py | 11 +- src/stretch/dynav/communication_util.py | 55 +- src/stretch/dynav/llm_localizer.py | 270 +++++--- src/stretch/dynav/llm_server.py | 426 +++++++----- src/stretch/dynav/mapping_utils/__init__.py | 16 +- src/stretch/dynav/mapping_utils/a_star.py | 66 +- src/stretch/dynav/mapping_utils/voxel.py | 125 ++-- src/stretch/dynav/mapping_utils/voxel_map.py | 148 ++-- .../dynav/mapping_utils/voxelized_pcd.py | 153 +++-- src/stretch/dynav/ok_robot_hw/README.md | 27 +- src/stretch/dynav/ok_robot_hw/camera.py | 37 +- .../dynav/ok_robot_hw/global_parameters.py | 13 +- .../dynav/ok_robot_hw/image_publisher.py | 41 +- src/stretch/dynav/ok_robot_hw/robot.py | 233 ++++--- .../dynav/ok_robot_hw/utils/__init__.py | 13 +- .../ok_robot_hw/utils/communication_utils.py | 42 +- .../dynav/ok_robot_hw/utils/grasper_utils.py | 191 +++--- 
src/stretch/dynav/ok_robot_hw/utils/utils.py | 19 +- src/stretch/dynav/robot_agent_manip_mdp.py | 299 ++++---- src/stretch/dynav/run_manip.py | 43 +- src/stretch/dynav/run_ok_nav.py | 88 +-- src/stretch/dynav/scannet.py | 424 ++++++------ src/stretch/dynav/voxel_map_localizer.py | 189 ++++-- src/stretch/dynav/voxel_map_server.py | 636 +++++++++++------- src/stretch/motion/kinematics.py | 4 +- src/stretch/motion/pinocchio_ik_solver.py | 18 +- src/stretch/visualization/rerun.py | 60 +- 28 files changed, 2167 insertions(+), 1515 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index d2fd6958c..a6a2214b1 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -29,15 +29,12 @@ from stretch.core.robot import AbstractRobotClient from stretch.motion import PlanResult, RobotModel from stretch.motion.kinematics import HelloStretchIdx, HelloStretchKinematics -from stretch.utils.geometry import angle_difference +from stretch.utils.geometry import angle_difference, posquat2sophus, sophus2posquat from stretch.utils.image import Camera from stretch.utils.memory import lookup_address from stretch.utils.point_cloud import show_point_cloud from stretch.visualization.rerun import RerunVsualizer -from scipy.spatial.transform import Rotation -from stretch.utils.geometry import posquat2sophus, sophus2posquat - # TODO: debug code - remove later if necessary # import faulthandler # faulthandler.enable() @@ -250,8 +247,8 @@ def get_joint_positions(self, timeout: float = 5.0) -> np.ndarray: def get_six_joints(self, timeout: float = 5.0) -> np.ndarray: """Get the six major joint positions""" - joint_positions = self.get_joint_positions(timeout = timeout) - return np.array(self._extract_joint_pos(joint_positions)) + joint_positions = self.get_joint_positions(timeout=timeout) + return np.array(self._extract_joint_pos(joint_positions)) def get_joint_velocities(self, timeout: float = 5.0) -> np.ndarray: """Get the current joint velocities""" @@ -308,10 +305,10 @@ def get_gripper_position(self): joint_state = self.get_joint_positions() return joint_state[HelloStretchIdx.GRIPPER] - def get_ee_pose(self, matrix=False, link_name=None, q = None): + def get_ee_pose(self, matrix=False, link_name=None, q=None): if q is None: q = self.get_joint_positions() - pos, quat = self._robot_model.manip_fk(q, node = link_name) + pos, quat = self._robot_model.manip_fk(q, node=link_name) if matrix: pose = posquat2sophus(pos, quat) @@ -329,7 +326,7 @@ def solve_ik( relative: bool = False, initial_cfg: np.ndarray = None, debug: bool = False, - node_name = None + node_name=None, ) -> Optional[np.ndarray]: """Solve inverse kinematics appropriately (or at least try to) and get the joint position that we will be moving to. 
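Editor's note: the zmq_client.py hunks above only reformat helper methods (get_six_joints, get_ee_pose, look_front, look_at_ee, solve_ik); behavior is unchanged. A minimal usage sketch under stated assumptions: the client object `robot` is assumed to be constructed and connected elsewhere, and solve_ik is assumed to take its goal as (pos, quat), which is not visible in these hunks.

import numpy as np

# Hypothetical sketch: `robot` is an assumed, already-connected zmq client instance.
robot.look_at_ee(blocking=True)                    # point the head camera at the arm
pos, quat = robot.get_ee_pose()                    # matrix=False: position and quaternion of the EE
goal = pos + np.array([0.0, 0.0, 0.05])            # hypothetical goal 5 cm above the current EE pose
cfg = robot.solve_ik(goal, quat, initial_cfg=robot.get_joint_positions())  # (pos, quat) order assumed
if cfg is None:
    print("IK failed; solve_ik returns None when no solution is found")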
@@ -369,7 +366,7 @@ def solve_ik( # Perform IK full_body_cfg, ik_success, ik_debug_info = self._robot_model.manip_ik( - (pos_ik_goal, quat_ik_goal), q0=initial_cfg, node_name = node_name + (pos_ik_goal, quat_ik_goal), q0=initial_cfg, node_name=node_name ) # Expected to return None if we did not get a solution @@ -422,11 +419,15 @@ def head_to( def look_front(self, blocking: bool = True, timeout: float = 10.0): """Let robot look to its front.""" - self.head_to(constants.look_front[0], constants.look_front[1], blocking = blocking, timeout = timeout) + self.head_to( + constants.look_front[0], constants.look_front[1], blocking=blocking, timeout=timeout + ) def look_at_ee(self, blocking: bool = True, timeout: float = 10.0): """Let robot look to its arm.""" - self.head_to(constants.look_at_ee[0], constants.look_at_ee[1], blocking = blocking, timeout = timeout) + self.head_to( + constants.look_at_ee[0], constants.look_at_ee[1], blocking=blocking, timeout=timeout + ) def arm_to( self, @@ -579,9 +580,7 @@ def navigate_to( self._rerun.update_nav_goal(xyt) self.send_action(next_action, timeout=timeout, verbose=verbose) - def set_velocity( - self, v: float, w: float - ): + def set_velocity(self, v: float, w: float): """Move to xyt in global coordinates or relative coordinates.""" next_action = {"v": v, "w": w} self.send_action(next_action) @@ -957,8 +956,8 @@ def get_observation(self): observation.camera_pose = self._obs.get("camera_pose", None) observation.seq_id = self._seq_id return observation - - def get_images(self, compute_xyz = False): + + def get_images(self, compute_xyz=False): obs = self.get_observation() if compute_xyz: return obs.rgb, obs.depth, obs.xyz @@ -983,7 +982,7 @@ def execute_trajectory( per_waypoint_timeout: float = 10.0, final_timeout: float = 10.0, relative: bool = False, - blocking: bool = False + blocking: bool = False, ): """Execute a multi-step trajectory; this is always blocking since it waits to reach each one in turn.""" diff --git a/src/stretch/dynav/__init__.py b/src/stretch/dynav/__init__.py index 3448d7994..770e2711e 100644 --- a/src/stretch/dynav/__init__.py +++ b/src/stretch/dynav/__init__.py @@ -1 +1,10 @@ -from .robot_agent_manip_mdp import RobotAgentMDP \ No newline at end of file +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +from .robot_agent_manip_mdp import RobotAgentMDP diff --git a/src/stretch/dynav/communication_util.py b/src/stretch/dynav/communication_util.py index 7680ed147..aa5d5b192 100644 --- a/src/stretch/dynav/communication_util.py +++ b/src/stretch/dynav/communication_util.py @@ -1,6 +1,16 @@ -import zmq -import numpy as np +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
+ import cv2 +import numpy as np +import zmq + def load_socket(port_number): context = zmq.Context() @@ -9,48 +19,58 @@ def load_socket(port_number): return socket + def send_array(socket, A, flags=0, copy=True, track=False): """send a numpy array with metadata""" A = np.array(A) md = dict( - dtype = str(A.dtype), - shape = A.shape, + dtype=str(A.dtype), + shape=A.shape, ) - socket.send_json(md, flags|zmq.SNDMORE) + socket.send_json(md, flags | zmq.SNDMORE) return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + def recv_array(socket, flags=0, copy=True, track=False): """recv a numpy array""" md = socket.recv_json(flags=flags) msg = socket.recv(flags=flags, copy=copy, track=track) - A = np.frombuffer(msg, dtype=md['dtype']) - return A.reshape(md['shape']) + A = np.frombuffer(msg, dtype=md["dtype"]) + return A.reshape(md["shape"]) + def send_rgb_img(socket, img): - img = img.astype(np.uint8) + img = img.astype(np.uint8) encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] - _, img_encoded = cv2.imencode('.jpg', img, encode_param) + _, img_encoded = cv2.imencode(".jpg", img, encode_param) socket.send(img_encoded.tobytes()) + def recv_rgb_img(socket): img = socket.recv() img = np.frombuffer(img, dtype=np.uint8) img = cv2.imdecode(img, cv2.IMREAD_COLOR) return img + def send_depth_img(socket, depth_img): depth_img = (depth_img * 1000).astype(np.uint16) - encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 3] # Compression level from 0 (no compression) to 9 (max compression) - _, depth_img_encoded = cv2.imencode('.png', depth_img, encode_param) + encode_param = [ + int(cv2.IMWRITE_PNG_COMPRESSION), + 3, + ] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode(".png", depth_img, encode_param) socket.send(depth_img_encoded.tobytes()) + def recv_depth_img(socket): depth_img = socket.recv() depth_img = np.frombuffer(depth_img, dtype=np.uint8) depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) - depth_img = (depth_img / 1000.) + depth_img = depth_img / 1000.0 return depth_img + def send_everything(socket, rgb, depth, intrinsics, pose): send_rgb_img(socket, rgb) socket.recv_string() @@ -61,13 +81,14 @@ def send_everything(socket, rgb, depth, intrinsics, pose): send_array(socket, pose) socket.recv_string() + def recv_everything(socket): rgb = recv_rgb_img(socket) - socket.send_string('') + socket.send_string("") depth = recv_depth_img(socket) - socket.send_string('') + socket.send_string("") intrinsics = recv_array(socket) - socket.send_string('') + socket.send_string("") pose = recv_array(socket) - socket.send_string('') - return rgb, depth, intrinsics, pose \ No newline at end of file + socket.send_string("") + return rgb, depth, intrinsics, pose diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py index 4dd767e20..d99168f5d 100644 --- a/src/stretch/dynav/llm_localizer.py +++ b/src/stretch/dynav/llm_localizer.py @@ -1,27 +1,32 @@ -from io import BytesIO -from concurrent.futures import ThreadPoolExecutor, as_completed -import time +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
+ +import base64 import os +import time +from collections import OrderedDict +from concurrent.futures import ThreadPoolExecutor, as_completed +from io import BytesIO +from typing import Optional -import torch +import google.generativeai as genai import numpy as np +import torch +from openai import OpenAI from PIL import Image - -from typing import Optional from torch import Tensor +from transformers import AutoProcessor, Owlv2ForObjectDetection -from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates from stretch.dynav.mapping_utils import VoxelizedPointcloud +from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates -from transformers import AutoProcessor -from transformers import Owlv2ForObjectDetection - -import google.generativeai as genai -from openai import OpenAI -import base64 -from collections import OrderedDict - -genai.configure(api_key=os.getenv('GOOGLE_API_KEY')) +genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) generation_config = genai.GenerationConfig(temperature=0) safety_settings = [ { @@ -46,9 +51,15 @@ }, ] + def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) - fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) inv_intrinsics = torch.zeros_like(intrinsics) inv_intrinsics[..., 0, 0] = 1.0 / fx inv_intrinsics[..., 1, 1] = 1.0 / fy @@ -57,6 +68,7 @@ def get_inv_intrinsics(intrinsics): inv_intrinsics[..., 2, 2] = 1.0 return inv_intrinsics + def get_xyz(depth, pose, intrinsics): """Returns the XYZ coordinates for a set of points. @@ -96,52 +108,57 @@ def get_xyz(depth, pose, intrinsics): xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) xyz = xyz * depth.flatten(1).unsqueeze(-1) xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] - + xyz = xyz.unflatten(1, (height, width)) return xyz -class LLM_Localizer(): - def __init__(self, voxel_map_wrapper = None, exist_model = 'gemini-1.5-pro', loc_model = 'owlv2', device = 'cuda'): +class LLM_Localizer: + def __init__( + self, voxel_map_wrapper=None, exist_model="gemini-1.5-pro", loc_model="owlv2", device="cuda" + ): self.voxel_map_wrapper = voxel_map_wrapper self.device = device self.voxel_pcd = VoxelizedPointcloud(voxel_size=0.2).to(self.device) self.existence_checking_model = exist_model - - self.api_key_1 = os.getenv('GOOGLE_API_KEY') - self.api_key_2 = os.getenv('GOOGLE_API_KEY_2') - self.api_key_3 = os.getenv('GOOGLE_API_KEY_3') - self.api_key = self.api_key_1 + self.api_key_1 = os.getenv("GOOGLE_API_KEY") + self.api_key_2 = os.getenv("GOOGLE_API_KEY_2") + self.api_key_3 = os.getenv("GOOGLE_API_KEY_3") + self.api_key = self.api_key_1 self.context_length = 60 self.count_threshold = 3 - if 'gpt' in self.existence_checking_model: + if "gpt" in self.existence_checking_model: self.max_img_per_request = 30 else: self.max_img_per_request = 200 + if exist_model == "gpt-4o": + print("WE ARE USING OPENAI GPT4o") + self.gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + elif exist_model == "gemini-1.5-pro": + print("WE ARE USING GEMINI 1.5 PRO") - if exist_model == 'gpt-4o': - print('WE ARE USING OPENAI GPT4o') - self.gpt_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY')) - elif exist_model == 'gemini-1.5-pro': - print('WE ARE USING GEMINI 1.5 PRO') - - elif exist_model == 'gemini-1.5-flash': - 
print('WE ARE USING GEMINI 1.5 FLASH') + elif exist_model == "gemini-1.5-flash": + print("WE ARE USING GEMINI 1.5 FLASH") else: - print('YOU ARE USING NOTHING!') + print("YOU ARE USING NOTHING!") self.location_checking_model = loc_model - if loc_model == 'owlv2': - self.exist_processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") - self.exist_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(self.device) - print('WE ARE USING OWLV2 FOR LOCALIZATION!') + if loc_model == "owlv2": + self.exist_processor = AutoProcessor.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ) + self.exist_model = Owlv2ForObjectDetection.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ).to(self.device) + print("WE ARE USING OWLV2 FOR LOCALIZATION!") else: - print('YOU ARE USING VOXEL MAP FOR LOCALIZATION!') - - def add(self, + print("YOU ARE USING VOXEL MAP FOR LOCALIZATION!") + + def add( + self, points: Tensor, features: Optional[Tensor], rgb: Optional[Tensor], @@ -155,35 +172,42 @@ def add(self, rgb = rgb.to(self.device) if weights is not None: weights = weights.to(self.device) - self.voxel_pcd.add(points = points, - features = features, - rgb = rgb, - weights = weights, - obs_count = obs_count) - - def compute_coord(self, text, image_info, threshold = 0.2): - rgb = image_info['image'] + self.voxel_pcd.add( + points=points, features=features, rgb=rgb, weights=weights, obs_count=obs_count + ) + + def compute_coord(self, text, image_info, threshold=0.2): + rgb = image_info["image"] inputs = self.exist_processor(text=text, images=rgb, return_tensors="pt") for input in inputs: - inputs[input] = inputs[input].to('cuda') - + inputs[input] = inputs[input].to("cuda") + with torch.no_grad(): outputs = self.exist_model(**inputs) - + target_sizes = torch.Tensor([rgb.size[::-1]]).to(self.device) results = self.exist_processor.image_processor.post_process_object_detection( outputs, threshold=threshold, target_sizes=target_sizes )[0] - depth = image_info['depth'] - xyzs = image_info['xyz'] + depth = image_info["depth"] + xyzs = image_info["xyz"] temp_lst = [] - for idx, (score, bbox) in enumerate(sorted(zip(results['scores'], results['boxes']), key=lambda x: x[0], reverse=True)): - + for idx, (score, bbox) in enumerate( + sorted(zip(results["scores"], results["boxes"]), key=lambda x: x[0], reverse=True) + ): + tl_x, tl_y, br_x, br_y = bbox w, h = depth.shape - tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) - if np.median(depth[tl_y: br_y, tl_x: br_x].reshape(-1)) < 3: - coordinate = torch.from_numpy(np.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), axis = 0)) + tl_x, tl_y, br_x, br_y = ( + int(max(0, tl_x.item())), + int(max(0, tl_y.item())), + int(min(h, br_x.item())), + int(min(w, br_y.item())), + ) + if np.median(depth[tl_y:br_y, tl_x:br_x].reshape(-1)) < 3: + coordinate = torch.from_numpy( + np.median(xyzs[tl_y:br_y, tl_x:br_x].reshape(-1, 3), axis=0) + ) # temp_lst.append(coordinate) return coordinate return None @@ -198,9 +222,13 @@ def owl_locater(self, A, encoded_image, timestamps_lst): image_info = encoded_image[i][-1] res = self.compute_coord(A, image_info, threshold=0.2) if res is not None: - debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' + debug_text = ( + "#### - Obejct is detected in observations where instance" + + str(i + 1) + + " comes from. 
**😃** Directly navigate to it.\n" + ) return res, debug_text, i, None - debug_text = '#### - All instances are not the target! Maybe target object has not been observed yet. **😭**\n' + debug_text = "#### - All instances are not the target! Maybe target object has not been observed yet. **😭**\n" # if query_coord_lst != []: # query_coord_lst = np.array(query_coord_lst) @@ -218,28 +246,39 @@ def owl_locater(self, A, encoded_image, timestamps_lst): # centroid = largest_cluster_points.mean(axis=0) # return centroid, debug_text, None, None return None, debug_text, None, None - + def process_chunk(self, chunk, sys_prompt, user_prompt): for i in range(50): try: - if 'gpt' in self.existence_checking_model: + if "gpt" in self.existence_checking_model: start_time = time.time() - response = self.gpt_client.chat.completions.create( - model=self.existence_checking_model, - messages=[ - {"role": "system", "content": sys_prompt}, - {"role": "user", "content": user_prompt}, - {"role": "user", "content": chunk} + response = ( + self.gpt_client.chat.completions.create( + model=self.existence_checking_model, + messages=[ + {"role": "system", "content": sys_prompt}, + {"role": "user", "content": user_prompt}, + {"role": "user", "content": chunk}, ], temperature=0.0, - ).choices[0].message.content - + ) + .choices[0] + .message.content + ) + end_time = time.time() - print('GPT request cost:', end_time-start_time) + print("GPT request cost:", end_time - start_time) else: - model = genai.GenerativeModel(model_name=f"models/{self.existence_checking_model}-exp-0827", system_instruction=sys_prompt) + model = genai.GenerativeModel( + model_name=f"models/{self.existence_checking_model}-exp-0827", + system_instruction=sys_prompt, + ) # "models/{self.existence_checking_model}-exp-0827" - response = model.generate_content(chunk + [user_prompt], generation_config=generation_config, safety_settings=safety_settings).text + response = model.generate_content( + chunk + [user_prompt], + generation_config=generation_config, + safety_settings=safety_settings, + ).text # print("Assistant: ", response) return response except Exception as e: @@ -256,17 +295,17 @@ def process_chunk(self, chunk, sys_prompt, user_prompt): return "Execution Failed" def update_dict(self, A, timestamps_dict, content): - timestamps_lst = content.split('\n') + timestamps_lst = content.split("\n") for item in timestamps_lst: - if len(item) < 3 or ':' not in item: + if len(item) < 3 or ":" not in item: continue - key, value_str = item.split(':') + key, value_str = item.split(":") if A not in key: continue - if 'None' in value_str: + if "None" in value_str: value = None else: - value = list(map(int, value_str.replace(' ', '').split(','))) + value = list(map(int, value_str.replace(" ", "").split(","))) if key in timestamps_dict: if timestamps_dict[key] is None: timestamps_dict[key] = value @@ -277,7 +316,7 @@ def update_dict(self, A, timestamps_dict, content): def llm_locator(self, A, encoded_image): timestamps_dict = {} - + sys_prompt = f""" For object query I give, you need to find timestamps of images that the object is shown, without any unnecessary explanation or space. If the object never exist, please output the object name and the word "None" for timestamps. 
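Editor's note: the prompt above asks the model to reply with one "object: timestamps" line per query, and update_dict above parses those lines by splitting on ':' and ','. A minimal sketch of that round trip, using a made-up response string:

# Hypothetical illustration of the expected reply format and the parsing done in update_dict.
response = "sofa: 22,25,30\nbanana: None"
parsed = {}
for line in response.split("\n"):
    key, value_str = line.split(":")
    parsed[key.strip()] = (
        None if "None" in value_str else list(map(int, value_str.replace(" ", "").split(",")))
    )
print(parsed)  # {'sofa': [22, 25, 30], 'banana': None}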
@@ -291,16 +330,26 @@ def llm_locator(self, A, encoded_image): white box: 2,4,10""" user_prompt = f"""The object you need to find is {A}""" - if 'gpt' in self.existence_checking_model: - content = [item for sublist in list(encoded_image.values()) for item in sublist[:2]][-self.context_length*2:] + if "gpt" in self.existence_checking_model: + content = [item for sublist in list(encoded_image.values()) for item in sublist[:2]][ + -self.context_length * 2 : + ] else: - content = [item for sublist in list(encoded_image.values()) for item in sublist[0]][-self.context_length*2:] - - content_chunks = [content[i:i + 2* self.max_img_per_request] for i in range(0, len(content), 2* self.max_img_per_request)] - + content = [item for sublist in list(encoded_image.values()) for item in sublist[0]][ + -self.context_length * 2 : + ] + + content_chunks = [ + content[i : i + 2 * self.max_img_per_request] + for i in range(0, len(content), 2 * self.max_img_per_request) + ] + with ThreadPoolExecutor(max_workers=5) as executor: - future_to_chunk = {executor.submit(self.process_chunk, chunk, sys_prompt, user_prompt): chunk for chunk in content_chunks} - + future_to_chunk = { + executor.submit(self.process_chunk, chunk, sys_prompt, user_prompt): chunk + for chunk in content_chunks + } + for future in as_completed(future_to_chunk): chunk = future_to_chunk[future] try: @@ -310,23 +359,22 @@ def llm_locator(self, A, encoded_image): except Exception as e: print(f"Exception occurred: {e}") if A not in timestamps_dict: - return None, 'debug_text', None, None + return None, "debug_text", None, None timestamps_lst = timestamps_dict[A] if timestamps_lst is None: - return None, 'debug_text', None, None + return None, "debug_text", None, None timestamps_lst = sorted(timestamps_lst, reverse=True) # return None return self.owl_locater(A, encoded_image, timestamps_lst) - - def localize_A(self, A, debug = True, return_debug = False): + def localize_A(self, A, debug=True, return_debug=False): encoded_image = OrderedDict() counts = torch.bincount(self.voxel_pcd._obs_counts) filtered_obs = (counts > self.count_threshold).nonzero(as_tuple=True)[0].tolist() filtered_obs = sorted(filtered_obs) - for obs_id in filtered_obs: + for obs_id in filtered_obs: obs_id -= 1 rgb = np.copy(self.voxel_map_wrapper.observations[obs_id].rgb.numpy()) depth = self.voxel_map_wrapper.observations[obs_id].depth @@ -337,28 +385,38 @@ def localize_A(self, A, debug = True, return_debug = False): rgb[depth > 2.5] = [0, 0, 0] - image = Image.fromarray(rgb.astype(np.uint8), mode='RGB') - if 'gemini' in self.existence_checking_model: - encoded_image[obs_id] = [[f"Following is the image took on timestamp {obs_id}: ", image], {'image':image, 'xyz': xyz, 'depth':depth}] - elif 'gpt' in self.existence_checking_model: + image = Image.fromarray(rgb.astype(np.uint8), mode="RGB") + if "gemini" in self.existence_checking_model: + encoded_image[obs_id] = [ + [f"Following is the image took on timestamp {obs_id}: ", image], + {"image": image, "xyz": xyz, "depth": depth}, + ] + elif "gpt" in self.existence_checking_model: buffered = BytesIO() image.save(buffered, format="PNG") img_bytes = buffered.getvalue() - base64_encoded = base64.b64encode(img_bytes).decode('utf-8') - encoded_image[obs_id] = [{"type": "text", "text": f"Following is the image took on timestamp {obs_id}: "}, - {"type": "image_url", "image_url": { - "url": f"data:image/png;base64,{base64_encoded}"} - }, {'image':image, 'xyz':xyz, 'depth':depth}] + base64_encoded = 
base64.b64encode(img_bytes).decode("utf-8") + encoded_image[obs_id] = [ + { + "type": "text", + "text": f"Following is the image took on timestamp {obs_id}: ", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{base64_encoded}"}, + }, + {"image": image, "xyz": xyz, "depth": depth}, + ] # print(obs_id) - + start_time = time.time() target_point, debug_text, obs, point = self.llm_locator(A, encoded_image) end_time = time.time() - print('It takes', end_time - start_time) + print("It takes", end_time - start_time) if not debug: return target_point elif not return_debug: return target_point, debug_text else: - return target_point, debug_text, obs, point \ No newline at end of file + return target_point, debug_text, obs, point diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py index de83a4a6d..eaec0c5c4 100644 --- a/src/stretch/dynav/llm_server.py +++ b/src/stretch/dynav/llm_server.py @@ -1,52 +1,69 @@ -import zmq +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. -from stretch.dynav.scannet import CLASS_LABELS_200 +import datetime +import os +import pickle +import threading +# import wget +import time +from collections import OrderedDict +from io import BytesIO +from pathlib import Path + +import clip import cv2 +import hydra import numpy as np +import open3d as o3d +import rerun as rr +import scipy import torch import torch.nn.functional as F import torchvision.transforms.functional as V - -import clip +import zmq +from matplotlib import pyplot as plt +from PIL import Image from torchvision import transforms +from transformers import AutoModel, AutoProcessor -import os -# import wget -import time - -import open3d as o3d - -from matplotlib import pyplot as plt -import pickle -from pathlib import Path -from stretch.dynav.llm_localizer import LLM_Localizer from stretch.core import get_parameters +from stretch.dynav.communication_util import ( + load_socket, + recv_array, + recv_depth_img, + recv_everything, + recv_rgb_img, + send_array, + send_depth_img, + send_everything, + send_rgb_img, +) +from stretch.dynav.llm_localizer import LLM_Localizer from stretch.dynav.mapping_utils import ( - SparseVoxelMap, - SparseVoxelMapNavigationSpace, AStar, - VoxelizedPointcloud + SparseVoxelMap, + SparseVoxelMapNavigationSpace, + VoxelizedPointcloud, ) +from stretch.dynav.scannet import CLASS_LABELS_200 -import datetime - -import threading -import scipy -import hydra - -from transformers import AutoProcessor, AutoModel -import rerun as rr - -from io import BytesIO -from PIL import Image - -from stretch.dynav.communication_util import load_socket, send_array, recv_array, send_rgb_img, recv_rgb_img, send_depth_img, recv_depth_img, send_everything, recv_everything -from collections import OrderedDict def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) - fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) inv_intrinsics = torch.zeros_like(intrinsics) inv_intrinsics[..., 0, 0] = 1.0 / fx inv_intrinsics[..., 1, 1] = 1.0 / fy @@ -55,6 +72,7 @@ def 
get_inv_intrinsics(intrinsics): inv_intrinsics[..., 2, 2] = 1.0 return inv_intrinsics + def get_xyz(depth, pose, intrinsics): """Returns the XYZ coordinates for a set of points. @@ -94,35 +112,37 @@ def get_xyz(depth, pose, intrinsics): xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) xyz = xyz * depth.flatten(1).unsqueeze(-1) xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] - + xyz = xyz.unflatten(1, (height, width)) return xyz + class ImageProcessor: - def __init__(self, - vision_method = 'pro_owl', - siglip = True, - device = 'cuda', - min_depth = 0.25, - max_depth = 2.5, - img_port = 5555, - text_port = 5556, - open_communication = True, - rerun = True, - static = True, - log = None + def __init__( + self, + vision_method="pro_owl", + siglip=True, + device="cuda", + min_depth=0.25, + max_depth=2.5, + img_port=5555, + text_port=5556, + open_communication=True, + rerun=True, + static=True, + log=None, ): self.static = static self.siglip = siglip current_datetime = datetime.datetime.now() if log is None: - self.log = 'debug_' + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + self.log = "debug_" + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") else: self.log = log self.rerun = rerun if self.rerun: - rr.init(self.log, spawn = True) + rr.init(self.log, spawn=True) # if self.static: # rr.init(self.log, spawn = False) # rr.connect('100.108.67.79:9876') @@ -137,14 +157,16 @@ def __init__(self, self.vision_method = vision_method # If cuda is not available, then device will be forced to be cpu if not torch.cuda.is_available(): - device = 'cpu' + device = "cpu" self.device = device self.create_obstacle_map() self.create_vision_model() - self.voxel_map_lock = threading.Lock() # Create a lock for synchronizing access to `self.voxel_map_localizer` - + self.voxel_map_lock = ( + threading.Lock() + ) # Create a lock for synchronizing access to `self.voxel_map_localizer` + if open_communication: self.img_socket = load_socket(img_port) self.text_socket = load_socket(text_port) @@ -152,7 +174,7 @@ def __init__(self, self.img_thread = threading.Thread(target=self._recv_image) self.img_thread.daemon = True self.img_thread.start() - + def create_obstacle_map(self): print("- Load parameters") parameters = get_parameters("dynav_config.yaml") @@ -162,29 +184,21 @@ def create_obstacle_map(self): local_radius=parameters["local_radius"], obs_min_height=parameters["obs_min_height"], obs_max_height=parameters["obs_max_height"], - obs_min_density = parameters["obs_min_density"], - exp_min_density = parameters["exp_min_density"], + obs_min_density=parameters["obs_min_density"], + exp_min_density=parameters["exp_min_density"], min_depth=self.min_depth, max_depth=self.max_depth, pad_obstacles=parameters["pad_obstacles"], - add_local_radius_points=parameters.get( - "add_local_radius_points", default=True - ), + add_local_radius_points=parameters.get("add_local_radius_points", default=True), remove_visited_from_obstacles=parameters.get( "remove_visited_from_obstacles", default=False ), smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), use_median_filter=parameters.get("filters/use_median_filter", False), median_filter_size=parameters.get("filters/median_filter_size", 5), - median_filter_max_error=parameters.get( - "filters/median_filter_max_error", 0.01 - ), - use_derivative_filter=parameters.get( - "filters/use_derivative_filter", False - ), - derivative_filter_threshold=parameters.get( - "filters/derivative_filter_threshold", 0.5 - ) + 
median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), + use_derivative_filter=parameters.get("filters/use_derivative_filter", False), + derivative_filter_threshold=parameters.get("filters/derivative_filter_threshold", 0.5), ) self.space = SparseVoxelMapNavigationSpace( self.voxel_map, @@ -200,42 +214,69 @@ def create_obstacle_map(self): self.planner = AStar(self.space) def create_vision_model(self): - if self.vision_method == 'gpt_owl': - self.voxel_map_localizer = LLM_Localizer(self.voxel_map, exist_model = 'gpt-4o', loc_model = 'owlv2', device = self.device) - elif self.vision_method == 'flash_owl': - self.voxel_map_localizer = LLM_Localizer(self.voxel_map, exist_model = 'gemini-1.5-flash', loc_model = 'owlv2', device = self.device) - elif self.vision_method == 'pro_owl': - self.voxel_map_localizer = LLM_Localizer(self.voxel_map, exist_model = 'gemini-1.5-pro', loc_model = 'owlv2', device = self.device) + if self.vision_method == "gpt_owl": + self.voxel_map_localizer = LLM_Localizer( + self.voxel_map, exist_model="gpt-4o", loc_model="owlv2", device=self.device + ) + elif self.vision_method == "flash_owl": + self.voxel_map_localizer = LLM_Localizer( + self.voxel_map, + exist_model="gemini-1.5-flash", + loc_model="owlv2", + device=self.device, + ) + elif self.vision_method == "pro_owl": + self.voxel_map_localizer = LLM_Localizer( + self.voxel_map, exist_model="gemini-1.5-pro", loc_model="owlv2", device=self.device + ) def process_text(self, text, start_pose): if self.rerun: - rr.log('/object', rr.Clear(recursive = True), static = self.static) - rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) - rr.log('/direction', rr.Clear(recursive = True), static = self.static) - rr.log('robot_monologue', rr.Clear(recursive = True), static = self.static) - rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) + rr.log("/object", rr.Clear(recursive=True), static=self.static) + rr.log("/robot_start_pose", rr.Clear(recursive=True), static=self.static) + rr.log("/direction", rr.Clear(recursive=True), static=self.static) + rr.log("robot_monologue", rr.Clear(recursive=True), static=self.static) + rr.log( + "/Past_observation_most_similar_to_text", + rr.Clear(recursive=True), + static=self.static, + ) if not self.static: - rr.connect('100.108.67.79:9876') - debug_text = '' - mode = 'navigation' + rr.connect("100.108.67.79:9876") + debug_text = "" + mode = "navigation" obs = None # Do visual grounding - if text is not None and text != '': + if text is not None and text != "": with self.voxel_map_lock: - localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) + localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A( + text, debug=True, return_debug=True + ) if localized_point is not None: - rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) + rr.log( + "/object", + rr.Points3D( + [localized_point[0], localized_point[1], 1.5], + colors=torch.Tensor([1, 0, 0]), + radii=0.1, + ), + static=self.static, + ) # Do Frontier based exploration - if text is None or text == '' or localized_point is None: - debug_text += '## Navigation fails, so robot starts exploring environments.\n' + if text is None or text == "" or localized_point is None: + debug_text += "## Navigation fails, so robot starts exploring environments.\n" localized_point = 
self.sample_frontier(start_pose, text) - mode = 'exploration' - rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) - print('\n', localized_point, '\n') + mode = "exploration" + rr.log( + "/object", + rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), + static=self.static, + ) + print("\n", localized_point, "\n") if localized_point is None: return [] - + if len(localized_point) == 2: localized_point = np.array([localized_point[0], localized_point[1], 0]) @@ -243,48 +284,52 @@ def process_text(self, text, start_pose): if self.rerun: buf = BytesIO() - plt.savefig(buf, format='png') + plt.savefig(buf, format="png") buf.seek(0) img = Image.open(buf) img = np.array(img) buf.close() - rr.log('2d_map', rr.Image(img), static = self.static) + rr.log("2d_map", rr.Image(img), static=self.static) else: - if text != '': - plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') + if text != "": + plt.savefig(self.log + "/debug_" + text + str(self.obs_count) + ".png") else: - plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') + plt.savefig(self.log + "/debug_exploration" + str(self.obs_count) + ".png") plt.clf() if self.rerun: - if text is not None and text != '': - debug_text = '### The goal is to navigate to ' + text + '.\n' + debug_text + if text is not None and text != "": + debug_text = "### The goal is to navigate to " + text + ".\n" + debug_text else: - debug_text = '### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n' - debug_text = '# Robot\'s monologue: \n' + debug_text - rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) + debug_text = "### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n" + debug_text = "# Robot's monologue: \n" + debug_text + rr.log( + "robot_monologue", + rr.TextDocument(debug_text, media_type=rr.MediaType.MARKDOWN), + static=self.static, + ) - if obs is not None and mode == 'navigation': + if obs is not None and mode == "navigation": rgb = self.voxel_map.observations[obs].rgb if not self.rerun: if isinstance(rgb, torch.Tensor): rgb = np.array(rgb) - cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) + cv2.imwrite(self.log + "/debug_" + text + ".png", rgb[:, :, [2, 1, 0]]) else: - rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) + rr.log("/Past_observation_most_similar_to_text", rr.Image(rgb), static=self.static) traj = [] waypoints = None if point is None: - print('Unable to find any target point, some exception might happen') + print("Unable to find any target point, some exception might happen") else: - print('Target point is', point) + print("Target point is", point) res = self.planner.plan(start_pose, point) if res.success: waypoints = [pt.state for pt in res.trajectory] - # If we are navigating to some object of interst, send (x, y, z) of + # If we are navigating to some object of interst, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation - finished = len(waypoints) <= 10 and mode == 'navigation' + finished = len(waypoints) <= 10 and mode == "navigation" if not finished: waypoints = waypoints[:8] traj = self.planner.clean_path_for_xy(waypoints) @@ -294,19 +339,33 @@ def process_text(self, text, start_pose): if 
isinstance(localized_point, torch.Tensor): localized_point = localized_point.tolist() traj.append(localized_point) - print('Planned trajectory:', traj) + print("Planned trajectory:", traj) else: - print('[FAILURE]', res.reason) - + print("[FAILURE]", res.reason) + if traj is not None: origins = [] vectors = [] for idx in range(len(traj)): if idx != len(traj) - 1: origins.append([traj[idx][0], traj[idx][1], 1.5]) - vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) - rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) - rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) + vectors.append( + [traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0] + ) + rr.log( + "/direction", + rr.Arrows3D( + origins=origins, vectors=vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05 + ), + static=self.static, + ) + rr.log( + "/robot_start_pose", + rr.Points3D( + [start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1 + ), + static=self.static, + ) # self.write_to_pickle() return traj @@ -317,34 +376,41 @@ def sample_navigation(self, start, point): plt.imshow(obstacles) if point is None: start_pt = self.planner.to_pt(start) - plt.scatter(start_pt[1], start_pt[0], s = 10) + plt.scatter(start_pt[1], start_pt[0], s=10) return None goal = self.space.sample_target_point(start, point, self.planner) print("point:", point, "goal:", goal) obstacles, explored = self.voxel_map.get_2d_map() start_pt = self.planner.to_pt(start) - plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') + plt.scatter(start_pt[1], start_pt[0], s=15, c="b") point_pt = self.planner.to_pt(point) - plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'g') + plt.scatter(point_pt[1], point_pt[0], s=15, c="g") if goal is not None: goal_pt = self.planner.to_pt(goal) - plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'r') + plt.scatter(goal_pt[1], goal_pt[0], s=10, c="r") return goal - def sample_frontier(self, start_pose = [0, 0, 0], text = None): + def sample_frontier(self, start_pose=[0, 0, 0], text=None): with self.voxel_map_lock: - if text is not None and text != '': - index, time_heuristics, alignments_heuristics, total_heuristics = self.space.sample_exploration(start_pose, self.planner, None, None, debug = False) + if text is not None and text != "": + ( + index, + time_heuristics, + alignments_heuristics, + total_heuristics, + ) = self.space.sample_exploration(start_pose, self.planner, None, None, debug=False) else: - index, time_heuristics, _, total_heuristics = self.space.sample_exploration(start_pose, self.planner, None, None, debug = False) + index, time_heuristics, _, total_heuristics = self.space.sample_exploration( + start_pose, self.planner, None, None, debug=False + ) alignments_heuristics = time_heuristics - + obstacles, explored = self.voxel_map.get_2d_map() plt.clf() plt.imshow(obstacles * 0.5 + explored * 0.5) - plt.scatter(index[1], index[0], s = 20, c = 'r') + plt.scatter(index[1], index[0], s=20, c="r") return self.voxel_map.grid_coords_to_xyt(torch.tensor([index[0], index[1]])) - + def _recv_image(self): while True: # data = recv_array(self.img_socket) @@ -352,11 +418,11 @@ def _recv_image(self): start_time = time.time() self.process_rgbd_images(rgb, depth, intrinsics, pose) process_time = time.time() - start_time - print('Image processing takes', process_time, 'seconds') - - def 
add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights = None, threshold = 0.95): + print("Image processing takes", process_time, "seconds") + + def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights=None, threshold=0.95): # Adding all points to voxelizedPointCloud is useless and expensive, we should exclude threshold of all points - selected_indices = torch.randperm(len(valid_xyz))[:int((1 - threshold) * len(valid_xyz))] + selected_indices = torch.randperm(len(valid_xyz))[: int((1 - threshold) * len(valid_xyz))] if len(selected_indices) == 0: return if valid_xyz is not None: @@ -368,11 +434,13 @@ def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights = None, thresh if weights is not None: weights = weights[selected_indices] with self.voxel_map_lock: - self.voxel_map_localizer.add(points = valid_xyz, - features = feature, - rgb = valid_rgb, - weights = weights, - obs_count = self.obs_count) + self.voxel_map_localizer.add( + points=valid_xyz, + features=feature, + rgb=valid_rgb, + weights=weights, + obs_count=self.obs_count, + ) def process_rgbd_images(self, rgb, depth, intrinsics, pose): if not os.path.exists(self.log): @@ -380,69 +448,76 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): self.obs_count += 1 world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) - cv2.imwrite('debug/rgb' + str(self.obs_count) + '.jpg', rgb[:, :, [2, 1, 0]]) + cv2.imwrite("debug/rgb" + str(self.obs_count) + ".jpg", rgb[:, :, [2, 1, 0]]) rgb, depth = torch.from_numpy(rgb), torch.from_numpy(depth) rgb = rgb.permute(2, 0, 1).to(torch.uint8) - median_depth = torch.from_numpy( - scipy.ndimage.median_filter(depth, size=5) - ) + median_depth = torch.from_numpy(scipy.ndimage.median_filter(depth, size=5)) median_filter_error = (depth - median_depth).abs() valid_depth = torch.logical_and(depth < self.max_depth, depth > self.min_depth) - valid_depth = ( - valid_depth - & (median_filter_error < 0.01).bool() - ) - + valid_depth = valid_depth & (median_filter_error < 0.01).bool() + with self.voxel_map_lock: - self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose), min_samples_clear = 20) - self.voxel_map.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) - - if '_owl' in self.vision_method: + self.voxel_map_localizer.voxel_pcd.clear_points( + depth, torch.from_numpy(intrinsics), torch.from_numpy(pose), min_samples_clear=20 + ) + self.voxel_map.voxel_pcd.clear_points( + depth, torch.from_numpy(intrinsics), torch.from_numpy(pose) + ) + + if "_owl" in self.vision_method: self.run_llm_owl(rgb, ~valid_depth, world_xyz) self.voxel_map.add( - camera_pose = torch.Tensor(pose), - rgb = torch.Tensor(rgb).permute(1, 2, 0), - depth = torch.Tensor(depth), - camera_K = torch.Tensor(intrinsics) + camera_pose=torch.Tensor(pose), + rgb=torch.Tensor(rgb).permute(1, 2, 0), + depth=torch.Tensor(depth), + camera_K=torch.Tensor(intrinsics), ) obs, exp = self.voxel_map.get_2d_map() if self.rerun: if self.voxel_map.voxel_pcd._points is not None: - rr.log("Obstalce_map/pointcloud", rr.Points3D(self.voxel_map.voxel_pcd._points.detach().cpu(), \ - colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + rr.log( + "Obstalce_map/pointcloud", + rr.Points3D( + self.voxel_map.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) if self.voxel_map_localizer.voxel_pcd._points is not None: - 
rr.log("Semantic_memory/pointcloud", rr.Points3D(self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), \ - colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + rr.log( + "Semantic_memory/pointcloud", + rr.Points3D( + self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) else: - cv2.imwrite(self.log + '/debug_' + str(self.obs_count) + '.jpg', np.asarray(obs.int() * 127 + exp.int() * 127)) - + cv2.imwrite( + self.log + "/debug_" + str(self.obs_count) + ".jpg", + np.asarray(obs.int() * 127 + exp.int() * 127), + ) + def run_llm_owl(self, rgb, mask, world_xyz): valid_xyz = world_xyz[~mask] valid_rgb = rgb.permute(1, 2, 0)[~mask] if len(valid_xyz) != 0: self.add_to_voxel_pcd(valid_xyz, None, valid_rgb) - + def read_from_pickle(self, pickle_file_name, num_frames: int = -1): - print('Reading from ', pickle_file_name) - rr.init('Debug', spawn = True) + print("Reading from ", pickle_file_name) + rr.init("Debug", spawn=True) if isinstance(pickle_file_name, str): pickle_file_name = Path(pickle_file_name) assert pickle_file_name.exists(), f"No file found at {pickle_file_name}" with pickle_file_name.open("rb") as f: data = pickle.load(f) - for i, ( - camera_pose, - xyz, - rgb, - feats, - depth, - base_pose, - K, - world_xyz, - ) in enumerate( + for i, (camera_pose, xyz, rgb, feats, depth, base_pose, K, world_xyz,) in enumerate( zip( data["camera_poses"], data["xyz"], @@ -483,15 +558,18 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): self.voxel_map_localizer.voxel_pcd._rgb = data["combined_rgb"] self.voxel_map_localizer.voxel_pcd._obs_counts = data["obs_id"] self.voxel_map_localizer.voxel_pcd._entity_ids = data["entity_id"] - self.voxel_map_localizer.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() - self.voxel_map.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() - + self.voxel_map_localizer.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() + self.voxel_map.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() def write_to_pickle(self): """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" - if not os.path.exists('debug'): - os.mkdir('debug') - filename = 'debug/' + self.log + '.pkl' + if not os.path.exists("debug"): + os.mkdir("debug") + filename = "debug/" + self.log + ".pkl" data = {} data["camera_poses"] = [] data["camera_K"] = [] @@ -526,4 +604,4 @@ def write_to_pickle(self): data["entity_id"] = self.voxel_map_localizer.voxel_pcd._entity_ids with open(filename, "wb") as f: pickle.dump(data, f) - print('write all data to', filename) + print("write all data to", filename) diff --git a/src/stretch/dynav/mapping_utils/__init__.py b/src/stretch/dynav/mapping_utils/__init__.py index 5b9355132..6003d865a 100644 --- a/src/stretch/dynav/mapping_utils/__init__.py +++ b/src/stretch/dynav/mapping_utils/__init__.py @@ -1,8 +1,18 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
+ +from .a_star import AStar +from .voxel import SparseVoxelMap +from .voxel_map import SparseVoxelMapNavigationSpace + # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .voxelized_pcd import VoxelizedPointcloud, scatter3d -from .voxel import SparseVoxelMap -from .voxel_map import SparseVoxelMapNavigationSpace -from .a_star import AStar diff --git a/src/stretch/dynav/mapping_utils/a_star.py b/src/stretch/dynav/mapping_utils/a_star.py index d7172df71..727c3a303 100644 --- a/src/stretch/dynav/mapping_utils/a_star.py +++ b/src/stretch/dynav/mapping_utils/a_star.py @@ -1,26 +1,39 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import heapq +import math import time from random import random -from typing import Callable, List, Optional, Tuple, Set +from typing import Callable, List, Optional, Set, Tuple import numpy as np +import torch -from stretch.motion import PlanResult from stretch.dynav.mapping_utils import SparseVoxelMapNavigationSpace +from stretch.motion import PlanResult -import heapq -import math -import torch -class Node(): +class Node: """Stores an individual spot in the tree""" def __init__(self, state): self.state = state + def neighbors(pt: Tuple[int, int]) -> List[Tuple[int, int]]: - return [(pt[0] + dx, pt[1] + dy) for dx in range(-1, 2) for dy in range(-1, 2) if (dx, dy) != (0, 0)] + return [ + (pt[0] + dx, pt[1] + dy) for dx in range(-1, 2) for dy in range(-1, 2) if (dx, dy) != (0, 0) + ] -class AStar(): + +class AStar: """Define RRT planning problem and parameters""" def __init__( @@ -30,7 +43,7 @@ def __init__( """Create RRT planner with configuration""" self.space = space self.reset() - + def compute_theta(self, cur_x, cur_y, end_x, end_y): theta = 0 if end_x == cur_x and end_y >= cur_y: @@ -41,7 +54,7 @@ def compute_theta(self, cur_x, cur_y, end_x, end_y): theta = np.arctan((end_y - cur_y) / (end_x - cur_x)) if end_x < cur_x: theta = theta + np.pi - # move theta into [-pi, pi] range, for this function specifically, + # move theta into [-pi, pi] range, for this function specifically, # (theta -= 2 * pi) or (theta += 2 * pi) is enough if theta > np.pi: theta = theta - 2 * np.pi @@ -67,7 +80,7 @@ def verify_path(self, path) -> bool: def point_is_occupied(self, x: int, y: int) -> bool: return not bool(self._navigable[x][y]) - + def to_pt(self, xy: Tuple[float, float]) -> Tuple[int, int]: xy = torch.tensor([xy[0], xy[1]]) pt = self.space.voxel_map.xy_to_grid_coords(xy) @@ -96,9 +109,12 @@ def compute_obstacle_punishment(self, a: Tuple[int, int], weight: int, avoid: in return obstacle_punishment # A* heuristic - def compute_heuristic(self, a: Tuple[int, int], b: Tuple[int, int], weight = 6, avoid = 3) -> float: - return self.compute_dis(a, b) + weight * self.compute_obstacle_punishment(a, weight, avoid)\ + def compute_heuristic(self, a: Tuple[int, int], b: Tuple[int, int], weight=6, avoid=3) -> float: + return ( + self.compute_dis(a, b) + + weight * self.compute_obstacle_punishment(a, weight, avoid) + self.compute_obstacle_punishment(b, weight, avoid) + ) def is_in_line_of_sight(self, start_pt: Tuple[int, int], end_pt: Tuple[int, int]) -> bool: """Checks if 
there is a line-of-sight between two points. @@ -150,7 +166,9 @@ def clean_path_for_xy(self, waypoints): waypoints = [self.to_xy(waypt) for waypt in waypts] traj = [] for i in range(len(waypoints) - 1): - theta = self.compute_theta(waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1]) + theta = self.compute_theta( + waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1] + ) traj.append([waypoints[i][0], waypoints[i][1], float(theta)]) traj.append([waypoints[-1][0], waypoints[-1][1], goal[-1]]) return traj @@ -177,7 +195,7 @@ def clean_path(self, path) -> List[Tuple[int, int]]: if self.is_in_line_of_sight(path[i][:2], path[j][:2]): break else: - j = i + 1 + j = i + 1 # Include the mid waypoint to avoid the collision # if j - i >= 2: # cleaned_path.append(path[(i + j) // 2]) @@ -185,7 +203,7 @@ def clean_path(self, path) -> List[Tuple[int, int]]: i = j return cleaned_path - def get_unoccupied_neighbor(self, pt: Tuple[int, int], goal_pt = None) -> Tuple[int, int]: + def get_unoccupied_neighbor(self, pt: Tuple[int, int], goal_pt=None) -> Tuple[int, int]: if not self.point_is_occupied(*pt): return pt @@ -199,7 +217,9 @@ def get_unoccupied_neighbor(self, pt: Tuple[int, int], goal_pt = None) -> Tuple[ for neighbor_pt in neighbor_pts: if not self.point_is_occupied(*neighbor_pt): return neighbor_pt - print("The robot might stand on a non navigable point, so check obstacle map and restart roslaunch") + print( + "The robot might stand on a non navigable point, so check obstacle map and restart roslaunch" + ) return None # raise ValueError("The robot might stand on a non navigable point, so check obstacle map and restart roslaunch") @@ -305,20 +325,22 @@ def plan(self, start, goal, verbose: bool = True) -> PlanResult: if not self.space.is_valid(goal): if verbose: print("[Planner] invalid goal") - return PlanResult(False, reason = "[Planner] invalid goal") + return PlanResult(False, reason="[Planner] invalid goal") # Add start to the tree # print('Start running A* ', time.time() - self.start_time, ' seconds after path planning starts') - waypoints = self.run_astar(start[:2], goal[:2]) + waypoints = self.run_astar(start[:2], goal[:2]) # print('Finish running A* ', time.time() - self.start_time, ' seconds after path planning starts') if waypoints is None: if verbose: print("A* fails, check obstacle map") - return PlanResult(False, reason = "A* fails, check obstacle map") + return PlanResult(False, reason="A* fails, check obstacle map") trajectory = [] for i in range(len(waypoints) - 1): - theta = self.compute_theta(waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1]) + theta = self.compute_theta( + waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1] + ) trajectory.append(Node([waypoints[i][0], waypoints[i][1], float(theta)])) trajectory.append(Node([waypoints[-1][0], waypoints[-1][1], goal[-1]])) # print('Finish computing theta ', time.time() - self.start_time, ' seconds after path planning starts') - return PlanResult(True, trajectory = trajectory) \ No newline at end of file + return PlanResult(True, trajectory=trajectory) diff --git a/src/stretch/dynav/mapping_utils/voxel.py b/src/stretch/dynav/mapping_utils/voxel.py index 9fdbd6334..625e3cc98 100644 --- a/src/stretch/dynav/mapping_utils/voxel.py +++ b/src/stretch/dynav/mapping_utils/voxel.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. 
+# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the @@ -5,36 +14,34 @@ import copy import logging import math +import os import pickle +import time from collections import namedtuple from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +import cv2 import numpy as np import open3d as open3d import scipy import skimage import torch +import torch.nn.functional as F +import torchvision.transforms.functional as V import trimesh +from scipy.ndimage import gaussian_filter, maximum_filter from torch import Tensor +from torchvision import transforms from stretch.core.interfaces import Observations +from stretch.dynav.mapping_utils import VoxelizedPointcloud, scatter3d from stretch.motion import Footprint, PlanResult, RobotModel from stretch.utils.data_tools.dict import update from stretch.utils.morphology import binary_dilation, binary_erosion, get_edges from stretch.utils.point_cloud import create_visualization_geometries, numpy_to_pcd from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates from stretch.utils.visualization import create_disk -from stretch.dynav.mapping_utils import VoxelizedPointcloud, scatter3d - -import torch.nn.functional as F -import torchvision.transforms.functional as V -from torchvision import transforms -import os -import cv2 -import time - -from scipy.ndimage import gaussian_filter, maximum_filter Frame = namedtuple( "Frame", @@ -129,7 +136,9 @@ def __init__( resolution(float): in meters, size of a voxel feature_dim(int): size of feature embeddings to capture per-voxel point """ - print('------------------------YOU ARE NOW RUNNING PEIQI VOXEL MAP CODES V3-----------------') + print( + "------------------------YOU ARE NOW RUNNING PEIQI VOXEL MAP CODES V3-----------------" + ) # TODO: We an use fastai.store_attr() to get rid of this boilerplate code self.resolution = resolution self.feature_dim = feature_dim @@ -248,11 +257,9 @@ def add_obs( depth = self.fix_data_type(obs.depth) xyz = self.fix_data_type(obs.xyz) camera_pose = self.fix_data_type(obs.camera_pose) - base_pose = torch.from_numpy( - np.array([obs.gps[0], obs.gps[1], obs.compass[0]]) - ).float() + base_pose = torch.from_numpy(np.array([obs.gps[0], obs.gps[1], obs.compass[0]])).float() K = self.fix_data_type(obs.camera_K) if camera_K is None else camera_K - + self.add( camera_pose=camera_pose, xyz=xyz, @@ -317,9 +324,7 @@ def add( assert ( feats.shape[-1] == self.feature_dim ), f"features must match voxel feature dimenstionality of {self.feature_dim}" - assert ( - xyz.shape[0] == feats.shape[0] - ), "features must be available for each point" + assert xyz.shape[0] == feats.shape[0], "features must be available for each point" else: pass if isinstance(xyz, np.ndarray): @@ -329,9 +334,7 @@ def add( if camera_K is not None: assert camera_K.ndim == 2, "camera intrinsics K must be a 3x3 matrix" assert ( - camera_pose.ndim == 2 - and camera_pose.shape[0] == 4 - and camera_pose.shape[1] == 4 + camera_pose.ndim == 2 and camera_pose.shape[0] == 4 and camera_pose.shape[1] == 4 ), "Camera pose must be a 4x4 matrix representing a pose in SE(3)" assert ( xyz_frame in VALID_FRAMES @@ -349,8 +352,7 @@ def add( if xyz is 
not None: if xyz_frame == "camera": full_world_xyz = ( - torch.cat([xyz, torch.ones_like(xyz[..., [0]])], dim=-1) - @ camera_pose.T + torch.cat([xyz, torch.ones_like(xyz[..., [0]])], dim=-1) @ camera_pose.T )[..., :3] elif xyz_frame == "world": full_world_xyz = xyz @@ -389,8 +391,7 @@ def add( if self.use_median_filter: valid_depth = ( - valid_depth - & (median_filter_error < self.median_filter_max_error).bool() + valid_depth & (median_filter_error < self.median_filter_max_error).bool() ) # Add to voxel grid @@ -401,7 +402,9 @@ def add( # TODO: weights could also be confidence, inv distance from camera, etc if world_xyz.nelement() > 0: - selected_indices = torch.randperm(len(world_xyz))[:int((1 - self.point_update_threshold) * len(world_xyz))] + selected_indices = torch.randperm(len(world_xyz))[ + : int((1 - self.point_update_threshold) * len(world_xyz)) + ] if len(selected_indices) == 0: return if world_xyz is not None: @@ -524,16 +527,7 @@ def read_from_pickle(self, filename: str, num_frames: int = -1): assert filename.exists(), f"No file found at {filename}" with filename.open("rb") as f: data = pickle.load(f) - for i, ( - camera_pose, - xyz, - rgb, - feats, - depth, - base_pose, - K, - world_xyz, - ) in enumerate( + for i, (camera_pose, xyz, rgb, feats, depth, base_pose, K, world_xyz,) in enumerate( zip( data["camera_poses"], data["xyz"], @@ -586,9 +580,7 @@ def recompute_map(self): **frame.info, ) - def get_2d_alignment_heuristics( - self, voxel_map_localizer, text, debug: bool = False - ): + def get_2d_alignment_heuristics(self, voxel_map_localizer, text, debug: bool = False): if voxel_map_localizer.voxel_pcd._points is None: return None # Convert metric measurements to discrete @@ -615,17 +607,22 @@ def get_2d_alignment_heuristics( alignment_heuristics = scatter3d(xyz, alignments, grid_size, "max") alignment_heuristics = torch.max(alignment_heuristics, dim=-1).values - alignment_heuristics = torch.from_numpy(maximum_filter(alignment_heuristics.numpy(), size = 7)) + alignment_heuristics = torch.from_numpy( + maximum_filter(alignment_heuristics.numpy(), size=7) + ) return alignment_heuristics - def get_2d_map( self, debug: bool = False, return_history_id: bool = False ) -> Tuple[np.ndarray, ...]: """Get 2d map with explored area and frontiers.""" # Is this already cached? 
If so we don't need to go to all this work - if self._map2d is not None and self._history_soft is not None and self._seq == self._2d_last_updated: + if ( + self._map2d is not None + and self._history_soft is not None + and self._seq == self._2d_last_updated + ): return self._map2d if not return_history_id else (*self._map2d, self._history_soft) # Convert metric measurements to discrete @@ -669,7 +666,7 @@ def get_2d_map( # history_ids = history_ids[:, :, min_height:max_height] history_soft = torch.max(history_ids, dim=-1).values - history_soft = torch.from_numpy(maximum_filter(history_soft.float().numpy(), size = 5)) + history_soft = torch.from_numpy(maximum_filter(history_soft.float().numpy(), size=5)) if self._remove_visited_from_obstacles: # Remove "visited" points containing observations of the robot @@ -700,11 +697,9 @@ def get_2d_map( if self.smooth_kernel_size > 0: # Opening and closing operations here on explore explored = binary_erosion( - binary_dilation( - explored.float().unsqueeze(0).unsqueeze(0), self.smooth_kernel - ), + binary_dilation(explored.float().unsqueeze(0).unsqueeze(0), self.smooth_kernel), self.smooth_kernel, - ) #[0, 0].bool() + ) # [0, 0].bool() explored = binary_dilation( binary_erosion(explored, self.smooth_kernel), self.smooth_kernel, @@ -761,16 +756,12 @@ def xy_to_grid_coords(self, xy: torch.Tensor) -> Optional[np.ndarray]: if isinstance(xy, np.ndarray): xy = torch.from_numpy(xy).float() grid_xy = (xy / self.grid_resolution) + self.grid_origin[:2] - if torch.any(grid_xy >= self._grid_size_t) or torch.any( - grid_xy < torch.zeros(2) - ): + if torch.any(grid_xy >= self._grid_size_t) or torch.any(grid_xy < torch.zeros(2)): return None else: return grid_xy.int() - def plan_to_grid_coords( - self, plan_result: PlanResult - ) -> Optional[List[torch.Tensor]]: + def plan_to_grid_coords(self, plan_result: PlanResult) -> Optional[List[torch.Tensor]]: """Convert a plan properly into grid coordinates""" if not plan_result.success: return None @@ -791,16 +782,12 @@ def grid_coords_to_xyt(self, grid_coords: np.ndarray) -> np.ndarray: res[:2] = self.grid_coords_to_xy(grid_coords) return res - def show( - self, backend: str = "open3d", **backend_kwargs - ) -> Tuple[np.ndarray, np.ndarray]: + def show(self, backend: str = "open3d", **backend_kwargs) -> Tuple[np.ndarray, np.ndarray]: """Display the aggregated point cloud.""" if backend == "open3d": return self._show_open3d(**backend_kwargs) else: - raise NotImplementedError( - f"Uknown backend {backend}, must be 'open3d' or 'pytorch3d" - ) + raise NotImplementedError(f"Uknown backend {backend}, must be 'open3d' or 'pytorch3d") def get_xyz_rgb(self) -> Tuple[torch.Tensor, torch.Tensor]: """Return xyz and rgb of the current map""" @@ -819,9 +806,7 @@ def sample_from_mask(self, mask: torch.Tensor) -> Optional[np.ndarray]: def xyt_is_safe(self, xyt: np.ndarray, robot: Optional[RobotModel] = None) -> bool: """Check to see if a given xyt position is known to be safe.""" if robot is not None: - raise NotImplementedError( - "not currently checking against robot base geometry" - ) + raise NotImplementedError("not currently checking against robot base geometry") obstacles, explored = self.get_2d_map() # Convert xy to grid coords grid_xy = self.xy_to_grid_coords(xyt[:2]) @@ -904,9 +889,7 @@ def _get_open3d_geometries( # Create a combined point cloud # pc_xyz, pc_rgb, pc_feats = self.get_data() points, _, _, rgb = self.voxel_pcd.get_pointcloud() - pcd = numpy_to_pcd( - points.detach().cpu().numpy(), (rgb / norm).detach().cpu().numpy() - 
) + pcd = numpy_to_pcd(points.detach().cpu().numpy(), (rgb / norm).detach().cpu().numpy()) if orig is None: orig = np.zeros(3) geoms = create_visualization_geometries(pcd=pcd, orig=orig) @@ -938,9 +921,7 @@ def _get_o3d_robot_footprint_geometry( """Get a 3d mesh cube for the footprint of the robot. But this does not work very well for whatever reason.""" # Define the dimensions of the cube if dimensions is None: - dimensions = np.array( - [0.2, 0.2, 0.2] - ) # Cube dimensions (length, width, height) + dimensions = np.array([0.2, 0.2, 0.2]) # Cube dimensions (length, width, height) x, y, theta = xyt # theta = theta - np.pi/2 @@ -964,9 +945,7 @@ def _get_o3d_robot_footprint_geometry( y_offset = -1 * dimensions[1] / 2 dx = (math.cos(theta) * x_offset) + (math.cos(theta - np.pi / 2) * y_offset) dy = (math.sin(theta + np.pi / 2) * y_offset) + (math.sin(theta) * x_offset) - translation_vector = np.array( - [x + dx, y + dy, 0] - ) # Apply offset based on theta + translation_vector = np.array([x + dx, y + dy, 0]) # Apply offset based on theta transformation_matrix = np.identity(4) transformation_matrix[:3, :3] = rotation_matrix transformation_matrix[:3, 3] = translation_vector @@ -990,9 +969,7 @@ def _show_open3d( """Show and return bounding box information and rgb color information from an explored point cloud. Uses open3d.""" # get geometries so we can use them - geoms = self._get_open3d_geometries( - orig, norm, xyt=xyt, footprint=footprint - ) + geoms = self._get_open3d_geometries(orig, norm, xyt=xyt, footprint=footprint) # Show the geometries of where we have explored open3d.visualization.draw_geometries(geoms) diff --git a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py index ed834a8de..9b0bb8316 100644 --- a/src/stretch/dynav/mapping_utils/voxel_map.py +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + # Copyright (c) Meta Platforms, Inc. and affiliates. 
# # This source code is licensed under the MIT license found in the @@ -23,6 +32,7 @@ get_edges, ) + class SparseVoxelMapNavigationSpace(XYT): """subclass for sampling XYT states from explored space""" @@ -40,7 +50,9 @@ def __init__( dilate_obstacle_size: int = 2, extend_mode: str = "separate", ): - print('------------------------YOU ARE NOW RUNNING PEIQI VOXEL NAVIGATION SPACE CODES-----------------') + print( + "------------------------YOU ARE NOW RUNNING PEIQI VOXEL NAVIGATION SPACE CODES-----------------" + ) self.step_size = step_size self.rotation_step_size = rotation_step_size self.voxel_map = voxel_map @@ -90,9 +102,7 @@ def draw_state_on_grid( img[x0:x1, y0:y1] += mask * weight return img - def create_collision_masks( - self, orientation_resolution: int, show_all: bool = False - ): + def create_collision_masks(self, orientation_resolution: int, show_all: bool = False): """Create a set of orientation masks Args: @@ -134,9 +144,7 @@ def extend(self, q0: np.ndarray, q1: np.ndarray) -> np.ndarray: separate: move then rotate joint: move and rotate all at once.""" assert len(q0) == 3, f"initial configuration must be 3d, was {q0}" - assert ( - len(q1) == 3 or len(q1) == 2 - ), f"final configuration can be 2d or 3d, was {q1}" + assert len(q1) == 3 or len(q1) == 2, f"final configuration can be 2d or 3d, was {q1}" if self.extend_mode == "separate": return self._extend_separate(q0, q1) elif self.extend_mode == "joint": @@ -145,15 +153,11 @@ def extend(self, q0: np.ndarray, q1: np.ndarray) -> np.ndarray: else: raise NotImplementedError(f"not supported: {self.extend_mode=}") - def _extend_separate( - self, q0: np.ndarray, q1: np.ndarray, xy_tol: float = 1e-8 - ) -> np.ndarray: + def _extend_separate(self, q0: np.ndarray, q1: np.ndarray, xy_tol: float = 1e-8) -> np.ndarray: """extend towards another configuration in this space. 
TODO: we can set the classes here, right now assuming still np.ndarray""" assert len(q0) == 3, f"initial configuration must be 3d, was {q0}" - assert ( - len(q1) == 3 or len(q1) == 2 - ), f"final configuration can be 2d or 3d, was {q1}" + assert len(q1) == 3 or len(q1) == 2, f"final configuration can be 2d or 3d, was {q1}" dxy = q1[:2] - q0[:2] step = dxy / np.linalg.norm(dxy + self.tolerance) * self.step_size xy = np.copy(q0[:2]) @@ -175,9 +179,7 @@ def _extend_separate( angle_diff = angle_difference(new_theta, cur_theta) while angle_diff > self.rotation_step_size: # Interpolate - cur_theta = interpolate_angles( - cur_theta, new_theta, self.rotation_step_size - ) + cur_theta = interpolate_angles(cur_theta, new_theta, self.rotation_step_size) # print("interp ang =", cur_theta, "from =", cur_theta, "to =", new_theta) yield np.array([xy[0], xy[1], cur_theta]) angle_diff = angle_difference(new_theta, cur_theta) @@ -221,9 +223,7 @@ def _get_theta_index(self, theta: float) -> int: theta += 2 * np.pi if theta >= 2 * np.pi: theta -= 2 * np.pi - assert ( - theta >= 0 and theta <= 2 * np.pi - ), "only angles between 0 and 2*PI allowed" + assert theta >= 0 and theta <= 2 * np.pi, "only angles between 0 and 2*PI allowed" theta_idx = np.round((theta / (2 * np.pi) * self._orientation_resolution) - 0.5) if theta_idx == self._orientation_resolution: theta_idx = 0 @@ -271,9 +271,7 @@ def is_valid( collision = torch.any(crop_obs & mask) - p_is_safe = ( - torch.sum((crop_exp & mask) | ~mask) / (mask.shape[0] * mask.shape[1]) - ).item() + p_is_safe = (torch.sum((crop_exp & mask) | ~mask) / (mask.shape[0] * mask.shape[1])).item() is_safe = p_is_safe >= is_safe_threshold if verbose: print(f"{collision=}, {is_safe=}, {p_is_safe=}, {is_safe_threshold=}") @@ -322,11 +320,7 @@ def compute_theta(self, cur_x, cur_y, end_x, end_y): return theta def sample_target_point( - self, - start: torch.Tensor, - point: torch.Tensor, - planner, - exploration: bool = False + self, start: torch.Tensor, point: torch.Tensor, planner, exploration: bool = False ) -> Optional[np.array]: """Sample a position near the mask and return. 
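The next hunk reformats the core of `sample_target_point`, which ranks every reachable grid cell by its Euclidean distance to the target cell (via `topk(k=len(xs), largest=False)`) and then walks the candidates nearest-first until one admits a valid heading. A standalone sketch of that ranking step (names are illustrative, not part of the patch):

```
import torch


def rank_cells_by_distance(reachable_xy: torch.Tensor, target_xy: torch.Tensor) -> torch.Tensor:
    """Sort (N, 2) grid cells nearest-first relative to a (2,) target cell."""
    dists = torch.linalg.norm((reachable_xy - target_xy).float(), dim=-1)
    order = dists.topk(k=len(reachable_xy), largest=False).indices
    return reachable_xy[order]


# The candidate closest to (5, 5) comes out first.
cells = torch.tensor([[0, 0], [4, 5], [9, 9]])
print(rank_cells_by_distance(cells, torch.tensor([5, 5])))  # [[4, 5], [9, 9], [0, 0]]
```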
@@ -340,14 +334,14 @@ def sample_target_point( start_pt = planner.to_pt(start) reachable_points = planner.get_reachable_points(start_pt) if len(reachable_points) == 0: - print('No target point find, maybe no point is reachable') + print("No target point find, maybe no point is reachable") return None reachable_xs, reachable_ys = zip(*reachable_points) reachable_xs = torch.tensor(reachable_xs) reachable_ys = torch.tensor(reachable_ys) - reachable = torch.empty(obstacles.shape, dtype = torch.bool).fill_(False) + reachable = torch.empty(obstacles.shape, dtype=torch.bool).fill_(False) reachable[reachable_xs, reachable_ys] = True - + obstacles, explored = self.voxel_map.get_2d_map() # if self.dilate_obstacles_kernel is not None: # obstacles = binary_dilation( @@ -359,12 +353,15 @@ def sample_target_point( xs, ys = torch.where(reachable) if len(xs) < 1: - print('No target point find, maybe no point is reachable') + print("No target point find, maybe no point is reachable") return None - selected_targets = torch.stack([xs, ys], dim = -1) \ - [torch.linalg.norm( \ - (torch.stack([xs, ys], dim = -1) - torch.tensor([target_x, target_y])).float(), dim = -1 \ - ).topk(k = len(xs), largest = False).indices] + selected_targets = torch.stack([xs, ys], dim=-1)[ + torch.linalg.norm( + (torch.stack([xs, ys], dim=-1) - torch.tensor([target_x, target_y])).float(), dim=-1 + ) + .topk(k=len(xs), largest=False) + .indices + ] # TODO: was this: # expanded_mask = expanded_mask & less_explored & ~obstacles @@ -404,7 +401,7 @@ def sample_target_point( # target_is_valid = False if not target_is_valid: continue - + return np.array([selected_x, selected_y, theta]) return None @@ -441,17 +438,13 @@ def sample_near_mask( import matplotlib.pyplot as plt plt.imshow( - mask.int() * 20 - + expanded_mask.int() * 10 - + explored.int() - + obstacles.int() * 5 + mask.int() * 20 + expanded_mask.int() * 10 + explored.int() + obstacles.int() * 5 ) # import datetime # current_datetime = datetime.datetime.now() # formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S") # plt.savefig('debug_' + formatted_datetime + '.png') - # Where can the robot go? 
valid_indices = torch.nonzero(expanded_mask, as_tuple=False) if valid_indices.size(0) == 0: @@ -467,9 +460,7 @@ def sample_near_mask( point_grid_coords = valid_indices[random_index] if look_at_any_point: - outside_point = find_closest_point_on_mask( - mask, point_grid_coords.float() - ) + outside_point = find_closest_point_on_mask(mask, point_grid_coords.float()) # convert back point = self.voxel_map.grid_coords_to_xy(point_grid_coords) @@ -530,10 +521,7 @@ def _get_kernel(self, size: int): return None if size not in self._kernels: kernel = torch.nn.Parameter( - torch.from_numpy(skimage.morphology.disk(size)) - .unsqueeze(0) - .unsqueeze(0) - .float(), + torch.from_numpy(skimage.morphology.disk(size)).unsqueeze(0).unsqueeze(0).float(), requires_grad=False, ) self._kernels[size] = kernel @@ -606,12 +594,12 @@ def sample_exploration( self, xyt, planner, - voxel_map_localizer = None, - text = None, - debug = False, + voxel_map_localizer=None, + text=None, + debug=False, ): - obstacles, explored, history_soft = self.voxel_map.get_2d_map(return_history_id = True) - if len(xyt) == 3: + obstacles, explored, history_soft = self.voxel_map.get_2d_map(return_history_id=True) + if len(xyt) == 3: xyt = xyt[:2] reachable_points = planner.get_reachable_points(planner.to_pt(xyt)) reachable_xs, reachable_ys = zip(*reachable_points) @@ -632,10 +620,14 @@ def sample_exploration( else: expanded_frontier = edges outside_frontier = expanded_frontier & ~reachable_map - time_heuristics = self._time_heuristic(history_soft, outside_frontier, debug = debug) + time_heuristics = self._time_heuristic(history_soft, outside_frontier, debug=debug) if voxel_map_localizer is not None: - alignments_heuristics = self.voxel_map.get_2d_alignment_heuristics(voxel_map_localizer, text) - alignments_heuristics = self._alignment_heuristic(alignments_heuristics, outside_frontier, debug = debug) + alignments_heuristics = self.voxel_map.get_2d_alignment_heuristics( + voxel_map_localizer, text + ) + alignments_heuristics = self._alignment_heuristic( + alignments_heuristics, outside_frontier, debug=debug + ) total_heuristics = time_heuristics + alignments_heuristics else: alignments_heuristics = None @@ -644,47 +636,61 @@ def sample_exploration( rounded_heuristics = np.ceil(total_heuristics * 100) / 100 max_heuristic = rounded_heuristics.max() indices = np.column_stack(np.where(rounded_heuristics == max_heuristic)) - closest_index = np.argmin(np.linalg.norm(indices - np.asarray(planner.to_pt([0, 0, 0])), axis = -1)) + closest_index = np.argmin( + np.linalg.norm(indices - np.asarray(planner.to_pt([0, 0, 0])), axis=-1) + ) index = indices[closest_index] # index = np.unravel_index(np.argmax(total_heuristics), total_heuristics.shape) # debug = True if debug: from matplotlib import pyplot as plt + plt.subplot(221) plt.imshow(obstacles.int() * 5 + outside_frontier.int() * 10) plt.subplot(222) plt.imshow(explored.int() * 5) plt.subplot(223) plt.imshow(total_heuristics) - plt.scatter(index[1], index[0], s = 15, c = 'g') + plt.scatter(index[1], index[0], s=15, c="g") plt.show() return index, time_heuristics, alignments_heuristics, total_heuristics - - def _alignment_heuristic(self, alignments, outside_frontier, alignment_smooth = 50, alignment_threshold = 0.12, debug = False): + + def _alignment_heuristic( + self, + alignments, + outside_frontier, + alignment_smooth=50, + alignment_threshold=0.12, + debug=False, + ): alignments = np.ma.masked_array(alignments, ~outside_frontier) - alignment_heuristics = 1 / (1 + np.exp(-alignment_smooth * 
(alignments - alignment_threshold))) + alignment_heuristics = 1 / ( + 1 + np.exp(-alignment_smooth * (alignments - alignment_threshold)) + ) index = np.unravel_index(np.argmax(alignment_heuristics), alignments.shape) if debug: plt.clf() - plt.title('alignment') + plt.title("alignment") plt.imshow(alignment_heuristics) - plt.scatter(index[1], index[0], s = 15, c = 'g') + plt.scatter(index[1], index[0], s=15, c="g") plt.show() return alignment_heuristics - def _time_heuristic(self, history_soft, outside_frontier, time_smooth = 0.1, time_threshold = 15, debug = False): + def _time_heuristic( + self, history_soft, outside_frontier, time_smooth=0.1, time_threshold=15, debug=False + ): history_soft = np.ma.masked_array(history_soft, ~outside_frontier) time_heuristics = history_soft.max() - history_soft - time_heuristics[history_soft < 1] = float('inf') + time_heuristics[history_soft < 1] = float("inf") time_heuristics = 1 / (1 + np.exp(-time_smooth * (time_heuristics - time_threshold))) index = np.unravel_index(np.argmax(time_heuristics), history_soft.shape) # return index # debug = True if debug: # plt.clf() - plt.title('time') + plt.title("time") plt.imshow(time_heuristics) - plt.scatter(index[1], index[0], s = 15, c = 'r') + plt.scatter(index[1], index[0], s=15, c="r") plt.show() return time_heuristics @@ -706,9 +712,7 @@ def sample_closest_frontier( debug(bool): show visualizations of frontiers step_dist(float): how far apart in geo dist these points should be """ - assert ( - len(xyt) == 2 or len(xyt) == 3 - ), f"xyt must be of size 2 or 3 instead of {len(xyt)}" + assert len(xyt) == 2 or len(xyt) == 3, f"xyt must be of size 2 or 3 instead of {len(xyt)}" frontier, outside_frontier, traversible = self.get_frontier( expand_size=expand_size, debug=debug @@ -755,9 +759,7 @@ def sample_closest_frontier( prev_dist = dist point_grid_coords = torch.FloatTensor([[x, y]]) - outside_point = find_closest_point_on_mask( - outside_frontier, point_grid_coords - ) + outside_point = find_closest_point_on_mask(outside_frontier, point_grid_coords) if outside_point is None: print( diff --git a/src/stretch/dynav/mapping_utils/voxelized_pcd.py b/src/stretch/dynav/mapping_utils/voxelized_pcd.py index 56ca55e4b..e3cb35a67 100644 --- a/src/stretch/dynav/mapping_utils/voxelized_pcd.py +++ b/src/stretch/dynav/mapping_utils/voxelized_pcd.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + """ This file implements VoxelizedPointcloud module in home-robot project (https://github.com/facebookresearch/home-robot). 
Adapted to be used in ok-robot's navigation voxel map: @@ -35,6 +44,7 @@ import numpy as np import torch from torch import Tensor + USE_TORCH_GEOMETRIC = True if USE_TORCH_GEOMETRIC: from torch_geometric.nn.pool.consecutive import consecutive_cluster @@ -55,7 +65,9 @@ def project_points(points_3d, K, pose): pose = torch.Tensor(pose) pose = pose.to(points_3d) # Convert points to homogeneous coordinates - points_3d_homogeneous = torch.hstack((points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d))) + points_3d_homogeneous = torch.hstack( + (points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d)) + ) # Transform points into camera coordinate system points_camera_homogeneous = torch.matmul(torch.linalg.inv(pose), points_3d_homogeneous.T).T @@ -73,7 +85,9 @@ def get_depth_values(points_3d, pose): if not isinstance(pose, torch.Tensor): pose = torch.Tensor(pose) pose = pose.to(points_3d) - points_3d_homogeneous = torch.hstack((points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d))) + points_3d_homogeneous = torch.hstack( + (points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d)) + ) # Transform points into camera coordinate system points_camera_homogeneous = torch.matmul(torch.linalg.inv(pose), points_3d_homogeneous.T).T @@ -136,7 +150,7 @@ def reset(self): self._maxs = self.dim_maxs self.obs_count = 1 - def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_samples_clear = None): + def clear_points(self, depth, intrinsics, pose, depth_is_valid=None, min_samples_clear=None): if self._points is not None: xys = project_points(self._points.detach().cpu(), intrinsics, pose).int() xys = xys[:, [1, 0]] @@ -146,37 +160,48 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_sampl # Some points are projected to (i, j) on image plane and i, j might be smaller than 0 or greater than image size # which will lead to Index Error. 
valid_xys = xys.clone() - valid_xys[torch.any(torch.stack([ - xys[:, 0] < 0, - xys[:, 0] >= H, - xys[:, 1] < 0, - xys[:, 1] >= W, - ], dim = 0), dim = 0)] = 0 + valid_xys[ + torch.any( + torch.stack( + [ + xys[:, 0] < 0, + xys[:, 0] >= H, + xys[:, 1] < 0, + xys[:, 1] >= W, + ], + dim=0, + ), + dim=0, + ) + ] = 0 indices = torch.any( torch.stack( [ # the points are projected outside image frame - xys[:, 0] < 0, xys[:, 0] >= H, - xys[:, 1] < 0, xys[:, 1] >= W, + xys[:, 0] < 0, + xys[:, 0] >= H, + xys[:, 1] < 0, + xys[:, 1] >= W, # the points are projected to the image frame but is blocked by some obstacles - depth[valid_xys[:, 0], valid_xys[:, 1]] < (proj_depth - 0.1), + depth[valid_xys[:, 0], valid_xys[:, 1]] < (proj_depth - 0.1), # the points are projected to the image frame but they are behind camera depth[valid_xys[:, 0], valid_xys[:, 1]] < 0.01, proj_depth < 0.01, # depth is too large # (~depth_is_valid)[valid_xys[:, 0], valid_xys[:, 1]], - proj_depth > 2.0 + proj_depth > 2.0, ], - dim = 0 + dim=0, ), - dim = 0) + dim=0, + ) # if self._entity_ids is not None: # removed_entities, removed_counts = torch.unique(self._entity_ids.detach().cpu()[~indices], return_counts = True) # total_counts = torch.bincount(self._entity_ids.detach().cpu()) # entities_to_be_removed = removed_entities[(removed_counts > total_counts[removed_entities] * 0.6) | (total_counts[removed_entities] - removed_counts < 5)] # indices = indices & ~torch.isin(self._entity_ids.detach().cpu(), entities_to_be_removed) - + # print('Clearing non valid points...') # print('Removing ' + str((~indices).sum().item()) + ' points.') indices = indices.to(self._points.device) @@ -184,7 +209,7 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_sampl if self._features is not None: self._features = self._features[indices] if self._weights is not None: - self._weights= self._weights[indices] + self._weights = self._weights[indices] if self._rgb is not None: self._rgb = self._rgb[indices] if self._obs_counts is not None: @@ -194,7 +219,13 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_sampl if self._entity_ids is not None and min_samples_clear is not None: dbscan = DBSCAN(eps=self.voxel_size * 4, min_samples=min_samples_clear) - cluster_vertices = torch.cat((self._points.detach().cpu(), self._entity_ids.detach().cpu().reshape(-1,1) * 1000), -1).numpy() + cluster_vertices = torch.cat( + ( + self._points.detach().cpu(), + self._entity_ids.detach().cpu().reshape(-1, 1) * 1000, + ), + -1, + ).numpy() clusters = dbscan.fit(cluster_vertices) labels = clusters.labels_ indices = labels != -1 @@ -202,7 +233,7 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_sampl if self._features is not None: self._features = self._features[indices] if self._weights is not None: - self._weights= self._weights[indices] + self._weights = self._weights[indices] if self._rgb is not None: self._rgb = self._rgb[indices] if self._obs_counts is not None: @@ -217,7 +248,7 @@ def add( rgb: Optional[Tensor], weights: Optional[Tensor] = None, obs_count: Optional[int] = None, - entity_id: Optional[int] = None + entity_id: Optional[int] = None, ): """Add a feature pointcloud to the voxel grid. 
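The `add()` path continued below concatenates the new points onto the stored cloud and then re-voxelizes everything: points are quantized to voxel indices, grouped per voxel, and reduced to weighted centroids by `reduce_pointcloud`. A dependency-free sketch of that grouping and reduction (this file itself uses torch_geometric's `voxel_grid`/`consecutive_cluster` and scatter utilities; the names here are illustrative):

```
import torch


def voxel_downsample(points: torch.Tensor, weights: torch.Tensor, voxel_size: float):
    """Collapse an (N, 3) cloud so each occupied voxel keeps one weighted centroid."""
    coords = torch.floor(points / voxel_size).long()
    _, cluster = torch.unique(coords, dim=0, return_inverse=True)
    n_clusters = int(cluster.max()) + 1

    weight_sum = torch.zeros(n_clusters).index_add_(0, cluster, weights)
    point_sum = torch.zeros(n_clusters, 3).index_add_(0, cluster, points * weights[:, None])
    return point_sum / weight_sum[:, None], weight_sum


pts = torch.tensor([[0.01, 0.0, 0.0], [0.02, 0.0, 0.0], [1.0, 1.0, 1.0]])
centroids, w = voxel_downsample(pts, torch.ones(3), voxel_size=0.1)
print(centroids)  # one centroid near [0.015, 0, 0], one at [1, 1, 1]
```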
@@ -239,7 +270,7 @@ def add( else: obs_count = torch.ones_like(weights) * obs_count if entity_id is None: - entity_id = torch.ones_like(weights) * self.obs_count + entity_id = torch.ones_like(weights) * self.obs_count else: obs_count = torch.ones_like(weights) * entity_id self.obs_count += 1 @@ -263,21 +294,15 @@ def add( self._mins, self._maxs = pos_mins, pos_maxs # recompute_voxels = True else: - assert ( - self._maxs is not None - ), "How did self._mins get set without self._maxs?" + assert self._maxs is not None, "How did self._mins get set without self._maxs?" # recompute_voxels = torch.any(pos_mins < self._mins) or torch.any(self._maxs < pos_maxs) self._mins = torch.min(self._mins, pos_mins) self._maxs = torch.max(self._maxs, pos_maxs) if self._points is None: - assert ( - self._features is None - ), "How did self._points get unset while _features is set?" + assert self._features is None, "How did self._points get unset while _features is set?" # assert self._rgbs is None, "How did self._points get unset while _rgbs is set?" - assert ( - self._weights is None - ), "How did self._points get unset while _weights is set?" + assert self._weights is None, "How did self._points get unset while _weights is set?" all_points, all_features, all_weights, all_rgb = ( points, features, @@ -291,9 +316,7 @@ def add( all_points = torch.cat([self._points, points], dim=0) all_weights = torch.cat([self._weights, weights], dim=0) all_features = ( - torch.cat([self._features, features], dim=0) - if (features is not None) - else None + torch.cat([self._features, features], dim=0) if (features is not None) else None ) all_rgb = torch.cat([self._rgb, rgb], dim=0) if (rgb is not None) else None all_obs_counts = torch.cat([self._obs_counts, obs_count], dim=0) @@ -306,14 +329,21 @@ def add( cluster_voxel_idx, cluster_consecutive_idx, _ = voxelize( all_points, voxel_size=self.voxel_size, start=self._mins, end=self._maxs ) - self._points, self._features, self._weights, self._rgb, self._obs_counts, self._entity_ids = reduce_pointcloud( + ( + self._points, + self._features, + self._weights, + self._rgb, + self._obs_counts, + self._entity_ids, + ) = reduce_pointcloud( cluster_consecutive_idx, pos=all_points, features=all_features, weights=all_weights, rgbs=all_rgb, - obs_counts = all_obs_counts, - entity_ids = all_entity_ids, + obs_counts=all_obs_counts, + entity_ids=all_entity_ids, feature_reduce=self.feature_pool_method, ) self._obs_counts, self._entity_ids = self._obs_counts.int(), self._entity_ids.int() @@ -449,9 +479,7 @@ def voxelize( cluster_consecutive_idx (LongTensor): Packed idx -- contiguous in cluster ID. E.g. 
[0, 0, 2, 1, 1, 2] batch_sample: See https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/pool/max_pool.html """ - voxel_cluster = voxel_grid( - pos=pos, batch=batch, size=voxel_size, start=start, end=end - ) + voxel_cluster = voxel_grid(pos=pos, batch=batch, size=voxel_size, start=start, end=end) cluster_consecutive_idx, perm = consecutive_cluster(voxel_cluster) batch_sample = batch[perm] if batch is not None else None cluster_idx = voxel_cluster @@ -478,9 +506,7 @@ def scatter_weighted_mean( Tensor: Agggregated features, weighted by weights and normalized by weights_cluster """ assert dim == 0, "Dim != 0 not yet implemented" - feature_cluster = scatter( - features * weights[:, None], cluster, dim=dim, reduce="sum" - ) + feature_cluster = scatter(features * weights[:, None], cluster, dim=dim, reduce="sum") feature_cluster = feature_cluster / weights_cluster[:, None] return feature_cluster @@ -518,29 +544,32 @@ def reduce_pointcloud( weights = torch.ones_like(pos[..., 0]) weights_cluster = scatter(weights, voxel_cluster, dim=0, reduce="sum") - pos_cluster = scatter_weighted_mean( - pos, weights, voxel_cluster, weights_cluster, dim=0 - ) + pos_cluster = scatter_weighted_mean(pos, weights, voxel_cluster, weights_cluster, dim=0) if rgbs is not None: - rgb_cluster = scatter_weighted_mean( - rgbs, weights, voxel_cluster, weights_cluster, dim=0 - ) + rgb_cluster = scatter_weighted_mean(rgbs, weights, voxel_cluster, weights_cluster, dim=0) else: rgb_cluster = None - + if obs_counts is not None: - obs_count_cluster = scatter(obs_counts, voxel_cluster, dim = 0, reduce = "max") + obs_count_cluster = scatter(obs_counts, voxel_cluster, dim=0, reduce="max") else: obs_count_cluster = None if entity_ids is not None: - entity_ids_cluster = scatter(entity_ids, voxel_cluster, dim = 0, reduce = "max") + entity_ids_cluster = scatter(entity_ids, voxel_cluster, dim=0, reduce="max") else: entity_ids_cluster = None if features is None: - return pos_cluster, None, weights_cluster, rgb_cluster, obs_count_cluster, entity_ids_cluster + return ( + pos_cluster, + None, + weights_cluster, + rgb_cluster, + obs_count_cluster, + entity_ids_cluster, + ) if feature_reduce == "mean": feature_cluster = scatter_weighted_mean( @@ -549,17 +578,25 @@ def reduce_pointcloud( elif feature_reduce == "max": feature_cluster = scatter(features, voxel_cluster, dim=0, reduce="max") elif feature_reduce == "sum": - feature_cluster = scatter( - features * weights[:, None], voxel_cluster, dim=0, reduce="sum" - ) + feature_cluster = scatter(features * weights[:, None], voxel_cluster, dim=0, reduce="sum") else: raise NotImplementedError(f"Unknown feature reduction method {feature_reduce}") - return pos_cluster, feature_cluster, weights_cluster, rgb_cluster, obs_count_cluster, entity_ids_cluster + return ( + pos_cluster, + feature_cluster, + weights_cluster, + rgb_cluster, + obs_count_cluster, + entity_ids_cluster, + ) def scatter3d( - voxel_indices: Tensor, weights: Tensor, grid_dimensions: List[int], method: str = "mean", + voxel_indices: Tensor, + weights: Tensor, + grid_dimensions: List[int], + method: str = "mean", ) -> Tensor: """Scatter weights into a 3d voxel grid of the appropriate size. 
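`scatter3d` (signature reformatted above) densifies sparse voxel data: given integer voxel indices and per-point weights, it fills a dense (x, y, z) grid that callers such as `get_2d_alignment_heuristics` then collapse along the height axis with a max. A minimal sum-reduction sketch of the same idea (hypothetical helper, independent of the implementation in this file):

```
from typing import List

import torch


def scatter3d_sum(
    voxel_indices: torch.Tensor, weights: torch.Tensor, grid_dimensions: List[int]
) -> torch.Tensor:
    """Accumulate per-point weights into a dense (X, Y, Z) grid by summation."""
    grid = torch.zeros(grid_dimensions, dtype=weights.dtype)
    x, y, z = voxel_indices.unbind(dim=-1)
    # accumulate=True sums values that land on the same cell instead of overwriting.
    grid.index_put_((x, y, z), weights, accumulate=True)
    return grid


idx = torch.tensor([[0, 0, 0], [0, 0, 0], [1, 2, 3]])
grid = scatter3d_sum(idx, torch.ones(3), [4, 4, 4])
print(grid[0, 0, 0].item(), grid[1, 2, 3].item())  # 2.0 1.0
# A 2-D map then falls out of grid.sum(dim=-1) or grid.max(dim=-1).values over height.
```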
@@ -622,4 +659,4 @@ def drop_smallest_weight_points( if len(points_sorted[cutoff_idx:]) < min_points_after_drop: return orig_points # print(f"Reduced {len(orig_points)} -> {len(points)} -> {above_cutoff.sum()}") - return points_sorted[cutoff_idx:] \ No newline at end of file + return points_sorted[cutoff_idx:] diff --git a/src/stretch/dynav/ok_robot_hw/README.md b/src/stretch/dynav/ok_robot_hw/README.md index 3c773b67f..5e79b1f41 100644 --- a/src/stretch/dynav/ok_robot_hw/README.md +++ b/src/stretch/dynav/ok_robot_hw/README.md @@ -1,7 +1,9 @@ # Robot Side Code + Most of the heavy code will be runnning in the workstation and will communicate with the robot through sockets ## Preparation to run robot side codes + Our hardware codes heavily rely on robot controller provided by [home-robot](https://github.com/facebookresearch/home-robot). You need to install the home-robot on stretch robot following intructions provided by [home-robot-hw](https://github.com/facebookresearch/home-robot/blob/main/docs/install_robot.md) before running any robot side codes on robot. @@ -13,26 +15,29 @@ Once you finised installing home-robot, follow [these steps](../docs/robot-insta The success of OK-Robot system also depends on robot calibration and accurate urdf file, so make sure you follow [these calibration steps](../docs/robot-calibration.md) to obtain an accurate urdf file for your robot. ## Start home-robot + ``` stretch_robot_home.py roslaunch home_robot_hw startup_stretch_hector_slam.launch ``` ## Start Robot Control + ``` python run.py -x1 [x1] -y1 [y1] -x2 [x2] -y2 [y2] -ip [your workstation ip] ``` -* **[x1, y1]** - Co-ordinated of tape on which the base of the robot is on -* **[x2, y2]** - Co-ordinates of the secondary tape. -* **ip** - Your workstation ip, the robot will try to communicate with this ip -* **np** - Navigation port number, the robot will listen to this port number to get planned navigation paths (default: 5555) -* **mp** - Manipulation port number, the robot will listen to this port number to get estimated manipulation pose (default: 5556) +- **\[x1, y1\]** - Co-ordinated of tape on which the base of the robot is on +- **\[x2, y2\]** - Co-ordinates of the secondary tape. +- **ip** - Your workstation ip, the robot will try to communicate with this ip +- **np** - Navigation port number, the robot will listen to this port number to get planned navigation paths (default: 5555) +- **mp** - Manipulation port number, the robot will listen to this port number to get estimated manipulation pose (default: 5556) + +After running run.py it will go through 4 states in each cycle. -After running run.py it will go through 4 states in each cycle. -* Picking Navigation -* Manipulation -* Placing Navigation -* Placing +- Picking Navigation +- Manipulation +- Placing Navigation +- Placing -For each navigation stage it asks A [Object Name], B [Near by Object Name] \ No newline at end of file +For each navigation stage it asks A \[Object Name\], B \[Near by Object Name\] diff --git a/src/stretch/dynav/ok_robot_hw/camera.py b/src/stretch/dynav/ok_robot_hw/camera.py index 09def2415..947cf9a57 100644 --- a/src/stretch/dynav/ok_robot_hw/camera.py +++ b/src/stretch/dynav/ok_robot_hw/camera.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. 
Original +# license information maybe found below, if so. + ## License: Apache 2.0. See LICENSE file in root directory. ## Copyright(c) 2017 Intel Corporation. All Rights Reserved. @@ -5,10 +14,11 @@ ## Align Depth to Color ## ##################################################### -import numpy as np import cv2 -import open3d as o3d import matplotlib.pyplot as plt +import numpy as np +import open3d as o3d + class RealSenseCamera: def __init__(self, robot): @@ -16,7 +26,7 @@ def __init__(self, robot): self.depth_scale = 0.001 # Camera intrinsics - intrinsics = self.robot.get_camera_K() + intrinsics = self.robot.get_camera_K() self.fy = intrinsics[0, 0] self.fx = intrinsics[1, 1] self.cy = intrinsics[0, 2] @@ -28,34 +38,37 @@ def __init__(self, robot): def capture_image(self, visualize=False): self.rgb_image, self.depth_image, self.points = self.robot.get_images(compute_xyz=True) - self.rgb_image = np.rot90(self.rgb_image, k = 1)[:, :, [2, 1, 0]] - self.depth_image = np.rot90(self.depth_image, k = 1) - self.points = np.rot90(self.points, k = 1) + self.rgb_image = np.rot90(self.rgb_image, k=1)[:, :, [2, 1, 0]] + self.depth_image = np.rot90(self.depth_image, k=1) + self.points = np.rot90(self.points, k=1) cv2.imwrite("./images/input.jpg", np.rot90(self.rgb_image, k=-1)) # cv2.imwrite("depth.jpg", np.rot90(self.depth_image, k=-1)/np.max(self.depth_image)) self.rgb_image = cv2.cvtColor(self.rgb_image, cv2.COLOR_BGR2RGB) - + if visualize: - fig, ax = plt.subplots(1, 2, figsize=(10,5)) - timer = fig.canvas.new_timer(interval = 5000) #creating a timer object and setting an interval of 3000 milliseconds - timer.add_callback(lambda : plt.close()) + fig, ax = plt.subplots(1, 2, figsize=(10, 5)) + timer = fig.canvas.new_timer( + interval=5000 + ) # creating a timer object and setting an interval of 3000 milliseconds + timer.add_callback(lambda: plt.close()) ax[0].imshow(np.rot90(self.rgb_image, k=-1)) ax[0].set_title("Color Image") ax[1].imshow(np.rot90(self.depth_image, k=-1)) ax[1].set_title("Depth Image") - + plt.savefig("./images/rgb_dpt.png") plt.pause(3) plt.close() - + return self.rgb_image, self.depth_image, self.points def pixel2d_to_point3d(self, ix, iy): return self.points[iy, ix][[1, 0, 2]] + if __name__ == "__main__": camera = RealSenseCamera() camera.capture_image() diff --git a/src/stretch/dynav/ok_robot_hw/global_parameters.py b/src/stretch/dynav/ok_robot_hw/global_parameters.py index 98faca81e..574e04959 100644 --- a/src/stretch/dynav/ok_robot_hw/global_parameters.py +++ b/src/stretch/dynav/ok_robot_hw/global_parameters.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
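For context on the intrinsics the `RealSenseCamera` above pulls out (`fx`, `fy`, `cx`, `cy`): the robot already returns a precomputed xyz image, so `pixel2d_to_point3d` is just a lookup, but the underlying mapping from a depth pixel to a camera-frame point is the standard pinhole model. A small illustrative helper, not part of the patch:

```
import numpy as np


def depth_pixel_to_point(
    ix: float, iy: float, depth_m: float, fx: float, fy: float, cx: float, cy: float
) -> np.ndarray:
    """Back-project one pixel (ix, iy) with metric depth into camera coordinates."""
    x = (ix - cx) * depth_m / fx
    y = (iy - cy) * depth_m / fy
    return np.array([x, y, depth_m])


# The optical center maps straight down the optical axis.
print(depth_pixel_to_point(320, 240, 1.0, fx=600.0, fy=600.0, cx=320.0, cy=240.0))  # [0. 0. 1.]
```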
+ POS_TOL = 0.1 YAW_TOL = 0.2 @@ -5,8 +14,8 @@ GRIPPER_FINGERTIP_LEFT_NODE = "link_gripper_fingertip_left" GRIPPER_FINGERTIP_RIGHT_NODE = "link_gripper_fingertip_right" -INIT_LIFT_POS = 0.45 -INIT_WRIST_PITCH = -1.57 +INIT_LIFT_POS = 0.45 +INIT_WRIST_PITCH = -1.57 INIT_ARM_POS = 0 INIT_WRIST_ROLL = 0 INIT_WRIST_YAW = 0 diff --git a/src/stretch/dynav/ok_robot_hw/image_publisher.py b/src/stretch/dynav/ok_robot_hw/image_publisher.py index ab9064a30..6afc54a47 100644 --- a/src/stretch/dynav/ok_robot_hw/image_publisher.py +++ b/src/stretch/dynav/ok_robot_hw/image_publisher.py @@ -1,11 +1,27 @@ -import zmq +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + import numpy as np +import zmq from PIL import Image as PILImage -from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array, send_depth_img, recv_depth_img, send_rgb_img, recv_rgb_img +from stretch.dynav.ok_robot_hw.utils.communication_utils import ( + recv_array, + recv_depth_img, + recv_rgb_img, + send_array, + send_depth_img, + send_rgb_img, +) -class ImagePublisher(): +class ImagePublisher: def __init__(self, camera, socket): self.camera = camera self.socket = socket @@ -24,12 +40,23 @@ def publish_image(self, text, mode, head_tilt=-1): send_rgb_img(self.socket, rotated_image) print(self.socket.recv_string()) send_depth_img(self.socket, rotated_depth) - print(self.socket.recv_string()) + print(self.socket.recv_string()) # send_array(self.socket, rotated_image) # print(self.socket.recv_string()) # send_array(self.socket, rotated_depth) - # print(self.socket.recv_string()) - send_array(self.socket, np.array([self.camera.fy, self.camera.fx, self.camera.cy, self.camera.cx, int(head_tilt*100)])) + # print(self.socket.recv_string()) + send_array( + self.socket, + np.array( + [ + self.camera.fy, + self.camera.fx, + self.camera.cy, + self.camera.cx, + int(head_tilt * 100), + ] + ), + ) print(self.socket.recv_string()) ## Sending Object text and Manipulation mode @@ -55,5 +82,5 @@ def publish_image(self, text, mode, head_tilt=-1): print(translation) print("rotation: ") print(rotation) - print(self.socket.recv_string()) + print(self.socket.recv_string()) return translation, rotation, depth, width, retry diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index d3aa1e1a8..87e5edc3b 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -1,56 +1,75 @@ -import numpy as np -import sys +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
+ +import math import os +import random +import sys +import time + +import numpy as np +import pinocchio as pin # from urdf_parser_py.urdf import URDF from scipy.spatial.transform import Rotation as R -import math -import time -import random -import os -from stretch.dynav.ok_robot_hw.utils import transform_joint_array from stretch.dynav.ok_robot_hw.global_parameters import * - -import pinocchio as pin +from stretch.dynav.ok_robot_hw.utils import transform_joint_array from stretch.motion.kinematics import HelloStretchIdx OVERRIDE_STATES = {} + class HelloRobot: def __init__( self, robot, - stretch_client_urdf_file = 'src/stretch/config/urdf', - gripper_threshold = 7.0, - stretch_gripper_max = 0.64, - stretch_gripper_min = 0, - end_link = 'link_gripper_s3_body' + stretch_client_urdf_file="src/stretch/config/urdf", + gripper_threshold=7.0, + stretch_gripper_max=0.64, + stretch_gripper_min=0, + end_link="link_gripper_s3_body", ): self.STRETCH_GRIPPER_MAX = stretch_gripper_max self.STRETCH_GRIPPER_MIN = stretch_gripper_min - self.urdf_path = os.path.join(stretch_client_urdf_file, 'stretch.urdf') - self.joints_pin = {'joint_fake':0} - + self.urdf_path = os.path.join(stretch_client_urdf_file, "stretch.urdf") + self.joints_pin = {"joint_fake": 0} + self.GRIPPER_THRESHOLD = gripper_threshold print("hello robot starting") self.head_joint_list = ["joint_fake", "joint_head_pan", "joint_head_tilt"] - self.init_joint_list = ["joint_fake","joint_lift","3","2","1" ,"0","joint_wrist_yaw","joint_wrist_pitch","joint_wrist_roll", "joint_gripper_finger_left"] - - # end_link is the frame of reference node - self.end_link = end_link + self.init_joint_list = [ + "joint_fake", + "joint_lift", + "3", + "2", + "1", + "0", + "joint_wrist_yaw", + "joint_wrist_pitch", + "joint_wrist_roll", + "joint_gripper_finger_left", + ] + + # end_link is the frame of reference node + self.end_link = end_link self.set_end_link(end_link) - + # Initialize StretchClient controller self.robot = robot self.robot.switch_to_manipulation_mode() # time.sleep(2) # Constraining the robots movement - self.clamp = lambda n, minn, maxn: max(min(maxn, n), minn) + self.clamp = lambda n, minn, maxn: max(min(maxn, n), minn) self.pan, self.tilt = self.robot.get_pan_tilt() - def set_end_link(self, link): if link == GRIPPER_FINGERTIP_LEFT_NODE or GRIPPER_FINGERTIP_RIGHT_NODE: @@ -58,44 +77,48 @@ def set_end_link(self, link): else: self.joint_list = self.init_joint_list[:-1] - def get_joints(self): """ - Returns all the joint names and values involved in forward kinematics of head and gripper + Returns all the joint names and values involved in forward kinematics of head and gripper """ ## Names of all 13 joints - joint_names = self.init_joint_list + ["joint_gripper_finger_right"] + self.head_joint_list[1:] + joint_names = ( + self.init_joint_list + ["joint_gripper_finger_right"] + self.head_joint_list[1:] + ) self.updateJoints() joint_values = list(self.joints.values()) + [0] + list(self.head_joints.values())[1:] return joint_names, joint_values def move_to_position( - self, - lift_pos = None, - arm_pos = None, - base_trans = 0.0, - wrist_yaw = None, - wrist_pitch = None, - wrist_roll = None, - gripper_pos = None, - base_theta = None, - head_tilt = None, - head_pan = None, - blocking = True, + self, + lift_pos=None, + arm_pos=None, + base_trans=0.0, + wrist_yaw=None, + wrist_pitch=None, + wrist_roll=None, + gripper_pos=None, + base_theta=None, + head_tilt=None, + head_pan=None, + blocking=True, ): """ - Moves the robots, base, arm, lift, wrist and head 
to a desired position. + Moves the robots, base, arm, lift, wrist and head to a desired position. """ if base_theta is not None: - self.robot.navigate_to([0, 0, base_theta], relative = True, blocking = True) + self.robot.navigate_to([0, 0, base_theta], relative=True, blocking=True) return - + # Base, arm and lift state update target_state = self.robot.get_six_joints() if not gripper_pos is None: - self.CURRENT_STATE = gripper_pos*(self.STRETCH_GRIPPER_MAX-self.STRETCH_GRIPPER_MIN)+self.STRETCH_GRIPPER_MIN - self.robot.gripper_to(self.CURRENT_STATE, blocking = blocking) + self.CURRENT_STATE = ( + gripper_pos * (self.STRETCH_GRIPPER_MAX - self.STRETCH_GRIPPER_MIN) + + self.STRETCH_GRIPPER_MIN + ) + self.robot.gripper_to(self.CURRENT_STATE, blocking=blocking) if not arm_pos is None: target_state[2] = arm_pos if not lift_pos is None: @@ -110,12 +133,12 @@ def move_to_position( if not wrist_pitch is None: target_state[4] = min(wrist_pitch, 0.1) if not wrist_roll is None: - target_state[5] = wrist_roll - + target_state[5] = wrist_roll + # Actual Movement # print('Target Position', target_state) # print('pan tilt before', self.robot.get_pan_tilt()) - self.robot.arm_to(target_state, blocking = blocking, head = np.array([self.pan, self.tilt])) + self.robot.arm_to(target_state, blocking=blocking, head=np.array([self.pan, self.tilt])) # print('pan tilt after', self.robot.get_pan_tilt()) # print('Actual location', self.robot.get_six_joints()) @@ -129,77 +152,77 @@ def move_to_position( if not head_pan is None: target_head_pan = head_pan self.pan = head_pan - self.robot.head_to(head_tilt = target_head_tilt, head_pan = target_head_pan, blocking = blocking) + self.robot.head_to(head_tilt=target_head_tilt, head_pan=target_head_pan, blocking=blocking) # self.pan, self.tilt = self.robot.get_pan_tilt() - #time.sleep(0.7) + # time.sleep(0.7) def pickup(self, width): """ - Code for grasping the object - Gripper closes gradually until it encounters resistence + Code for grasping the object + Gripper closes gradually until it encounters resistence """ next_gripper_pos = width while True: - self.robot.gripper_to(max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.2), blocking = True) + self.robot.gripper_to( + max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.2), blocking=True + ) curr_gripper_pose = self.robot.get_gripper_position() # print('Robot means to move gripper to', next_gripper_pos * self.STRETCH_GRIPPER_MAX) # print('Robot actually moves gripper to', curr_gripper_pose) if next_gripper_pos == -1: break - + if next_gripper_pos > 0: next_gripper_pos -= 0.35 - else: + else: next_gripper_pos = -1 def updateJoints(self): """ - update all the current poisitions of joints + update all the current poisitions of joints """ state = self.robot.get_six_joints() origin_dist = state[0] # Head Joints pan, tilt = self.robot.get_pan_tilt() - - self.joints_pin['joint_fake'] = origin_dist - self.joints_pin['joint_lift'] = state[1] + + self.joints_pin["joint_fake"] = origin_dist + self.joints_pin["joint_lift"] = state[1] armPos = state[2] - self.joints_pin['joint_arm_l3'] = armPos / 4.0 - self.joints_pin['joint_arm_l2'] = armPos / 4.0 - self.joints_pin['joint_arm_l1'] = armPos / 4.0 - self.joints_pin['joint_arm_l0'] = armPos / 4.0 - self.joints_pin['joint_wrist_yaw'] = state[3] - self.joints_pin['joint_wrist_roll'] = state[5] - self.joints_pin['joint_wrist_pitch'] = OVERRIDE_STATES.get('wrist_pitch', state[4]) - self.joints_pin['joint_gripper_finger_left'] = 0 - - self.joints_pin['joint_head_pan'] = pan - 
self.joints_pin['joint_head_tilt'] = tilt - - # following function is used to move the robot to a desired joint configuration + self.joints_pin["joint_arm_l3"] = armPos / 4.0 + self.joints_pin["joint_arm_l2"] = armPos / 4.0 + self.joints_pin["joint_arm_l1"] = armPos / 4.0 + self.joints_pin["joint_arm_l0"] = armPos / 4.0 + self.joints_pin["joint_wrist_yaw"] = state[3] + self.joints_pin["joint_wrist_roll"] = state[5] + self.joints_pin["joint_wrist_pitch"] = OVERRIDE_STATES.get("wrist_pitch", state[4]) + self.joints_pin["joint_gripper_finger_left"] = 0 + + self.joints_pin["joint_head_pan"] = pan + self.joints_pin["joint_head_tilt"] = tilt + + # following function is used to move the robot to a desired joint configuration def move_to_joints(self, joints, gripper, mode=0): """ - Given the desrired joints movement this fucntion will the joints accordingly + Given the desrired joints movement this fucntion will the joints accordingly """ state = self.robot.get_six_joints() # clamp rotational joints between -1.57 to 1.57 - joints['joint_wrist_pitch'] = (joints['joint_wrist_pitch'] + 1.57) % 3.14 - 1.57 - joints['joint_wrist_yaw'] = (joints['joint_wrist_yaw'] + 1.57) % 3.14 - 1.57 - joints['joint_wrist_roll'] = (joints['joint_wrist_roll'] + 1.57) % 3.14 - 1.57 - joints['joint_wrist_pitch'] = self.clamp(joints['joint_wrist_pitch'], -1.57, 0.1) + joints["joint_wrist_pitch"] = (joints["joint_wrist_pitch"] + 1.57) % 3.14 - 1.57 + joints["joint_wrist_yaw"] = (joints["joint_wrist_yaw"] + 1.57) % 3.14 - 1.57 + joints["joint_wrist_roll"] = (joints["joint_wrist_roll"] + 1.57) % 3.14 - 1.57 + joints["joint_wrist_pitch"] = self.clamp(joints["joint_wrist_pitch"], -1.57, 0.1) target_state = [ - joints['joint_fake'], - joints['joint_lift'], - joints['3'] + - joints['2'] + - joints['1'] + - joints['0'], - joints['joint_wrist_yaw'], - joints['joint_wrist_pitch'], - joints['joint_wrist_roll']] - + joints["joint_fake"], + joints["joint_lift"], + joints["3"] + joints["2"] + joints["1"] + joints["0"], + joints["joint_wrist_yaw"], + joints["joint_wrist_pitch"], + joints["joint_wrist_roll"], + ] + # print('pan tilt before', self.robot.get_pan_tilt()) # Moving only the lift first @@ -207,52 +230,54 @@ def move_to_joints(self, joints, gripper, mode=0): target1 = state target1[0] = target_state[0] target1[1] = min(1.1, target_state[1] + 0.2) - self.robot.arm_to(target1, blocking = True, head = np.array([self.pan, self.tilt])) + self.robot.arm_to(target1, blocking=True, head=np.array([self.pan, self.tilt])) # print('pan tilt after', self.robot.get_pan_tilt()) # print('pan tilt before', self.robot.get_pan_tilt()) - self.robot.arm_to(target_state, blocking = True, head = np.array([self.pan, self.tilt])) - self.robot.head_to(head_tilt = self.tilt, head_pan = self.pan, blocking = True) + self.robot.arm_to(target_state, blocking=True, head=np.array([self.pan, self.tilt])) + self.robot.head_to(head_tilt=self.tilt, head_pan=self.pan, blocking=True) # print('pan tilt after', self.robot.get_pan_tilt()) # print(f"current state {self.robot.get_six_joints()}") # print(f"target state {target_state}") # time.sleep(1) - #NOTE: below code is to fix the pitch drift issue in current hello-robot. Remove it if there is no pitch drift issue - OVERRIDE_STATES['wrist_pitch'] = joints['joint_wrist_pitch'] - + # NOTE: below code is to fix the pitch drift issue in current hello-robot. 
Remove it if there is no pitch drift issue + OVERRIDE_STATES["wrist_pitch"] = joints["joint_wrist_pitch"] + def get_joint_transform(self, node1, node2): - ''' - This function takes two nodes from a robot URDF file as input and - outputs the coordinate frame of node2 relative to the coordinate frame of node1. + """ + This function takes two nodes from a robot URDF file as input and + outputs the coordinate frame of node2 relative to the coordinate frame of node1. - Mainly used for transforming co-ordinates from camera frame to gripper frame. - ''' + Mainly used for transforming co-ordinates from camera frame to gripper frame. + """ # return frame_transform, frame2, frame1 self.updateJoints() frame_pin = self.robot.get_frame_pose(self.joints_pin, node1, node2) return frame_pin - + def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode=0): """ - Function to move the gripper to a desired translation and rotation + Function to move the gripper to a desired translation and rotation """ translation = [translation_tensor[0], translation_tensor[1], translation_tensor[2]] rotation = rotational_tensor - + self.updateJoints() q = self.robot.get_joint_positions() - q[HelloStretchIdx.WRIST_PITCH] = OVERRIDE_STATES.get('wrist_pitch', q[HelloStretchIdx.WRIST_PITCH]) - pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link, q = q) + q[HelloStretchIdx.WRIST_PITCH] = OVERRIDE_STATES.get( + "wrist_pitch", q[HelloStretchIdx.WRIST_PITCH] + ) + pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link, q=q) pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] pin_curr_pose = pin.SE3(pin_rotation, pin_translation) - rot_matrix = R.from_euler('xyz', rotation, degrees=False).as_matrix() + rot_matrix = R.from_euler("xyz", rotation, degrees=False).as_matrix() pin_del_pose = pin.SE3(np.array(rot_matrix), np.array(translation)) pin_goal_pose_new = pin_curr_pose * pin_del_pose @@ -262,7 +287,7 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode # print(f"final pos and quat {final_pos}\n {final_quat}") full_body_cfg = self.robot.solve_ik( - final_pos, final_quat, False, None, False, node_name = self.end_link + final_pos, final_quat, False, None, False, node_name=self.end_link ) if full_body_cfg is None: print("Warning: Cannot find an IK solution for desired EE pose!") diff --git a/src/stretch/dynav/ok_robot_hw/utils/__init__.py b/src/stretch/dynav/ok_robot_hw/utils/__init__.py index c7a2ffae2..5baa346ae 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/__init__.py +++ b/src/stretch/dynav/ok_robot_hw/utils/__init__.py @@ -1,3 +1,12 @@ -from .grasper_utils import * +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + from .communication_utils import * -from .utils import * \ No newline at end of file +from .grasper_utils import * +from .utils import * diff --git a/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py b/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py index ad2396b70..6e9775237 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py @@ -1,47 +1,65 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. 
+# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import cv2 import numpy as np import zmq -import cv2 + # use zmq to send a numpy array def send_array(socket, A, flags=0, copy=True, track=False): """send a numpy array with metadata""" A = np.array(A) md = dict( - dtype = str(A.dtype), - shape = A.shape, + dtype=str(A.dtype), + shape=A.shape, ) - socket.send_json(md, flags|zmq.SNDMORE) + socket.send_json(md, flags | zmq.SNDMORE) return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + # use zmq to receive a numpy array def recv_array(socket, flags=0, copy=True, track=False): """recv a numpy array""" md = socket.recv_json(flags=flags) msg = socket.recv(flags=flags, copy=copy, track=track) - A = np.frombuffer(msg, dtype=md['dtype']) - return A.reshape(md['shape']) + A = np.frombuffer(msg, dtype=md["dtype"]) + return A.reshape(md["shape"]) + def send_rgb_img(socket, img): - img = img.astype(np.uint8) + img = img.astype(np.uint8) encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] - _, img_encoded = cv2.imencode('.jpg', img, encode_param) + _, img_encoded = cv2.imencode(".jpg", img, encode_param) socket.send(img_encoded.tobytes()) + def recv_rgb_img(socket): img = socket.recv() img = np.frombuffer(img, dtype=np.uint8) img = cv2.imdecode(img, cv2.IMREAD_COLOR) return img + def send_depth_img(socket, depth_img): depth_img = (depth_img * 1000).astype(np.uint16) - encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 3] # Compression level from 0 (no compression) to 9 (max compression) - _, depth_img_encoded = cv2.imencode('.png', depth_img, encode_param) + encode_param = [ + int(cv2.IMWRITE_PNG_COMPRESSION), + 3, + ] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode(".png", depth_img, encode_param) socket.send(depth_img_encoded.tobytes()) + def recv_depth_img(socket): depth_img = socket.recv() depth_img = np.frombuffer(depth_img, dtype=np.uint8) depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) - depth_img = (depth_img / 1000.) - return depth_img \ No newline at end of file + depth_img = depth_img / 1000.0 + return depth_img diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index f27de8c02..3f1ceae4c 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -1,15 +1,26 @@ -import time +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
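# --- Editor's note: illustrative sketch, not part of the patch above. ---
# Minimal round trip showing how the send_array/recv_array helpers from
# communication_utils.py (previous hunk) are intended to be used over a zmq
# REQ/REP pair. The endpoint "tcp://127.0.0.1:5555" is an assumption for this
# example only; the real code connects to the workstation ports configured in
# ImageSender further down in this patch.
import numpy as np
import zmq

from stretch.dynav.ok_robot_hw.utils.communication_utils import recv_array, send_array

context = zmq.Context()
server = context.socket(zmq.REP)
server.bind("tcp://127.0.0.1:5555")
client = context.socket(zmq.REQ)
client.connect("tcp://127.0.0.1:5555")

A = np.arange(12, dtype=np.float32).reshape(3, 4)
send_array(client, A)  # sends dtype/shape metadata as JSON, then the raw buffer
B = recv_array(server)  # rebuilds the array from metadata + buffer
assert B.shape == (3, 4) and B.dtype == np.float32
server.send_string("ok")  # REQ/REP requires a reply before the next request
client.recv_string()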
+ import math +import time + import numpy as np import pinocchio as pin -from stretch.dynav.ok_robot_hw.image_publisher import ImagePublisher from stretch.dynav.ok_robot_hw.global_parameters import * +from stretch.dynav.ok_robot_hw.image_publisher import ImagePublisher from stretch.dynav.ok_robot_hw.utils.utils import apply_se3_transform + def capture_and_process_image(camera, mode, obj, socket, hello_robot): - - print('Currently in ' + mode + ' mode and the robot is about to manipulate ' + obj + '.') + + print("Currently in " + mode + " mode and the robot is about to manipulate " + obj + ".") image_publisher = ImagePublisher(camera, socket) @@ -20,31 +31,33 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): head_tilt = INIT_HEAD_TILT head_pan = INIT_HEAD_PAN - while(retry_flag): - translation, rotation, depth, width, retry_flag = image_publisher.publish_image(obj, mode, head_tilt=head_tilt) + while retry_flag: + translation, rotation, depth, width, retry_flag = image_publisher.publish_image( + obj, mode, head_tilt=head_tilt + ) print(f"retry flag : {retry_flag}") - if (retry_flag == 1): + if retry_flag == 1: base_trans = translation[0] - head_tilt += (rotation[0]) + head_tilt += rotation[0] - hello_robot.move_to_position(base_trans=base_trans, - head_pan=head_pan, - head_tilt=head_tilt) + hello_robot.move_to_position( + base_trans=base_trans, head_pan=head_pan, head_tilt=head_tilt + ) time.sleep(1) - - elif (retry_flag !=0 and side_retries == 3): + + elif retry_flag != 0 and side_retries == 3: print("Tried in all angles but couldn't succed") time.sleep(1) return None, None, None, None - elif (side_retries == 2 and tilt_retries == 3): + elif side_retries == 2 and tilt_retries == 3: hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) side_retries = 3 elif retry_flag == 2: - if (tilt_retries == 3): - if (side_retries == 0): + if tilt_retries == 3: + if side_retries == 0: hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) side_retries = 1 else: @@ -53,8 +66,9 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): tilt_retries = 1 else: print(f"retrying with head tilt : {head_tilt + head_tilt_angles[tilt_retries]}") - hello_robot.move_to_position(head_pan=head_pan, - head_tilt=head_tilt + head_tilt_angles[tilt_retries]) + hello_robot.move_to_position( + head_pan=head_pan, head_tilt=head_tilt + head_tilt_angles[tilt_retries] + ) tilt_retries += 1 time.sleep(1) @@ -69,11 +83,9 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): def move_to_point(robot, point, base_node, gripper_node, move_mode=1, pitch_rotation=0): """ - Function for moving the gripper to a specific point + Function for moving the gripper to a specific point """ - rotation = np.array([[1, 0, 0], - [0, 1, 0], - [0, 0, 1]]) + rotation = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) dest_frame = pin.SE3(rotation, point) transform = robot.get_joint_transform(base_node, gripper_node) @@ -84,38 +96,60 @@ def move_to_point(robot, point, base_node, gripper_node, move_mode=1, pitch_rota transformed_frame.translation[2] -= 0.2 robot.move_to_pose( - [transformed_frame.translation[0], transformed_frame.translation[1], transformed_frame.translation[2]], - [pitch_rotation, 0, 0], - [1], - move_mode=move_mode - ) + [ + transformed_frame.translation[0], + transformed_frame.translation[1], + transformed_frame.translation[2], + ], + [pitch_rotation, 0, 0], + [1], + move_mode=move_mode, + ) -def pickup(robot, rotation, translation, base_node, gripper_node, 
gripper_height = 0.03, gripper_depth=0.03, gripper_width = 1): + +def pickup( + robot, + rotation, + translation, + base_node, + gripper_node, + gripper_height=0.03, + gripper_depth=0.03, + gripper_width=1, +): """ - rotation: Relative rotation of gripper pose w.r.t camera - translation: Relative translation of gripper pose w.r.t camera - base_node: Camera Node + rotation: Relative rotation of gripper pose w.r.t camera + translation: Relative translation of gripper pose w.r.t camera + base_node: Camera Node - Supports home robot top down grasping as well + Supports home robot top down grasping as well - Graping trajectory steps - 1. Rotation of gripper - 2. Lift the gripper - 3. Move the base such gripper in line with the grasp - 4. Gradually Move the gripper to the desired position + Graping trajectory steps + 1. Rotation of gripper + 2. Lift the gripper + 3. Move the base such gripper in line with the grasp + 4. Gradually Move the gripper to the desired position """ # Transforming the final point from Model camera frame to robot camera frame pin_point = np.array([-translation[1], -translation[0], translation[2]]) # Rotation from Camera frame to Model frame - rotation1_bottom_mat = np.array([[0.0000000, -1.0000000, 0.0000000], - [-1.0000000, 0.0000000, 0.0000000], - [0.0000000, 0.0000000, 1.0000000]]) + rotation1_bottom_mat = np.array( + [ + [0.0000000, -1.0000000, 0.0000000], + [-1.0000000, 0.0000000, 0.0000000], + [0.0000000, 0.0000000, 1.0000000], + ] + ) # Rotation from model frame to pose frame - rotation1_mat = np.array([[rotation[0][0], rotation[0][1], rotation[0][2]], - [rotation[1][0], rotation[1][1], rotation[1][2]], - [rotation[2][0], rotation[2][1], rotation[2][2]]]) + rotation1_mat = np.array( + [ + [rotation[0][0], rotation[0][1], rotation[0][2]], + [rotation[1][0], rotation[1][1], rotation[1][2]], + [rotation[2][0], rotation[2][1], rotation[2][2]], + ] + ) # Rotation from camera frame to pose frame pin_rotation = np.dot(rotation1_bottom_mat, rotation1_mat) @@ -133,29 +167,26 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # print(f"pin_transformed frame {pin_transformed_frame}") # Lifting the arm to high position as part of pregrasping position - print('pan, tilt before', robot.robot.get_pan_tilt()) - robot.move_to_position(gripper_pos = gripper_width) - robot.move_to_position(lift_pos = 1.05, head_pan = None, head_tilt = None) - print('pan, tilt after', robot.robot.get_pan_tilt()) + print("pan, tilt before", robot.robot.get_pan_tilt()) + robot.move_to_position(gripper_pos=gripper_width) + robot.move_to_position(lift_pos=1.05, head_pan=None, head_tilt=None) + print("pan, tilt after", robot.robot.get_pan_tilt()) # Rotation for aligning Robot gripper frame to Model gripper frame - rotation2_top_mat = np.array([[0, 0, 1], - [1, 0, 0], - [0, -1, 0]]) - + rotation2_top_mat = np.array([[0, 0, 1], [1, 0, 0], [0, -1, 0]]) # final Rotation of gripper to hold the objet pin_final_rotation = np.dot(pin_transformed_frame.rotation, rotation2_top_mat) print(f"pin final rotation {pin_final_rotation}") rpy_angles = pin.rpy.matrixToRpy(pin_final_rotation) - print('pan, tilt before', robot.robot.get_pan_tilt()) + print("pan, tilt before", robot.robot.get_pan_tilt()) robot.move_to_pose( - [0, 0, 0], - [rpy_angles[0], rpy_angles[1], rpy_angles[2]], - [1], - ) - print('pan, tilt after', robot.robot.get_pan_tilt()) + [0, 0, 0], + [rpy_angles[0], rpy_angles[1], rpy_angles[2]], + [1], + ) + print("pan, tilt after", robot.robot.get_pan_tilt()) # Final grasping 
point relative to camera pin_cam2gripper_transform = robot.get_joint_transform(base_node, gripper_node) @@ -163,27 +194,29 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # print(f"pin transformed point1 {pin_transformed_point1}") # Final grasping point relative to base - pin_cam2base_transform = robot.get_joint_transform(base_node, 'base_link') + pin_cam2base_transform = robot.get_joint_transform(base_node, "base_link") pin_base_point = apply_se3_transform(pin_cam2base_transform, pin_point) # print(f"pin base point {pin_base_point}") - diff_value = (0.226 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip - pin_transformed_point1[2] -= (diff_value) - ref_diff = (diff_value) + diff_value = ( + 0.226 - gripper_depth - gripper_height + ) # 0.228 is the distance between link_Straight_gripper node and the gripper tip + pin_transformed_point1[2] -= diff_value + ref_diff = diff_value # Moving gripper to a point that is 0.2m away from the pose center in the line of gripper - print('pan, tilt before', robot.robot.get_pan_tilt()) + print("pan, tilt before", robot.robot.get_pan_tilt()) robot.move_to_pose( [pin_transformed_point1[0], pin_transformed_point1[1], pin_transformed_point1[2] - 0.2], [0, 0, 0], [1], - move_mode = 1 + move_mode=1, ) - print('pan, tilt after', robot.robot.get_pan_tilt()) + print("pan, tilt after", robot.robot.get_pan_tilt()) # Z-Axis of link_straight_gripper points in line of gripper # So, the z co-ordiante of point w.r.t gripper gives the distance of point from gripper - pin_base2gripper_transform = robot.get_joint_transform('base_link', gripper_node) + pin_base2gripper_transform = robot.get_joint_transform("base_link", gripper_node) pin_transformed_point2 = apply_se3_transform(pin_base2gripper_transform, pin_base_point) curr_diff = pin_transformed_point2[2] @@ -194,31 +227,23 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height dist = diff - 0.08 state = robot.robot.get_six_joints() state[1] += 0.02 - robot.robot.arm_to(state, blocking = True) - robot.move_to_pose( - [0, 0, dist], - [0, 0, 0], - [1] - ) + robot.robot.arm_to(state, blocking=True) + robot.move_to_pose([0, 0, dist], [0, 0, 0], [1]) diff = diff - dist - + while diff > 0.01: dist = min(0.03, diff) - robot.move_to_pose( - [0, 0, dist], - [0, 0, 0], - [1] - ) + robot.move_to_pose([0, 0, dist], [0, 0, 0], [1]) diff = diff - dist - + # Now the gripper reached the grasping point and starts picking procedure robot.pickup(gripper_width) # Lifts the arm - robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1] + 0.2, 1.1)) + robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1] + 0.2, 1.1)) # Tucks the gripper so that while moving to place it wont collide with any obstacles - robot.move_to_position(arm_pos = 0.01) - robot.move_to_position(wrist_pitch = 0.0) - robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.9), wrist_yaw = 2.5) - robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.55)) + robot.move_to_position(arm_pos=0.01) + robot.move_to_position(wrist_pitch=0.0) + robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1], 0.9), wrist_yaw=2.5) + robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1], 0.55)) diff --git a/src/stretch/dynav/ok_robot_hw/utils/utils.py b/src/stretch/dynav/ok_robot_hw/utils/utils.py index 1c60e52ee..27ace6fb0 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/utils.py +++ 
b/src/stretch/dynav/ok_robot_hw/utils/utils.py @@ -1,5 +1,15 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + import numpy as np + def apply_se3_transform(se3_obj, point): homogeneous_point = np.append(point.flatten(), 1) print(homogeneous_point) @@ -8,14 +18,15 @@ def apply_se3_transform(se3_obj, point): return transformed_point + def transform_joint_array(joint_array): n = len(joint_array) new_joint_array = [] - for i in range(n+3): + for i in range(n + 3): if i < 2: new_joint_array.append(joint_array[i]) elif i < 6: - new_joint_array.append(joint_array[2]/4.0) + new_joint_array.append(joint_array[2] / 4.0) else: - new_joint_array.append(joint_array[i-3]) - return np.array(new_joint_array) \ No newline at end of file + new_joint_array.append(joint_array[i - 3]) + return np.array(new_joint_array) diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 858b1c83e..ec74828c4 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the @@ -6,32 +15,32 @@ import datetime import os import pickle +import threading import time +from multiprocessing import Process from pathlib import Path from typing import Any, Dict, List, Optional, Tuple +import cv2 import numpy as np import torch - -from stretch.core.parameters import Parameters, get_parameters -from stretch.agent import RobotClient - import zmq - from matplotlib import pyplot as plt +from stretch.agent import RobotClient +from stretch.core.parameters import Parameters, get_parameters +from stretch.dynav.ok_robot_hw.camera import RealSenseCamera from stretch.dynav.ok_robot_hw.global_parameters import * from stretch.dynav.ok_robot_hw.robot import HelloRobot as Manipulation_Wrapper -from stretch.dynav.ok_robot_hw.camera import RealSenseCamera -from stretch.dynav.ok_robot_hw.utils.grasper_utils import pickup, move_to_point, capture_and_process_image -from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array +from stretch.dynav.ok_robot_hw.utils.communication_utils import recv_array, send_array +from stretch.dynav.ok_robot_hw.utils.grasper_utils import ( + capture_and_process_image, + move_to_point, + pickup, +) # from stretch.dynav.llm_server import ImageProcessor -import cv2 - -import threading -from multiprocessing import Process class RobotAgentMDP: """Basic demo code. 
Collects everything that we need to make this work.""" @@ -49,9 +58,9 @@ def __init__( re: int = 1, env_num: int = 1, test_num: int = 1, - method: str = 'dynamem' + method: str = "dynamem", ): - print('------------------------YOU ARE NOW RUNNING PEIQI CODES V3-----------------') + print("------------------------YOU ARE NOW RUNNING PEIQI CODES V3-----------------") self.re = re if isinstance(parameters, Dict): self.parameters = Parameters(**parameters) @@ -67,7 +76,9 @@ def __init__( stretch_gripper_max = 0.64 end_link = "link_gripper_s3_body" self.transform_node = end_link - self.manip_wrapper = Manipulation_Wrapper(self.robot, stretch_gripper_max = stretch_gripper_max, end_link = end_link) + self.manip_wrapper = Manipulation_Wrapper( + self.robot, stretch_gripper_max=stretch_gripper_max, end_link=end_link + ) self.robot.move_to_nav_posture() self.normalize_embeddings = True @@ -75,21 +86,27 @@ def __init__( self.rot_err_threshold = 0.4 self.obs_count = 0 self.obs_history = [] - self.guarantee_instance_is_reachable = ( - parameters.guarantee_instance_is_reachable - ) + self.guarantee_instance_is_reachable = parameters.guarantee_instance_is_reachable - self.image_sender = ImageSender(ip = ip, image_port = image_port, text_port = text_port, manip_port = manip_port) - if method == 'dynamem': + self.image_sender = ImageSender( + ip=ip, image_port=image_port, text_port=text_port, manip_port=manip_port + ) + if method == "dynamem": from stretch.dynav.voxel_map_server import ImageProcessor - self.image_processor = ImageProcessor(rerun = True, static = False, log = 'env' + str(env_num) + '_' + str(test_num)) - elif method == 'mllm': + + self.image_processor = ImageProcessor( + rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) + ) + elif method == "mllm": from stretch.dynav.llm_server import ImageProcessor - self.image_processor = ImageProcessor(rerun = True, static = False, log = 'env' + str(env_num) + '_' + str(test_num)) + + self.image_processor = ImageProcessor( + rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) + ) self.look_around_times = [] self.execute_times = [] - + timestamp = f"{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}" # self.head_lock = threading.Lock() @@ -99,18 +116,18 @@ def look_around(self): for pan in [0.4, -0.4, -1.2]: for tilt in [-0.6]: start_time = time.time() - self.robot.head_to(pan, tilt, blocking = True) + self.robot.head_to(pan, tilt, blocking=True) end_time = time.time() - print('moving head takes ', end_time - start_time, 'seconds.') + print("moving head takes ", end_time - start_time, "seconds.") self.update() def rotate_in_place(self): print("*" * 10, "Rotate in place", "*" * 10) xyt = self.robot.get_base_pose() - self.robot.head_to(head_pan = 0, head_tilt = -0.6, blocking = True) + self.robot.head_to(head_pan=0, head_tilt=-0.6, blocking=True) for i in range(8): xyt[2] += 2 * np.pi / 8 - self.robot.navigate_to(xyt, blocking = True) + self.robot.navigate_to(xyt, blocking=True) self.update() def update(self): @@ -138,14 +155,14 @@ def execute_action( start = self.robot.get_base_pose() # print(" Start:", start) - # res = self.image_sender.query_text(text, start) + # res = self.image_sender.query_text(text, start) res = self.image_processor.process_text(text, start) - if len(res) == 0 and text != '' and text is not None: - res = self.image_processor.process_text('', start) + if len(res) == 0 and text != "" and text is not None: + res = self.image_processor.process_text("", start) look_around_finish = time.time() 
look_around_take = look_around_finish - start_time - print('Path planning takes ', look_around_take, ' seconds.') + print("Path planning takes ", look_around_take, " seconds.") # self.look_around_times.append(look_around_take) # print(self.look_around_times) # print(sum(self.look_around_times) / len(self.look_around_times)) @@ -159,12 +176,12 @@ def execute_action( res[:-2], pos_err_threshold=self.pos_err_threshold, rot_err_threshold=self.rot_err_threshold, - blocking = True + blocking=True, ) execution_finish = time.time() execution_take = execution_finish - look_around_finish - print('Executing action takes ', execution_take, ' seconds.') + print("Executing action takes ", execution_take, " seconds.") self.execute_times.append(execution_take) print(self.execute_times) print(sum(self.execute_times) / len(self.execute_times)) @@ -175,12 +192,12 @@ def execute_action( res, pos_err_threshold=self.pos_err_threshold, rot_err_threshold=self.rot_err_threshold, - blocking = False + blocking=False, ) execution_finish = time.time() execution_take = execution_finish - look_around_finish - print('Executing action takes ', execution_take, ' seconds.') + print("Executing action takes ", execution_take, " seconds.") self.execute_times.append(execution_take) print(self.execute_times) print(sum(self.execute_times) / len(self.execute_times)) @@ -190,9 +207,7 @@ def execute_action( print("Failed. Try again!") return None, None - def run_exploration( - self - ): + def run_exploration(self): """Go through exploration. We use the voxel_grid map created by our collector to sample free space, and then use our motion planner (RRT for now) to get there. At the end, we plan back to (0,0,0). Args: @@ -203,119 +218,132 @@ def run_exploration( return False return True - def navigate(self, text, max_step = 10): + def navigate(self, text, max_step=10): finished = False step = 0 end_point = None while not finished and step < max_step: - print('*' * 20, step, '*' * 20) + print("*" * 20, step, "*" * 20) step += 1 - finished, end_point = self.execute_action(text) + finished, end_point = self.execute_action(text) if finished is None: print("Navigation failed! The path might be blocked!") return None return end_point - def place(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): - ''' - An API for running placing. By calling this API, human will ask the robot to place whatever it holds - onto objects specified by text queries A - - hello_robot: a wrapper for home-robot StretchClient controller - - socoket: we use this to communicate with workstation to get estimated gripper pose - - text: queries specifying target object - - transform node: node name for coordinate systems of target gripper pose (usually the coordinate system on the robot gripper) - - base node: node name for coordinate systems of estimated gipper poses given by anygrasp - ''' + def place(self, text, init_tilt=INIT_HEAD_TILT, base_node=TOP_CAMERA_NODE): + """ + An API for running placing. 
By calling this API, human will ask the robot to place whatever it holds + onto objects specified by text queries A + - hello_robot: a wrapper for home-robot StretchClient controller + - socoket: we use this to communicate with workstation to get estimated gripper pose + - text: queries specifying target object + - transform node: node name for coordinate systems of target gripper pose (usually the coordinate system on the robot gripper) + - base node: node name for coordinate systems of estimated gipper poses given by anygrasp + """ self.robot.switch_to_manipulation_mode() self.robot.look_at_ee() - self.manip_wrapper.move_to_position( - head_pan=INIT_HEAD_PAN, - head_tilt=init_tilt) + self.manip_wrapper.move_to_position(head_pan=INIT_HEAD_PAN, head_tilt=init_tilt) camera = RealSenseCamera(self.robot) rotation, translation = capture_and_process_image( - camera = camera, - mode = 'place', - obj = text, - socket = self.image_sender.manip_socket, - hello_robot = self.manip_wrapper) + camera=camera, + mode="place", + obj=text, + socket=self.image_sender.manip_socket, + hello_robot=self.manip_wrapper, + ) if rotation is None: return False # lift arm to the top before the robot extends the arm, prepare the pre-placing gripper pose self.manip_wrapper.move_to_position(lift_pos=1.05) - self.manip_wrapper.move_to_position(wrist_yaw=0, - wrist_pitch=0) + self.manip_wrapper.move_to_position(wrist_yaw=0, wrist_pitch=0) # Placing the object move_to_point(self.manip_wrapper, translation, base_node, self.transform_node, move_mode=0) - self.manip_wrapper.move_to_position(gripper_pos=1, blocking = True) + self.manip_wrapper.move_to_position(gripper_pos=1, blocking=True) # Lift the arm a little bit, and rotate the wrist roll of the robot in case the object attached on the gripper - self.manip_wrapper.move_to_position(lift_pos = min(self.manip_wrapper.robot.get_six_joints()[1] + 0.3, 1.1)) - self.manip_wrapper.move_to_position(wrist_roll = 2.5, blocking = True) - self.manip_wrapper.move_to_position(wrist_roll = -2.5, blocking = True) + self.manip_wrapper.move_to_position( + lift_pos=min(self.manip_wrapper.robot.get_six_joints()[1] + 0.3, 1.1) + ) + self.manip_wrapper.move_to_position(wrist_roll=2.5, blocking=True) + self.manip_wrapper.move_to_position(wrist_roll=-2.5, blocking=True) # Wait for some time and shrink the arm back - self.manip_wrapper.move_to_position(gripper_pos=1, - lift_pos = 1.05, - arm_pos = 0) + self.manip_wrapper.move_to_position(gripper_pos=1, lift_pos=1.05, arm_pos=0) self.manip_wrapper.move_to_position(wrist_pitch=-1.57) # Shift the base back to the original point as we are certain that orginal point is navigable in navigation obstacle map - self.manip_wrapper.move_to_position(base_trans = -self.manip_wrapper.robot.get_six_joints()[0]) + self.manip_wrapper.move_to_position( + base_trans=-self.manip_wrapper.robot.get_six_joints()[0] + ) return True - def manipulate(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): - ''' - An API for running manipulation. 
By calling this API, human will ask the robot to pick up objects - specified by text queries A - - hello_robot: a wrapper for home-robot StretchClient controller - - socoket: we use this to communicate with workstation to get estimated gripper pose - - text: queries specifying target object - - transform node: node name for coordinate systems of target gripper pose (usually the coordinate system on the robot gripper) - - base node: node name for coordinate systems of estimated gipper poses given by anygrasp - ''' + def manipulate(self, text, init_tilt=INIT_HEAD_TILT, base_node=TOP_CAMERA_NODE): + """ + An API for running manipulation. By calling this API, human will ask the robot to pick up objects + specified by text queries A + - hello_robot: a wrapper for home-robot StretchClient controller + - socoket: we use this to communicate with workstation to get estimated gripper pose + - text: queries specifying target object + - transform node: node name for coordinate systems of target gripper pose (usually the coordinate system on the robot gripper) + - base node: node name for coordinate systems of estimated gipper poses given by anygrasp + """ self.robot.switch_to_manipulation_mode() self.robot.look_at_ee() gripper_pos = 1 - self.manip_wrapper.move_to_position(arm_pos=INIT_ARM_POS, - head_pan=INIT_HEAD_PAN, - head_tilt=init_tilt, - gripper_pos = gripper_pos, - lift_pos=INIT_LIFT_POS, - wrist_pitch = INIT_WRIST_PITCH, - wrist_roll = INIT_WRIST_ROLL, - wrist_yaw = INIT_WRIST_YAW) + self.manip_wrapper.move_to_position( + arm_pos=INIT_ARM_POS, + head_pan=INIT_HEAD_PAN, + head_tilt=init_tilt, + gripper_pos=gripper_pos, + lift_pos=INIT_LIFT_POS, + wrist_pitch=INIT_WRIST_PITCH, + wrist_roll=INIT_WRIST_ROLL, + wrist_yaw=INIT_WRIST_YAW, + ) camera = RealSenseCamera(self.robot) rotation, translation, depth, width = capture_and_process_image( - camera = camera, - mode = 'pick', - obj = text, - socket = self.image_sender.manip_socket, - hello_robot = self.manip_wrapper) - + camera=camera, + mode="pick", + obj=text, + socket=self.image_sender.manip_socket, + hello_robot=self.manip_wrapper, + ) + if rotation is None: return False - + if width < 0.05 and self.re == 3: gripper_width = 0.45 - elif width < 0.075 and self.re == 3: + elif width < 0.075 and self.re == 3: gripper_width = 0.6 else: gripper_width = 1 - if input('Do you want to do this manipulation? Y or N ') != 'N': - pickup(self.manip_wrapper, rotation, translation, base_node, self.transform_node, gripper_depth = depth, gripper_width = gripper_width) - + if input("Do you want to do this manipulation? 
Y or N ") != "N": + pickup( + self.manip_wrapper, + rotation, + translation, + base_node, + self.transform_node, + gripper_depth=depth, + gripper_width=gripper_width, + ) + # Shift the base back to the original point as we are certain that orginal point is navigable in navigation obstacle map - self.manip_wrapper.move_to_position(base_trans = -self.manip_wrapper.robot.get_six_joints()[0]) + self.manip_wrapper.move_to_position( + base_trans=-self.manip_wrapper.robot.get_six_joints()[0] + ) return True @@ -323,48 +351,58 @@ def save(self): with self.image_processor.voxel_map_lock: self.image_processor.write_to_pickle() + def send_array(socket, A, flags=0, copy=True, track=False): """send a numpy array with metadata""" A = np.array(A) md = dict( - dtype = str(A.dtype), - shape = A.shape, + dtype=str(A.dtype), + shape=A.shape, ) - socket.send_json(md, flags|zmq.SNDMORE) + socket.send_json(md, flags | zmq.SNDMORE) return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + def recv_array(socket, flags=0, copy=True, track=False): """recv a numpy array""" md = socket.recv_json(flags=flags) msg = socket.recv(flags=flags, copy=copy, track=track) - A = np.frombuffer(msg, dtype=md['dtype']) - return A.reshape(md['shape']) + A = np.frombuffer(msg, dtype=md["dtype"]) + return A.reshape(md["shape"]) + def send_rgb_img(socket, img): - img = img.astype(np.uint8) + img = img.astype(np.uint8) encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] - _, img_encoded = cv2.imencode('.jpg', img, encode_param) + _, img_encoded = cv2.imencode(".jpg", img, encode_param) socket.send(img_encoded.tobytes()) + def recv_rgb_img(socket): img = socket.recv() img = np.frombuffer(img, dtype=np.uint8) img = cv2.imdecode(img, cv2.IMREAD_COLOR) return img + def send_depth_img(socket, depth_img): depth_img = (depth_img * 1000).astype(np.uint16) - encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 3] # Compression level from 0 (no compression) to 9 (max compression) - _, depth_img_encoded = cv2.imencode('.png', depth_img, encode_param) + encode_param = [ + int(cv2.IMWRITE_PNG_COMPRESSION), + 3, + ] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode(".png", depth_img, encode_param) socket.send(depth_img_encoded.tobytes()) + def recv_depth_img(socket): depth_img = socket.recv() depth_img = np.frombuffer(depth_img, dtype=np.uint8) depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) - depth_img = (depth_img / 1000.) 
+ depth_img = depth_img / 1000.0 return depth_img + def send_everything(socket, rgb, depth, intrinsics, pose): send_rgb_img(socket, rgb) socket.recv_string() @@ -375,44 +413,47 @@ def send_everything(socket, rgb, depth, intrinsics, pose): send_array(socket, pose) socket.recv_string() + def recv_everything(socket): rgb = recv_rgb_img(socket) - socket.send_string('') + socket.send_string("") depth = recv_depth_img(socket) - socket.send_string('') + socket.send_string("") intrinsics = recv_array(socket) - socket.send_string('') + socket.send_string("") pose = recv_array(socket) - socket.send_string('') + socket.send_string("") return rgb, depth, intrinsics, pose + class ImageSender: - def __init__(self, - stop_and_photo = False, - ip = '100.108.67.79', - image_port = 5560, - text_port = 5561, - manip_port = 5557, - color_name = "/camera/color", - depth_name = "/camera/aligned_depth_to_color", - camera_name = "/camera_pose", - slop_time_seconds = 0.05, - queue_size = 100, + def __init__( + self, + stop_and_photo=False, + ip="100.108.67.79", + image_port=5560, + text_port=5561, + manip_port=5557, + color_name="/camera/color", + depth_name="/camera/aligned_depth_to_color", + camera_name="/camera_pose", + slop_time_seconds=0.05, + queue_size=100, ): context = zmq.Context() self.img_socket = context.socket(zmq.REQ) - self.img_socket.connect('tcp://' + str(ip) + ':' + str(image_port)) + self.img_socket.connect("tcp://" + str(ip) + ":" + str(image_port)) self.text_socket = context.socket(zmq.REQ) - self.text_socket.connect('tcp://' + str(ip) + ':' + str(text_port)) + self.text_socket.connect("tcp://" + str(ip) + ":" + str(text_port)) self.manip_socket = context.socket(zmq.REQ) - self.manip_socket.connect('tcp://' + str(ip) + ':' + str(manip_port)) + self.manip_socket.connect("tcp://" + str(ip) + ":" + str(manip_port)) def query_text(self, text, start): self.text_socket.send_string(text) self.text_socket.recv_string() send_array(self.text_socket, start) return recv_array(self.text_socket) - + def send_images(self, obs): rgb = obs.rgb depth = obs.depth diff --git a/src/stretch/dynav/run_manip.py b/src/stretch/dynav/run_manip.py index 3f63d4a37..35ae069de 100644 --- a/src/stretch/dynav/run_manip.py +++ b/src/stretch/dynav/run_manip.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + # Copyright (c) Meta Platforms, Inc. and affiliates. 
# # This source code is licensed under the MIT license found in the @@ -6,23 +15,25 @@ import click import numpy as np +from stretch.agent import RobotClient + # Mapping and perception from stretch.core.parameters import Parameters, get_parameters from stretch.dynav import RobotAgentMDP -from stretch.agent import RobotClient def compute_tilt(camera_xyz, target_xyz): - ''' - a util function for computing robot head tilts so the robot can look at the target object after navigation - - camera_xyz: estimated (x, y, z) coordinates of camera - - target_xyz: estimated (x, y, z) coordinates of the target object - ''' + """ + a util function for computing robot head tilts so the robot can look at the target object after navigation + - camera_xyz: estimated (x, y, z) coordinates of camera + - target_xyz: estimated (x, y, z) coordinates of the target object + """ vector = camera_xyz - target_xyz return -np.arctan2(vector[2], np.linalg.norm(vector[:2])) + @click.command() -@click.option("--ip", default='100.108.67.79', type=str) +@click.option("--ip", default="100.108.67.79", type=str) @click.option("--manual-wait", default=False, is_flag=True) @click.option("--random-goals", default=False, is_flag=True) @click.option("--explore-iter", default=-1) @@ -49,7 +60,7 @@ def main( random_goals(bool): randomly sample frontier goals instead of looking for closest """ click.echo("Will connect to a Stretch robot and collect a short trajectory.") - robot = RobotClient(robot_ip = "100.79.44.11") + robot = RobotClient(robot_ip="100.79.44.11") print("- Load parameters") parameters = get_parameters("dynav_config.yaml") @@ -58,18 +69,16 @@ def main( parameters["exploration_steps"] = explore_iter print("- Start robot agent with data collection") - demo = RobotAgentMDP( - robot, parameters, ip = ip, re = re - ) + demo = RobotAgentMDP(robot, parameters, ip=ip, re=re) - while input('STOP? Y/N') != 'Y': - if input('You want to run manipulation: y/n') == 'y': - text = input('Enter object name: ') + while input("STOP? Y/N") != "Y": + if input("You want to run manipulation: y/n") == "y": + text = input("Enter object name: ") theta = -0.6 demo.manipulate(text, theta) - - if input('You want to run placing: y/n') == 'y': - text = input('Enter receptacle name: ') + + if input("You want to run placing: y/n") == "y": + text = input("Enter receptacle name: ") theta = -0.6 demo.place(text, theta) diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 5417a6a73..756c3f9ba 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -1,30 +1,41 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import threading +import time + # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
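# --- Editor's note: illustrative sketch, not part of the patch above. ---
# Numeric sanity check for the compute_tilt() helper defined in run_manip.py
# (and repeated below in run_ok_nav.py): with the camera 1.0 m above the target
# and 1.0 m away horizontally, the head should tilt down by 45 degrees. The
# coordinates here are made-up example values.
import numpy as np


def compute_tilt(camera_xyz, target_xyz):
    # mirrors the helper above: negative tilt means looking down
    vector = np.asarray(camera_xyz) - np.asarray(target_xyz)
    return -np.arctan2(vector[2], np.linalg.norm(vector[:2]))


print(round(compute_tilt([0.0, 0.0, 1.2], [1.0, 0.0, 0.2]), 3))  # -0.785 rad, i.e. ~45 deg down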
import click +import cv2 import matplotlib.pyplot as plt import numpy as np from PIL import Image +from stretch.agent import RobotClient + # Mapping and perception from stretch.core.parameters import Parameters, get_parameters from stretch.dynav import RobotAgentMDP # Chat and UI tools from stretch.utils.point_cloud import numpy_to_pcd, show_point_cloud -from stretch.agent import RobotClient -import cv2 -import time -import threading def compute_tilt(camera_xyz, target_xyz): - ''' - a util function for computing robot head tilts so the robot can look at the target object after navigation - - camera_xyz: estimated (x, y, z) coordinates of camera - - target_xyz: estimated (x, y, z) coordinates of the target object - ''' + """ + a util function for computing robot head tilts so the robot can look at the target object after navigation + - camera_xyz: estimated (x, y, z) coordinates of camera + - target_xyz: estimated (x, y, z) coordinates of the target object + """ if not isinstance(camera_xyz, np.ndarray): camera_xyz = np.array(camera_xyz) if not isinstance(target_xyz, np.ndarray): @@ -32,13 +43,14 @@ def compute_tilt(camera_xyz, target_xyz): vector = camera_xyz - target_xyz return -np.arctan2(vector[2], np.linalg.norm(vector[:2])) + @click.command() -@click.option("--ip", default='100.108.67.79', type=str) +@click.option("--ip", default="100.108.67.79", type=str) @click.option("--manual-wait", default=False, is_flag=True) @click.option("--random-goals", default=False, is_flag=True) @click.option("--explore-iter", default=-1) @click.option("--re", default=1, type=int) -@click.option("--method", default='dynamem', type=str) +@click.option("--method", default="dynamem", type=str) @click.option("--env", default=1, type=int) @click.option("--test", default=1, type=int) @click.option( @@ -53,7 +65,7 @@ def main( navigate_home: bool = False, explore_iter: int = 5, re: int = 1, - method: str = 'dynamem', + method: str = "dynamem", env: int = 1, test: int = 1, input_path: str = None, @@ -66,7 +78,7 @@ def main( random_goals(bool): randomly sample frontier goals instead of looking for closest """ click.echo("Will connect to a Stretch robot and collect a short trajectory.") - robot = RobotClient(robot_ip = "100.79.44.11") + robot = RobotClient(robot_ip="100.79.44.11") print("- Load parameters") parameters = get_parameters("dynav_config.yaml") @@ -75,12 +87,10 @@ def main( parameters["exploration_steps"] = explore_iter object_to_find, location_to_place = None, None robot.move_to_nav_posture() - robot.set_velocity(v = 30., w = 15.) + robot.set_velocity(v=30.0, w=15.0) print("- Start robot agent with data collection") - demo = RobotAgentMDP( - robot, parameters, ip = ip, re = re, env_num = env, test_num = test, method = method - ) + demo = RobotAgentMDP(robot, parameters, ip=ip, re=re, env_num=env, test_num=test, method=method) if input_path is None: demo.rotate_in_place() @@ -102,38 +112,38 @@ def main( # img_thread.start() while True: - mode = input('select mode? E/N/S') - if mode == 'S': + mode = input("select mode? E/N/S") + if mode == "S": demo.image_processor.write_to_pickle() break - if mode == 'E': + if mode == "E": robot.switch_to_navigation_mode() for epoch in range(explore_iter): - print('\n', 'Exploration epoch ', epoch, '\n') + print("\n", "Exploration epoch ", epoch, "\n") if not demo.run_exploration(): - print('Exploration failed! Quitting!') + print("Exploration failed! 
Quitting!") continue else: text = None point = None - if input('You want to run manipulation: y/n') != 'n': + if input("You want to run manipulation: y/n") != "n": robot.move_to_nav_posture() robot.switch_to_navigation_mode() - text = input('Enter object name: ') + text = input("Enter object name: ") point = demo.navigate(text) if point is None: - print('Navigation Failure!') - cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) + print("Navigation Failure!") + cv2.imwrite(text + ".jpg", robot.get_observation().rgb[:, :, [2, 1, 0]]) - if input('You want to run manipulation: y/n') != 'n': + if input("You want to run manipulation: y/n") != "n": robot.switch_to_navigation_mode() xyt = robot.get_base_pose() xyt[2] = xyt[2] + np.pi / 2 - robot.navigate_to(xyt, blocking = True) + robot.navigate_to(xyt, blocking=True) robot.switch_to_manipulation_mode() if text is None: - text = input('Enter object name: ') + text = input("Enter object name: ") camera_xyz = robot.get_head_pose()[:3, 3] if point is not None: theta = compute_tilt(camera_xyz, point) @@ -141,26 +151,26 @@ def main( theta = -0.6 demo.manipulate(text, theta) robot.look_front() - + text = None point = None - if input('You want to run placing: y/n') != 'n': + if input("You want to run placing: y/n") != "n": robot.switch_to_navigation_mode() - text = input('Enter receptacle name: ') + text = input("Enter receptacle name: ") point = demo.navigate(text) if point is None: - print('Navigation Failure') - cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) - - if input('You want to run placing: y/n') != 'n': + print("Navigation Failure") + cv2.imwrite(text + ".jpg", robot.get_observation().rgb[:, :, [2, 1, 0]]) + + if input("You want to run placing: y/n") != "n": robot.switch_to_navigation_mode() xyt = robot.get_base_pose() xyt[2] = xyt[2] + np.pi / 2 - robot.navigate_to(xyt, blocking = True) - + robot.navigate_to(xyt, blocking=True) + robot.switch_to_manipulation_mode() if text is None: - text = input('Enter receptacle name: ') + text = input("Enter receptacle name: ") camera_xyz = robot.get_head_pose()[:3, 3] if point is not None: theta = compute_tilt(camera_xyz, point) diff --git a/src/stretch/dynav/scannet.py b/src/stretch/dynav/scannet.py index 2fe92aaae..6d16e1a9f 100644 --- a/src/stretch/dynav/scannet.py +++ b/src/stretch/dynav/scannet.py @@ -1,209 +1,217 @@ -CLASS_LABELS_200 =( - 'wall', - 'background', - 'object', - 'range hood', - 'door', - 'stove', - 'bathroom stall', - 'divider', - 'clothes dryer', - 'file cabinet', - 'cabinet', - 'bathroom cabinet', - 'blackboard', - 'curtain', - 'blinds', - 'tv', - 'mattress', - 'bathtub', - 'glass', - 'shower', - 'power outlet', - 'water dispenser', - 'bulletin board', - 'printer', - 'windowsill', - 'bookshelf', - 'dresser', - 'storage organizer', - 'fireplace', - 'fire alarm', - 'piano', - 'shelf', - 'kitchen counter', - 'washing machine', - 'stairs', - 'paper cutter', - 'sink', - 'window', - 'refrigerator', - 'counter', - 'closet', - 'bathroom vanity', - 'radiator', - 'vent', - 'kitchen cabinet', - 'water cooler', - 'doorframe', - 'closet door', - 'table', +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
- 'shower head', - 'spray', - 'inhaler', - 'guitar case', - 'plunger', - 'toilet paper dispenser', - 'adapter', - 'soy sauce', - 'pipe', - 'bottle', - 'scale', - 'paper towel', - 'paper towel roll', - 'scissors', - 'tape', - 'chopsticks', - 'case of water bottles', - 'hand sanitizer', - 'laptop', - 'alcohol disinfection', - 'keyboard', - 'coffee maker', - 'light', - 'toaster', - 'stuffed animal', - 'toilet seat cover dispenser', - 'ironing board', - 'fire extinguisher', - 'fruit', - 'container', - 'bag', - 'oven', - 'body wash', - 'bucket', - 'cd case', - 'tray', - 'bowl', - 'speaker', - 'crate', - 'projector', - 'book', - 'school bag', - 'laundry detergent', - 'clothes', - 'candle', - 'basket', - 'face wash', - 'notebook', - 'purse', - 'trash bin', - 'paper bag', - 'package', - 'disinfecting wipes', - 'recycling bin', - 'headphones', - 'mouse', - 'shower gel', - 'dustpan', - 'cup', - 'vacuum cleaner', - 'dish rack', - 'coffee kettle', - 'plants', - 'rag', - 'can', - 'cushion', - 'monitor', - 'fan', - 'tube', - 'box', - 'ball', - 'bicycle', - 'guitar', - 'trash can', - 'hand sanitizers', - 'paper towel dispenser', - 'whiteboard', - 'bin', - 'potted plant', - 'tennis', - 'soap dish', - 'structure', - 'calendar', - 'dumbbell', - 'fish oil', - 'ottoman', - 'stool', - 'hand wash', - 'lamp', - 'toaster oven', - 'music stand', - 'water bottle', - 'clock', - 'charger', - 'picture', - 'bascketball', - 'microwave', - 'screwdriver', - 'rack', - 'apple', - 'suitcase', - 'ladder', - 'ping pong ball', - 'dishwasher', - 'storage container', - 'toilet paper holder', - 'coat rack', - 'soap dispenser', - 'banana', - 'toilet paper', - 'mug', - 'marker pen', - 'hat', - 'aerosol', - 'luggage', - 'poster', - 'bed', - 'cart', - 'light switch', - 'backpack', - 'power strip', - 'baseball', - 'mustard', - 'water pitcher', - 'couch', - 'beverage', - 'toy', - 'salt', - 'plant', - 'pillow', - 'broom', - 'pepper', - 'muffins', - 'multivitamin', - 'towel', - 'storage bin', - 'nightstand', - 'telephone', - 'tissue box', - 'hair dryer', - 'mirror', - 'sign', - 'plate', - 'tripod', - 'chair', - 'plastic bag', - 'umbrella', - 'paper', - 'laundry hamper', - 'food', - 'jacket', - 'computer tower', - 'keyboard piano', - 'person', - 'machine', - 'projector screen', - 'shoe', -) \ No newline at end of file +CLASS_LABELS_200 = ( + "wall", + "background", + "object", + "range hood", + "door", + "stove", + "bathroom stall", + "divider", + "clothes dryer", + "file cabinet", + "cabinet", + "bathroom cabinet", + "blackboard", + "curtain", + "blinds", + "tv", + "mattress", + "bathtub", + "glass", + "shower", + "power outlet", + "water dispenser", + "bulletin board", + "printer", + "windowsill", + "bookshelf", + "dresser", + "storage organizer", + "fireplace", + "fire alarm", + "piano", + "shelf", + "kitchen counter", + "washing machine", + "stairs", + "paper cutter", + "sink", + "window", + "refrigerator", + "counter", + "closet", + "bathroom vanity", + "radiator", + "vent", + "kitchen cabinet", + "water cooler", + "doorframe", + "closet door", + "table", + "shower head", + "spray", + "inhaler", + "guitar case", + "plunger", + "toilet paper dispenser", + "adapter", + "soy sauce", + "pipe", + "bottle", + "scale", + "paper towel", + "paper towel roll", + "scissors", + "tape", + "chopsticks", + "case of water bottles", + "hand sanitizer", + "laptop", + "alcohol disinfection", + "keyboard", + "coffee maker", + "light", + "toaster", + "stuffed animal", + "toilet seat cover dispenser", + "ironing board", + "fire extinguisher", + "fruit", + 
"container", + "bag", + "oven", + "body wash", + "bucket", + "cd case", + "tray", + "bowl", + "speaker", + "crate", + "projector", + "book", + "school bag", + "laundry detergent", + "clothes", + "candle", + "basket", + "face wash", + "notebook", + "purse", + "trash bin", + "paper bag", + "package", + "disinfecting wipes", + "recycling bin", + "headphones", + "mouse", + "shower gel", + "dustpan", + "cup", + "vacuum cleaner", + "dish rack", + "coffee kettle", + "plants", + "rag", + "can", + "cushion", + "monitor", + "fan", + "tube", + "box", + "ball", + "bicycle", + "guitar", + "trash can", + "hand sanitizers", + "paper towel dispenser", + "whiteboard", + "bin", + "potted plant", + "tennis", + "soap dish", + "structure", + "calendar", + "dumbbell", + "fish oil", + "ottoman", + "stool", + "hand wash", + "lamp", + "toaster oven", + "music stand", + "water bottle", + "clock", + "charger", + "picture", + "bascketball", + "microwave", + "screwdriver", + "rack", + "apple", + "suitcase", + "ladder", + "ping pong ball", + "dishwasher", + "storage container", + "toilet paper holder", + "coat rack", + "soap dispenser", + "banana", + "toilet paper", + "mug", + "marker pen", + "hat", + "aerosol", + "luggage", + "poster", + "bed", + "cart", + "light switch", + "backpack", + "power strip", + "baseball", + "mustard", + "water pitcher", + "couch", + "beverage", + "toy", + "salt", + "plant", + "pillow", + "broom", + "pepper", + "muffins", + "multivitamin", + "towel", + "storage bin", + "nightstand", + "telephone", + "tissue box", + "hair dryer", + "mirror", + "sign", + "plate", + "tripod", + "chair", + "plastic bag", + "umbrella", + "paper", + "laundry hamper", + "food", + "jacket", + "computer tower", + "keyboard piano", + "person", + "machine", + "projector screen", + "shoe", +) diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index e35a3cbe9..6bd42a1bb 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -1,32 +1,40 @@ -import numpy as np +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
-import cv2 +import math +from typing import List, Optional, Tuple, Union +import clip +import cv2 +import numpy as np import torch import torch.nn.functional as F import torchvision.transforms as T -from torchvision.transforms.functional import InterpolationMode - -import clip - -from stretch.dynav.mapping_utils import VoxelizedPointcloud - -from typing import List, Optional, Tuple, Union +from PIL import Image +from sklearn.cluster import DBSCAN from torch import Tensor - -from transformers import AutoProcessor, AutoModel +from torchvision.transforms.functional import InterpolationMode # from ultralytics import YOLOWorld -from transformers import Owlv2Processor, Owlv2ForObjectDetection +from transformers import AutoModel, AutoProcessor, Owlv2ForObjectDetection, Owlv2Processor -import math -from PIL import Image +from stretch.dynav.mapping_utils import VoxelizedPointcloud -from sklearn.cluster import DBSCAN def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) - fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) inv_intrinsics = torch.zeros_like(intrinsics) inv_intrinsics[..., 0, 0] = 1.0 / fx inv_intrinsics[..., 1, 1] = 1.0 / fy @@ -35,6 +43,7 @@ def get_inv_intrinsics(intrinsics): inv_intrinsics[..., 2, 2] = 1.0 return inv_intrinsics + def get_xyz(depth, pose, intrinsics): """Returns the XYZ coordinates for a set of points. @@ -74,14 +83,23 @@ def get_xyz(depth, pose, intrinsics): xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) xyz = xyz * depth.flatten(1).unsqueeze(-1) xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] - + xyz = xyz.unflatten(1, (height, width)) return xyz -class VoxelMapLocalizer(): - def __init__(self, voxel_map_wrapper = None, exist_model = True, clip_model = None, processor = None, device = 'cuda', siglip = True): - print('Localizer V3') + +class VoxelMapLocalizer: + def __init__( + self, + voxel_map_wrapper=None, + exist_model=True, + clip_model=None, + processor=None, + device="cuda", + siglip=True, + ): + print("Localizer V3") self.voxel_map_wrapper = voxel_map_wrapper self.device = device # self.clip_model, self.preprocessor = clip.load(model_config, device=device) @@ -92,21 +110,27 @@ def __init__(self, voxel_map_wrapper = None, exist_model = True, clip_model = No self.clip_model, self.preprocessor = clip.load("ViT-B/16", device=self.device) self.clip_model.eval() else: - self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to(self.device) + self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to( + self.device + ) self.preprocessor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384") self.clip_model.eval() - self.voxel_pcd = VoxelizedPointcloud(voxel_size = 0.05).to(self.device) + self.voxel_pcd = VoxelizedPointcloud(voxel_size=0.05).to(self.device) # self.exist_model = YOLOWorld("yolov8l-worldv2.pt") self.existence_checking_model = exist_model if exist_model: - print('WE ARE USING OWLV2!') - self.exist_processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") - self.exist_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(self.device) + print("WE ARE USING OWLV2!") + self.exist_processor = AutoProcessor.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ) + 
self.exist_model = Owlv2ForObjectDetection.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ).to(self.device) else: - print('YOU ARE USING NOTHING!') - + print("YOU ARE USING NOTHING!") - def add(self, + def add( + self, points: Tensor, features: Optional[Tensor], rgb: Optional[Tensor], @@ -123,15 +147,13 @@ def add(self, weights = weights.to(self.device) # if weight_decay is not None and self.voxel_pcd._weights is not None: # self.voxel_pcd._weights *= weight_decay - self.voxel_pcd.add(points = points, - features = features, - rgb = rgb, - weights = weights, - obs_count = obs_count) + self.voxel_pcd.add( + points=points, features=features, rgb=rgb, weights=weights, obs_count=obs_count + ) def calculate_clip_and_st_embeddings_for_queries(self, queries): if isinstance(queries, str): - queries = [queries] + queries = [queries] if self.siglip: inputs = self.preprocessor(text=queries, padding="max_length", return_tensors="pt") for input in inputs: @@ -144,7 +166,7 @@ def calculate_clip_and_st_embeddings_for_queries(self, queries): # all_clip_tokens = self.clip_model.encode_text(text) all_clip_tokens = F.normalize(all_clip_tokens, p=2, dim=-1) return all_clip_tokens - + def find_alignment_over_model(self, queries): clip_text_tokens = self.calculate_clip_and_st_embeddings_for_queries(queries) points, features, weights, _ = self.voxel_pcd.get_pointcloud() @@ -152,21 +174,21 @@ def find_alignment_over_model(self, queries): return None features = F.normalize(features, p=2, dim=-1) point_alignments = clip_text_tokens.float() @ features.float().T - + # print(point_alignments.shape) return point_alignments def find_alignment_for_A(self, A): points, features, _, _ = self.voxel_pcd.get_pointcloud() alignments = self.find_alignment_over_model(A).cpu() - return points[alignments.argmax(dim = -1)].detach().cpu() - + return points[alignments.argmax(dim=-1)].detach().cpu() + def find_obs_id_for_A(self, A): obs_counts = self.voxel_pcd._obs_counts alignments = self.find_alignment_over_model(A).cpu() - return obs_counts[alignments.argmax(dim = -1)].detach().cpu() + return obs_counts[alignments.argmax(dim=-1)].detach().cpu() - def compute_coord(self, text, obs_id, threshold = 0.2): + def compute_coord(self, text, obs_id, threshold=0.2): # print(obs_id, len(self.voxel_map_wrapper.observations)) if obs_id <= 0 or obs_id > len(self.voxel_map_wrapper.observations): return None @@ -177,13 +199,15 @@ def compute_coord(self, text, obs_id, threshold = 0.2): xyzs = get_xyz(depth, pose, K)[0] # rgb[depth >= 2.5] = 0 rgb = rgb.permute(2, 0, 1).to(torch.uint8) - inputs = self.exist_processor(text=[['a photo of a ' + text]], images=rgb, return_tensors="pt") + inputs = self.exist_processor( + text=[["a photo of a " + text]], images=rgb, return_tensors="pt" + ) for input in inputs: - inputs[input] = inputs[input].to('cuda') - + inputs[input] = inputs[input].to("cuda") + with torch.no_grad(): outputs = self.exist_model(**inputs) - + target_sizes = torch.Tensor([rgb.size()[-2:]]).to(self.device) results = self.exist_processor.image_processor.post_process_object_detection( outputs, threshold=threshold, target_sizes=target_sizes @@ -194,30 +218,37 @@ def compute_coord(self, text, obs_id, threshold = 0.2): # w, h = depth.shape # tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) # return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values - for idx, (score, bbox) in enumerate(sorted(zip(results['scores'], results['boxes']), 
key=lambda x: x[0], reverse=True)): - + for idx, (score, bbox) in enumerate( + sorted(zip(results["scores"], results["boxes"]), key=lambda x: x[0], reverse=True) + ): + tl_x, tl_y, br_x, br_y = bbox w, h = depth.shape - tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) - - if torch.median(depth[tl_y: br_y, tl_x: br_x].reshape(-1)) < 3: - return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values + tl_x, tl_y, br_x, br_y = ( + int(max(0, tl_x.item())), + int(max(0, tl_y.item())), + int(min(h, br_x.item())), + int(min(w, br_y.item())), + ) + + if torch.median(depth[tl_y:br_y, tl_x:br_x].reshape(-1)) < 3: + return torch.median(xyzs[tl_y:br_y, tl_x:br_x].reshape(-1, 3), dim=0).values return None - def verify_point(self, A, point, distance_threshold = 0.1, similarity_threshold = 0.14): + def verify_point(self, A, point, distance_threshold=0.1, similarity_threshold=0.14): if isinstance(point, np.ndarray): point = torch.from_numpy(point) points, _, _, _ = self.voxel_pcd.get_pointcloud() - distances = torch.linalg.norm(point - points.detach().cpu(), dim = -1) + distances = torch.linalg.norm(point - points.detach().cpu(), dim=-1) if torch.min(distances) > distance_threshold: - print('Points are so far from other points!') + print("Points are so far from other points!") return False alignments = self.find_alignment_over_model(A).detach().cpu()[0] if torch.max(alignments[distances <= distance_threshold]) < similarity_threshold: - print('Points close the the point are not similar to the text!') - return torch.max(alignments[distances < distance_threshold]) >= similarity_threshold + print("Points close the the point are not similar to the text!") + return torch.max(alignments[distances < distance_threshold]) >= similarity_threshold - def localize_A(self, A, debug = True, return_debug = False): + def localize_A(self, A, debug=True, return_debug=False): # centroids, extends, similarity_max_list, points, obs_max_list, debug_text = self.find_clusters_for_A(A, return_obs_counts = True, debug = debug) # if len(centroids) == 0: # if not debug: @@ -244,7 +275,7 @@ def localize_A(self, A, debug = True, return_debug = False): # debug_text += '#### - Instance ' + str(idx + 1) + ' has high cosine similarity (' + str(round(similarity, 3)) + '). **😃** Directly navigate to it.\n' # break - + # debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' # if target_point is None: @@ -259,17 +290,19 @@ def localize_A(self, A, debug = True, return_debug = False): alignments = self.find_alignment_over_model(A).cpu() # alignments = alignments[0, points[:, -1] >= 0.1] # points = points[points[:, -1] >= 0.1] - point = points[alignments.argmax(dim = -1)].detach().cpu().squeeze() + point = points[alignments.argmax(dim=-1)].detach().cpu().squeeze() obs_counts = self.voxel_pcd._obs_counts - image_id = obs_counts[alignments.argmax(dim = -1)].detach().cpu() - debug_text = '' + image_id = obs_counts[alignments.argmax(dim=-1)].detach().cpu() + debug_text = "" target_point = None res = self.compute_coord(A, image_id) # res = None if res is not None: target_point = res - debug_text += '#### - Obejct is detected in observations . **😃** Directly navigate to it.\n' + debug_text += ( + "#### - Obejct is detected in observations . **😃** Directly navigate to it.\n" + ) else: # debug_text += '#### - Directly ignore this instance is the target. 
**😞** \n' if self.siglip: @@ -279,9 +312,11 @@ def localize_A(self, A, debug = True, return_debug = False): if cosine_similarity_check: target_point = point - debug_text += '#### - The point has high cosine similarity. **😃** Directly navigate to it.\n' + debug_text += ( + "#### - The point has high cosine similarity. **😃** Directly navigate to it.\n" + ) else: - debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' + debug_text += "#### - Cannot verify whether this instance is the target. **😞** \n" # print('target_point', target_point) if not debug: return target_point @@ -290,9 +325,9 @@ def localize_A(self, A, debug = True, return_debug = False): else: return target_point, debug_text, image_id, point - def find_clusters_for_A(self, A, return_obs_counts = False, debug = False, turning_point = None): + def find_clusters_for_A(self, A, return_obs_counts=False, debug=False, turning_point=None): - debug_text = '' + debug_text = "" points, features, _, _ = self.voxel_pcd.get_pointcloud() alignments = self.find_alignment_over_model(A).cpu().reshape(-1).detach().numpy() @@ -306,7 +341,9 @@ def find_clusters_for_A(self, A, return_obs_counts = False, debug = False, turni points = points[mask] if len(points) == 0: - debug_text += '### - No instance found! Maybe target object has not been observed yet. **😭**\n' + debug_text += ( + "### - No instance found! Maybe target object has not been observed yet. **😭**\n" + ) output = [[], [], [], []] if return_obs_counts: @@ -318,20 +355,26 @@ def find_clusters_for_A(self, A, return_obs_counts = False, debug = False, turni else: if return_obs_counts: obs_ids = self.voxel_pcd._obs_counts.detach().cpu().numpy()[mask] - centroids, extends, similarity_max_list, points, obs_max_list = find_clusters(points.detach().cpu().numpy(), alignments, obs = obs_ids) + centroids, extends, similarity_max_list, points, obs_max_list = find_clusters( + points.detach().cpu().numpy(), alignments, obs=obs_ids + ) output = [centroids, extends, similarity_max_list, points, obs_max_list] else: - centroids, extends, similarity_max_list, points = find_clusters(points.detach().cpu().numpy(), alignments, obs = None) + centroids, extends, similarity_max_list, points = find_clusters( + points.detach().cpu().numpy(), alignments, obs=None + ) output = [centroids, extends, similarity_max_list, points] - debug_text += '### - Found ' + str(len(centroids)) + ' instances that might be target object.\n' + debug_text += ( + "### - Found " + str(len(centroids)) + " instances that might be target object.\n" + ) if debug: output.append(debug_text) - + return output -def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs = None): +def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs=None): # Calculate the number of top values directly top_positions = vertices # top_values = probability_over_all_points[top_indices].flatten() @@ -347,7 +390,7 @@ def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs = None): similarity_list = [] points = [] obs_max_list = [] - + for cluster_id in set(labels): if cluster_id == -1: # Ignore noise continue @@ -377,4 +420,4 @@ def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs = None): if obs is not None: return centroids, extends, similarity_list, points, obs_max_list else: - return centroids, extends, similarity_list, points \ No newline at end of file + return centroids, extends, similarity_list, points diff --git a/src/stretch/dynav/voxel_map_server.py 
b/src/stretch/dynav/voxel_map_server.py index 6ce7dbb1b..5437a0d5f 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -1,59 +1,77 @@ -import zmq +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. -from stretch.dynav.scannet import CLASS_LABELS_200 +import datetime +import os +import pickle +import threading + +# import wget +import time +from io import BytesIO +from pathlib import Path +import clip import cv2 +import hydra import numpy as np +import open3d as o3d +import rerun as rr +import scipy import torch + +# from stretch.utils.morphology import get_edges import torch.nn.functional as F import torchvision.transforms.functional as V -# from segment_anything import sam_model_registry, SamPredictor -# from transformers import AutoProcessor, OwlViTForObjectDetection -from ultralytics import YOLO, SAM, YOLOWorld +import wget +import zmq +from matplotlib import pyplot as plt +from PIL import Image from sam2.build_sam import build_sam2 from sam2.sam2_image_predictor import SAM2ImagePredictor - -import clip from torchvision import transforms +from transformers import AutoModel, AutoProcessor -import os -# import wget -import time - -import open3d as o3d +# from segment_anything import sam_model_registry, SamPredictor +# from transformers import AutoProcessor, OwlViTForObjectDetection +from ultralytics import SAM, YOLO, YOLOWorld -from matplotlib import pyplot as plt -import pickle -from pathlib import Path -import wget -from stretch.dynav.voxel_map_localizer import VoxelMapLocalizer from stretch.core import get_parameters +from stretch.dynav.communication_util import ( + load_socket, + recv_array, + recv_depth_img, + recv_everything, + recv_rgb_img, + send_array, + send_depth_img, + send_everything, + send_rgb_img, +) from stretch.dynav.mapping_utils import ( - SparseVoxelMap, - SparseVoxelMapNavigationSpace, AStar, - VoxelizedPointcloud + SparseVoxelMap, + SparseVoxelMapNavigationSpace, + VoxelizedPointcloud, ) +from stretch.dynav.scannet import CLASS_LABELS_200 +from stretch.dynav.voxel_map_localizer import VoxelMapLocalizer -import datetime - -import threading -import scipy -import hydra - -from transformers import AutoProcessor, AutoModel -import rerun as rr - -from io import BytesIO -from PIL import Image - -from stretch.dynav.communication_util import load_socket, send_array, recv_array, send_rgb_img, recv_rgb_img, send_depth_img, recv_depth_img, send_everything, recv_everything -# from stretch.utils.morphology import get_edges -import torch.nn.functional as F def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) - fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) inv_intrinsics = torch.zeros_like(intrinsics) inv_intrinsics[..., 0, 0] = 1.0 / fx inv_intrinsics[..., 1, 1] = 1.0 / fy @@ -62,6 +80,7 @@ def get_inv_intrinsics(intrinsics): inv_intrinsics[..., 2, 2] = 1.0 return inv_intrinsics + def get_xyz(depth, pose, intrinsics): """Returns the XYZ coordinates for a set of points. 
@@ -101,40 +120,42 @@ def get_xyz(depth, pose, intrinsics): xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) xyz = xyz * depth.flatten(1).unsqueeze(-1) xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] - + xyz = xyz.unflatten(1, (height, width)) return xyz + class ImageProcessor: - def __init__(self, - vision_method = 'mask*lip', - siglip = True, - device = 'cuda', - min_depth = 0.25, - max_depth = 2.5, - img_port = 5558, - text_port = 5556, - open_communication = True, - rerun = True, - static = True, - log = None, - image_shape = (450, 350) + def __init__( + self, + vision_method="mask*lip", + siglip=True, + device="cuda", + min_depth=0.25, + max_depth=2.5, + img_port=5558, + text_port=5556, + open_communication=True, + rerun=True, + static=True, + log=None, + image_shape=(450, 350), ): self.static = static self.siglip = siglip current_datetime = datetime.datetime.now() if log is None: - self.log = 'debug_' + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + self.log = "debug_" + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") else: self.log = log self.rerun = rerun if self.rerun: if self.static: - rr.init(self.log, spawn = False) - rr.connect('100.108.67.79:9876') + rr.init(self.log, spawn=False) + rr.connect("100.108.67.79:9876") else: - rr.init(self.log, spawn = True) + rr.init(self.log, spawn=True) self.min_depth = min_depth self.max_depth = max_depth self.obs_count = 0 @@ -144,17 +165,19 @@ def __init__(self, self.vision_method = vision_method # If cuda is not available, then device will be forced to be cpu if not torch.cuda.is_available(): - device = 'cpu' + device = "cpu" self.device = device self.create_obstacle_map() self.create_vision_model() - self.voxel_map_lock = threading.Lock() # Create a lock for synchronizing access to `self.voxel_map_localizer` + self.voxel_map_lock = ( + threading.Lock() + ) # Create a lock for synchronizing access to `self.voxel_map_localizer` self.traj = None self.image_shape = image_shape - + if open_communication: self.img_socket = load_socket(img_port) self.text_socket = load_socket(text_port) @@ -162,7 +185,7 @@ def __init__(self, self.img_thread = threading.Thread(target=self._recv_image) self.img_thread.daemon = True self.img_thread.start() - + def create_obstacle_map(self): print("- Load parameters") parameters = get_parameters("dynav_config.yaml") @@ -172,29 +195,21 @@ def create_obstacle_map(self): local_radius=parameters["local_radius"], obs_min_height=parameters["obs_min_height"], obs_max_height=parameters["obs_max_height"], - obs_min_density = parameters["obs_min_density"], - exp_min_density = parameters["exp_min_density"], + obs_min_density=parameters["obs_min_density"], + exp_min_density=parameters["exp_min_density"], min_depth=self.min_depth, max_depth=self.max_depth, pad_obstacles=parameters["pad_obstacles"], - add_local_radius_points=parameters.get( - "add_local_radius_points", default=True - ), + add_local_radius_points=parameters.get("add_local_radius_points", default=True), remove_visited_from_obstacles=parameters.get( "remove_visited_from_obstacles", default=False ), smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), use_median_filter=parameters.get("filters/use_median_filter", False), median_filter_size=parameters.get("filters/median_filter_size", 5), - median_filter_max_error=parameters.get( - "filters/median_filter_max_error", 0.01 - ), - use_derivative_filter=parameters.get( - "filters/use_derivative_filter", False - ), - 
derivative_filter_threshold=parameters.get( - "filters/derivative_filter_threshold", 0.5 - ) + median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), + use_derivative_filter=parameters.get("filters/use_derivative_filter", False), + derivative_filter_threshold=parameters.get("filters/derivative_filter_threshold", 0.5), ) self.space = SparseVoxelMapNavigationSpace( self.voxel_map, @@ -215,64 +230,98 @@ def create_vision_model(self): self.clip_model, self.clip_preprocess = clip.load("ViT-B/16", device=self.device) self.clip_model.eval() else: - self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to(self.device) + self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to( + self.device + ) self.clip_preprocess = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384") self.clip_model.eval() - if self.vision_method != 'mask*lip': + if self.vision_method != "mask*lip": sam_checkpoint = f"./sam2_hiera_small.pt" sam_config = "sam2_hiera_s.yaml" if not os.path.exists(sam_checkpoint): - wget.download('https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt', out = sam_checkpoint) - sam2_model = build_sam2(sam_config, sam_checkpoint, device=self.device, apply_postprocessing=False) + wget.download( + "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt", + out=sam_checkpoint, + ) + sam2_model = build_sam2( + sam_config, sam_checkpoint, device=self.device, apply_postprocessing=False + ) self.mask_predictor = SAM2ImagePredictor(sam2_model) - self.yolo_model = YOLOWorld('yolov8s-worldv2.pt') + self.yolo_model = YOLOWorld("yolov8s-worldv2.pt") self.texts = CLASS_LABELS_200 self.yolo_model.set_classes(self.texts) - self.voxel_map_localizer = VoxelMapLocalizer(self.voxel_map, clip_model = self.clip_model, processor = self.clip_preprocess, device = self.device, siglip = self.siglip) + self.voxel_map_localizer = VoxelMapLocalizer( + self.voxel_map, + clip_model=self.clip_model, + processor=self.clip_preprocess, + device=self.device, + siglip=self.siglip, + ) def process_text(self, text, start_pose): if self.rerun: - rr.log('/object', rr.Clear(recursive = True), static = self.static) - rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) - rr.log('/direction', rr.Clear(recursive = True), static = self.static) - rr.log('robot_monologue', rr.Clear(recursive = True), static = self.static) - rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) + rr.log("/object", rr.Clear(recursive=True), static=self.static) + rr.log("/robot_start_pose", rr.Clear(recursive=True), static=self.static) + rr.log("/direction", rr.Clear(recursive=True), static=self.static) + rr.log("robot_monologue", rr.Clear(recursive=True), static=self.static) + rr.log( + "/Past_observation_most_similar_to_text", + rr.Clear(recursive=True), + static=self.static, + ) if not self.static: - rr.connect('100.108.67.79:9876') - debug_text = '' - mode = 'navigation' + rr.connect("100.108.67.79:9876") + debug_text = "" + mode = "navigation" obs = None localized_point = None waypoints = None - - if text is not None and text != '' and self.traj is not None: - print('saved traj', self.traj) + + if text is not None and text != "" and self.traj is not None: + print("saved traj", self.traj) traj_target_point = self.traj[-1] if self.voxel_map_localizer.verify_point(text, traj_target_point): localized_point = traj_target_point - debug_text += '## Last visual 
grounding results looks fine so directly use it.\n' + debug_text += "## Last visual grounding results looks fine so directly use it.\n" # if self.planner.verify_path(self.traj[:-2]): # waypoints = self.traj[:-2] # debug_text += '## Last path planning results looks fine so directly use it.\n' if waypoints is None: # Do visual grounding - if text is not None and text != '' and localized_point is None: + if text is not None and text != "" and localized_point is None: with self.voxel_map_lock: - localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) + ( + localized_point, + debug_text, + obs, + pointcloud, + ) = self.voxel_map_localizer.localize_A(text, debug=True, return_debug=True) if localized_point is not None: - rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) + rr.log( + "/object", + rr.Points3D( + [localized_point[0], localized_point[1], 1.5], + colors=torch.Tensor([1, 0, 0]), + radii=0.1, + ), + static=self.static, + ) # Do Frontier based exploration - if text is None or text == '' or localized_point is None: - debug_text += '## Navigation fails, so robot starts exploring environments.\n' + if text is None or text == "" or localized_point is None: + debug_text += "## Navigation fails, so robot starts exploring environments.\n" localized_point = self.sample_frontier(start_pose, text) - mode = 'exploration' - rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) - print('\n', localized_point, '\n') + mode = "exploration" + rr.log( + "/object", + rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), + static=self.static, + ) + print("\n", localized_point, "\n") if localized_point is None: return [] - + if len(localized_point) == 2: localized_point = np.array([localized_point[0], localized_point[1], 0]) @@ -280,44 +329,46 @@ def process_text(self, text, start_pose): if self.rerun: buf = BytesIO() - plt.savefig(buf, format='png') + plt.savefig(buf, format="png") buf.seek(0) img = Image.open(buf) img = np.array(img) buf.close() - rr.log('2d_map', rr.Image(img), static = self.static) + rr.log("2d_map", rr.Image(img), static=self.static) else: - if text != '': - plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') + if text != "": + plt.savefig(self.log + "/debug_" + text + str(self.obs_count) + ".png") else: - plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') + plt.savefig(self.log + "/debug_exploration" + str(self.obs_count) + ".png") plt.clf() - if obs is not None and mode == 'navigation': + if obs is not None and mode == "navigation": rgb = self.voxel_map.observations[obs - 1].rgb if not self.rerun: if isinstance(rgb, torch.Tensor): rgb = np.array(rgb) - cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) + cv2.imwrite(self.log + "/debug_" + text + ".png", rgb[:, :, [2, 1, 0]]) else: - rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) + rr.log( + "/Past_observation_most_similar_to_text", rr.Image(rgb), static=self.static + ) waypoints = None if point is None: - print('Unable to find any target point, some exception might happen') + print("Unable to find any target point, some exception might happen") else: - print('Target point is', point) + print("Target point is", point) res = self.planner.plan(start_pose, point) if res.success: waypoints = [pt.state for pt in 
res.trajectory] else: waypoints = None - print('[FAILURE]', res.reason) - # If we are navigating to some object of interst, send (x, y, z) of + print("[FAILURE]", res.reason) + # If we are navigating to some object of interst, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation traj = [] if waypoints is not None: - finished = len(waypoints) <= 10 and mode == 'navigation' + finished = len(waypoints) <= 10 and mode == "navigation" if finished: self.traj = None else: @@ -330,31 +381,47 @@ def process_text(self, text, start_pose): if isinstance(localized_point, torch.Tensor): localized_point = localized_point.tolist() traj.append(localized_point) - print('Planned trajectory:', traj) + print("Planned trajectory:", traj) if self.rerun: - if text is not None and text != '': - debug_text = '### The goal is to navigate to ' + text + '.\n' + debug_text + if text is not None and text != "": + debug_text = "### The goal is to navigate to " + text + ".\n" + debug_text else: - debug_text = '### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n' - debug_text = '# Robot\'s monologue: \n' + debug_text - rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) - + debug_text = "### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n" + debug_text = "# Robot's monologue: \n" + debug_text + rr.log( + "robot_monologue", + rr.TextDocument(debug_text, media_type=rr.MediaType.MARKDOWN), + static=self.static, + ) + if traj is not None: origins = [] vectors = [] for idx in range(len(traj)): if idx != len(traj) - 1: origins.append([traj[idx][0], traj[idx][1], 1.5]) - vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) - rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.1), static = self.static) - rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) + vectors.append( + [traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0] + ) + rr.log( + "/direction", + rr.Arrows3D( + origins=origins, vectors=vectors, colors=torch.Tensor([0, 1, 0]), radii=0.1 + ), + static=self.static, + ) + rr.log( + "/robot_start_pose", + rr.Points3D( + [start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1 + ), + static=self.static, + ) # self.write_to_pickle() return traj - - # def process_text(self, text, start_pose): # if self.rerun: # # if not self.static: @@ -366,7 +433,7 @@ def process_text(self, text, start_pose): # rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) # if not self.static: # rr.connect('100.108.67.79:9876') - + # debug_text = '' # mode = 'navigation' # obs = None @@ -386,7 +453,7 @@ def process_text(self, text, start_pose): # if localized_point is None: # return [] - + # if len(localized_point) == 2: # localized_point = np.array([localized_point[0], localized_point[1], 0]) @@ -435,7 +502,7 @@ def process_text(self, text, start_pose): # res = self.planner.plan(start_pose, point) # if res.success: # waypoints = [pt.state for pt in res.trajectory] - # # If we are navigating to some object of interst, send (x, y, z) of + # # If we are navigating to some object of interst, send (x, y, z) of # # the 
object so that we can make sure the robot looks at the object after navigation # print(waypoints) # # finished = (len(waypoints) <= 4 or torch.linalg.norm(torch.Tensor(point)[:2] - torch.Tensor(start_pose[:2])) > 0.8) and mode == 'navigation' @@ -453,7 +520,7 @@ def process_text(self, text, start_pose): # print('Planned trajectory:', traj) # else: # print('[FAILURE]', res.reason) - + # if traj is not None: # origins = [] # vectors = [] @@ -463,11 +530,11 @@ def process_text(self, text, start_pose): # vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) # rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) # rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) - + # self.write_to_pickle() # return traj - def sample_navigation(self, start, point, mode = 'navigation'): + def sample_navigation(self, start, point, mode="navigation"): # plt.clf() obstacles, _ = self.voxel_map.get_2d_map() plt.imshow(obstacles) @@ -475,50 +542,60 @@ def sample_navigation(self, start, point, mode = 'navigation'): start_pt = self.planner.to_pt(start) # plt.scatter(start_pt[1], start_pt[0], s = 10) return None - goal = self.space.sample_target_point(start, point, self.planner, exploration = mode != 'navigation') + goal = self.space.sample_target_point( + start, point, self.planner, exploration=mode != "navigation" + ) print("point:", point, "goal:", goal) obstacles, explored = self.voxel_map.get_2d_map() start_pt = self.planner.to_pt(start) - plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') + plt.scatter(start_pt[1], start_pt[0], s=15, c="b") point_pt = self.planner.to_pt(point) - plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'r') + plt.scatter(point_pt[1], point_pt[0], s=15, c="r") if goal is not None: goal_pt = self.planner.to_pt(goal) - plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'g') + plt.scatter(goal_pt[1], goal_pt[0], s=10, c="g") return goal - def sample_frontier(self, start_pose = [0, 0, 0], text = None): + def sample_frontier(self, start_pose=[0, 0, 0], text=None): with self.voxel_map_lock: - if text is not None and text != '': - index, time_heuristics, alignments_heuristics, total_heuristics = self.space.sample_exploration(start_pose, self.planner, self.voxel_map_localizer, text, debug = False) + if text is not None and text != "": + ( + index, + time_heuristics, + alignments_heuristics, + total_heuristics, + ) = self.space.sample_exploration( + start_pose, self.planner, self.voxel_map_localizer, text, debug=False + ) else: - index, time_heuristics, _, total_heuristics = self.space.sample_exploration(start_pose, self.planner, None, None, debug = False) + index, time_heuristics, _, total_heuristics = self.space.sample_exploration( + start_pose, self.planner, None, None, debug=False + ) alignments_heuristics = time_heuristics - + obstacles, explored = self.voxel_map.get_2d_map() # plt.clf() # plt.imshow(obstacles * 0.5 + explored * 0.5) # plt.scatter(index[1], index[0], s = 20, c = 'r') return self.voxel_map.grid_coords_to_xyt(torch.tensor([index[0], index[1]])) - def _recv_image(self): while True: # data = recv_array(self.img_socket) rgb, depth, intrinsics, pose = recv_everything(self.img_socket) - print('Image received') + print("Image received") start_time = time.time() self.process_rgbd_images(rgb, depth, intrinsics, pose) process_time = time.time() - start_time - print('Image 
processing takes', process_time, 'seconds') - print('processing took ' + str(process_time) + ' seconds') + print("Image processing takes", process_time, "seconds") + print("processing took " + str(process_time) + " seconds") def forward_one_block(self, resblocks, x): q, k, v = None, None, None y = resblocks.ln_1(x) y = F.linear(y, resblocks.attn.in_proj_weight, resblocks.attn.in_proj_bias) N, L, C = y.shape - y = y.view(N, L, 3, C//3).permute(2, 0, 1, 3).reshape(3*N, L, C//3) + y = y.view(N, L, 3, C // 3).permute(2, 0, 1, 3).reshape(3 * N, L, C // 3) y = F.linear(y, resblocks.attn.out_proj.weight, resblocks.attn.out_proj.bias) q, k, v = y.tensor_split(3, dim=0) v += x @@ -530,7 +607,7 @@ def forward_one_block_siglip(self, resblocks, x): q, k, v = None, None, None x = F.linear(x, resblocks.in_proj_weight, resblocks.in_proj_bias) N, L, C = x.shape - x = x.view(N, L, 3, C//3).permute(2, 0, 1, 3).reshape(3*N, L, C//3) + x = x.view(N, L, 3, C // 3).permute(2, 0, 1, 3).reshape(3 * N, L, C // 3) x = F.linear(x, resblocks.out_proj.weight, resblocks.out_proj.bias) q, k, v = x.tensor_split(3, dim=0) @@ -539,14 +616,16 @@ def forward_one_block_siglip(self, resblocks, x): def extract_mask_clip_features(self, x, image_shape): if self.siglip: with torch.no_grad(): - output = self.clip_model.vision_model(x['pixel_values'], output_hidden_states = True) + output = self.clip_model.vision_model(x["pixel_values"], output_hidden_states=True) feat = output.last_hidden_state feat = self.forward_one_block_siglip(self.clip_model.vision_model.head.attention, feat) feat = self.clip_model.vision_model.head.layernorm(feat) feat = feat + self.clip_model.vision_model.head.mlp(feat) feat = feat.detach().cpu() with torch.no_grad(): - N, L, H, W = self.clip_model.vision_model.embeddings.patch_embedding(x['pixel_values']).shape + N, L, H, W = self.clip_model.vision_model.embeddings.patch_embedding( + x["pixel_values"] + ).shape feat = feat.reshape(N, H, W, L).permute(0, 3, 1, 2) else: with torch.no_grad(): @@ -554,7 +633,14 @@ def extract_mask_clip_features(self, x, image_shape): N, L, H, W = x.shape x = x.reshape(x.shape[0], x.shape[1], -1) x = x.permute(0, 2, 1) - x = torch.cat([self.clip_model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) + x = torch.cat( + [ + self.clip_model.visual.class_embedding.to(x.dtype) + + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), + x, + ], + dim=1, + ) x = x + self.clip_model.visual.positional_embedding.to(x.dtype) x = self.clip_model.visual.ln_pre(x) x = x.permute(1, 0, 2) @@ -568,37 +654,56 @@ def extract_mask_clip_features(self, x, image_shape): x = self.clip_model.visual.ln_post(x) x = x @ self.clip_model.visual.proj feat = x.reshape(N, H, W, -1).permute(0, 3, 1, 2) - feat = F.interpolate(feat, image_shape, mode = 'bilinear', align_corners = True) - feat = F.normalize(feat, dim = 1) + feat = F.interpolate(feat, image_shape, mode="bilinear", align_corners=True) + feat = F.normalize(feat, dim=1) return feat.permute(0, 2, 3, 1) - + def run_mask_clip(self, rgb, mask, world_xyz): with torch.no_grad(): - results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False) + results = self.yolo_model.predict( + rgb.permute(1, 2, 0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False + ) xyxy_tensor = results[0].boxes.xyxy if len(xyxy_tensor) == 0: return - bounding_boxes = torch.stack(sorted(xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), 
reverse = True), dim = 0) + bounding_boxes = torch.stack( + sorted( + xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse=True + ), + dim=0, + ) bbox_mask = torch.zeros_like(mask) for box in bounding_boxes: tl_x, tl_y, br_x, br_y = box - bbox_mask[max(int(tl_y), 0): min(int(br_y), rgb.shape[1]), max(int(tl_x), 0): min(int(br_x), rgb.shape[2])] = 1 + bbox_mask[ + max(int(tl_y), 0) : min(int(br_y), rgb.shape[1]), + max(int(tl_x), 0) : min(int(br_x), rgb.shape[2]), + ] = 1 bbox_mask = bbox_mask.bool() # print(mask, bbox_mask) mask = torch.logical_or(mask, ~bbox_mask) - + if not self.siglip: - if self.device == 'cpu': - input = self.clip_preprocess(transforms.ToPILImage()(rgb)).unsqueeze(0).to(self.device) + if self.device == "cpu": + input = ( + self.clip_preprocess(transforms.ToPILImage()(rgb)) + .unsqueeze(0) + .to(self.device) + ) else: - input = self.clip_preprocess(transforms.ToPILImage()(rgb)).unsqueeze(0).to(self.device).half() + input = ( + self.clip_preprocess(transforms.ToPILImage()(rgb)) + .unsqueeze(0) + .to(self.device) + .half() + ) else: input = self.clip_preprocess(images=rgb, padding="max_length", return_tensors="pt") for i in input: input[i] = input[i].to(self.device) features = self.extract_mask_clip_features(input, rgb.shape[-2:])[0].cpu() - + valid_xyz = world_xyz[~mask] features = features[~mask] valid_rgb = rgb.permute(1, 2, 0)[~mask] @@ -607,28 +712,35 @@ def run_mask_clip(self, rgb, mask, world_xyz): def run_owl_sam_clip(self, rgb, mask, world_xyz): with torch.no_grad(): - results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False) + results = self.yolo_model.predict( + rgb.permute(1, 2, 0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False + ) xyxy_tensor = results[0].boxes.xyxy if len(xyxy_tensor) == 0: return - self.mask_predictor.set_image(rgb.permute(1,2,0).numpy()) + self.mask_predictor.set_image(rgb.permute(1, 2, 0).numpy()) # bounding_boxes = torch.stack(sorted(results[0]['boxes'], key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse = True), dim = 0) - bounding_boxes = torch.stack(sorted(xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse = True), dim = 0) + bounding_boxes = torch.stack( + sorted( + xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse=True + ), + dim=0, + ) # transformed_boxes = self.mask_predictor.transform.apply_boxes_torch(bounding_boxes.detach().to(self.device), rgb.shape[-2:]) # masks, _, _= self.mask_predictor.predict_torch( - masks, _, _= self.mask_predictor.predict( + masks, _, _ = self.mask_predictor.predict( point_coords=None, point_labels=None, # boxes=transformed_boxes, - box = bounding_boxes, - multimask_output=False + box=bounding_boxes, + multimask_output=False, ) if masks.ndim == 3: masks = torch.Tensor(masks).bool() else: masks = torch.Tensor(masks[:, 0, :, :]).bool() - + # Debug code, visualize all bounding boxes and segmentation masks image_vis = np.array(rgb.permute(1, 2, 0)) @@ -637,8 +749,10 @@ def run_owl_sam_clip(self, rgb, mask, world_xyz): for idx, box in enumerate(bounding_boxes): tl_x, tl_y, br_x, br_y = box tl_x, tl_y, br_x, br_y = tl_x.item(), tl_y.item(), br_x.item(), br_y.item() - cv2.rectangle(image_vis, (int(tl_x), int(tl_y)), (int(br_x), int(br_y)), (255, 0, 0), 2) - image_vis = cv2.cvtColor(image_vis, cv2.COLOR_RGB2BGR) + cv2.rectangle( + image_vis, (int(tl_x), int(tl_y)), (int(br_x), int(br_y)), (255, 0, 0), 2 + ) + image_vis = cv2.cvtColor(image_vis, cv2.COLOR_RGB2BGR) for vis_mask 
in masks: segmentation_color_map[vis_mask.detach().cpu().numpy()] = [0, 255, 0] image_vis = cv2.addWeighted(image_vis, 0.7, segmentation_color_map, 0.3, 0) @@ -646,21 +760,38 @@ def run_owl_sam_clip(self, rgb, mask, world_xyz): cv2.imwrite(self.log + "/seg" + str(self.obs_count) + ".jpg", image_vis) # else: # rr.log('Segmentation mask', rr.Image(image_vis[:, :, [2, 1, 0]])) - + crops = [] if not self.siglip: for (box, mask) in zip(bounding_boxes, masks): tl_x, tl_y, br_x, br_y = box - crops.append(self.clip_preprocess(transforms.ToPILImage()(rgb[:, max(int(tl_y), 0): min(int(br_y), rgb.shape[1]), max(int(tl_x), 0): min(int(br_x), rgb.shape[2])]))) - features = self.clip_model.encode_image(torch.stack(crops, dim = 0).to(self.device)) + crops.append( + self.clip_preprocess( + transforms.ToPILImage()( + rgb[ + :, + max(int(tl_y), 0) : min(int(br_y), rgb.shape[1]), + max(int(tl_x), 0) : min(int(br_x), rgb.shape[2]), + ] + ) + ) + ) + features = self.clip_model.encode_image(torch.stack(crops, dim=0).to(self.device)) else: for box in bounding_boxes: tl_x, tl_y, br_x, br_y = box - crops.append(rgb[:, max(int(tl_y), 0): min(int(br_y), rgb.shape[1]), max(int(tl_x), 0): min(int(br_x), rgb.shape[2])]) - inputs = self.clip_preprocess(images = crops, padding="max_length", return_tensors="pt").to(self.device) + crops.append( + rgb[ + :, + max(int(tl_y), 0) : min(int(br_y), rgb.shape[1]), + max(int(tl_x), 0) : min(int(br_x), rgb.shape[2]), + ] + ) + inputs = self.clip_preprocess( + images=crops, padding="max_length", return_tensors="pt" + ).to(self.device) features = self.clip_model.get_image_features(**inputs) - features = F.normalize(features, dim = -1).cpu() - + features = F.normalize(features, dim=-1).cpu() for idx, (sam_mask, feature) in enumerate(zip(masks.cpu(), features.cpu())): valid_mask = torch.logical_and(~mask, sam_mask) @@ -670,10 +801,10 @@ def run_owl_sam_clip(self, rgb, mask, world_xyz): feature = feature.repeat(valid_xyz.shape[0], 1) valid_rgb = rgb.permute(1, 2, 0)[valid_mask] self.add_to_voxel_pcd(valid_xyz, feature, valid_rgb) - - def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights = None, threshold = 0.95): + + def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights=None, threshold=0.95): # Adding all points to voxelizedPointCloud is useless and expensive, we should exclude threshold of all points - selected_indices = torch.randperm(len(valid_xyz))[:int((1 - threshold) * len(valid_xyz))] + selected_indices = torch.randperm(len(valid_xyz))[: int((1 - threshold) * len(valid_xyz))] if len(selected_indices) == 0: return if valid_xyz is not None: @@ -685,11 +816,13 @@ def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights = None, thresh if weights is not None: weights = weights[selected_indices] with self.voxel_map_lock: - self.voxel_map_localizer.add(points = valid_xyz, - features = feature, - rgb = valid_rgb, - weights = weights, - obs_count = self.obs_count) + self.voxel_map_localizer.add( + points=valid_xyz, + features=feature, + rgb=valid_rgb, + weights=weights, + obs_count=self.obs_count, + ) def process_rgbd_images(self, rgb, depth, intrinsics, pose): # import time @@ -698,29 +831,38 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): os.mkdir(self.log) self.obs_count += 1 - cv2.imwrite(self.log + '/rgb' + str(self.obs_count) + '.jpg', rgb[:, :, [2, 1, 0]]) - np.save(self.log + '/rgb' + str(self.obs_count) + '.npy', rgb) - np.save(self.log + '/depth' + str(self.obs_count) + '.npy', depth) - np.save(self.log + '/intrinsics' + 
str(self.obs_count) + '.npy', intrinsics) - np.save(self.log + '/pose' + str(self.obs_count) + '.npy', pose) + cv2.imwrite(self.log + "/rgb" + str(self.obs_count) + ".jpg", rgb[:, :, [2, 1, 0]]) + np.save(self.log + "/rgb" + str(self.obs_count) + ".npy", rgb) + np.save(self.log + "/depth" + str(self.obs_count) + ".npy", depth) + np.save(self.log + "/intrinsics" + str(self.obs_count) + ".npy", intrinsics) + np.save(self.log + "/pose" + str(self.obs_count) + ".npy", pose) rgb, depth = torch.from_numpy(rgb), torch.from_numpy(depth) rgb = rgb.permute(2, 0, 1).to(torch.uint8) - + h, w = self.image_shape h_image, w_image = depth.shape - rgb = F.interpolate(rgb.unsqueeze(0), size=self.image_shape, mode='bilinear', align_corners=False).squeeze(0) - depth = F.interpolate(depth.unsqueeze(0).unsqueeze(0), size=self.image_shape, mode='bilinear', align_corners=False).squeeze(0).squeeze(0) + rgb = F.interpolate( + rgb.unsqueeze(0), size=self.image_shape, mode="bilinear", align_corners=False + ).squeeze(0) + depth = ( + F.interpolate( + depth.unsqueeze(0).unsqueeze(0), + size=self.image_shape, + mode="bilinear", + align_corners=False, + ) + .squeeze(0) + .squeeze(0) + ) intrinsics[0, 0] *= w / w_image intrinsics[1, 1] *= h / h_image intrinsics[0, 2] *= w / w_image intrinsics[1, 2] *= h / h_image world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) - - median_depth = torch.from_numpy( - scipy.ndimage.median_filter(depth, size=5) - ) + + median_depth = torch.from_numpy(scipy.ndimage.median_filter(depth, size=5)) median_filter_error = (depth - median_depth).abs() # edges = get_edges(depth, threshold=0.5) valid_depth = torch.logical_and(depth < self.max_depth, depth > self.min_depth) @@ -729,25 +871,32 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): & (median_filter_error < 0.01).bool() # & ~edges ) - + with self.voxel_map_lock: - if self.vision_method == 'mask&*lip': + if self.vision_method == "mask&*lip": min_samples_clear = 3 else: min_samples_clear = 10 - self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose), min_samples_clear = min_samples_clear) - self.voxel_map.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) - - if self.vision_method == 'mask&*lip': + self.voxel_map_localizer.voxel_pcd.clear_points( + depth, + torch.from_numpy(intrinsics), + torch.from_numpy(pose), + min_samples_clear=min_samples_clear, + ) + self.voxel_map.voxel_pcd.clear_points( + depth, torch.from_numpy(intrinsics), torch.from_numpy(pose) + ) + + if self.vision_method == "mask&*lip": self.run_owl_sam_clip(rgb, ~valid_depth, world_xyz) else: self.run_mask_clip(rgb, ~valid_depth, world_xyz) self.voxel_map.add( - camera_pose = torch.Tensor(pose), - rgb = torch.Tensor(rgb).permute(1, 2, 0), - depth = torch.Tensor(depth), - camera_K = torch.Tensor(intrinsics) + camera_pose=torch.Tensor(pose), + rgb=torch.Tensor(rgb).permute(1, 2, 0), + depth=torch.Tensor(depth), + camera_K=torch.Tensor(intrinsics), ) obs, exp = self.voxel_map.get_2d_map() if self.rerun: @@ -755,35 +904,43 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): # rr.set_time_sequence("frame", self.obs_count) # rr.log('robot_pov', rr.Image(rgb.permute(1, 2, 0)), static = self.static) if self.voxel_map.voxel_pcd._points is not None: - rr.log("Obstalce_map/pointcloud", rr.Points3D(self.voxel_map.voxel_pcd._points.detach().cpu(), \ - colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + rr.log( + 
"Obstalce_map/pointcloud", + rr.Points3D( + self.voxel_map.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) if self.voxel_map_localizer.voxel_pcd._points is not None: - rr.log("Semantic_memory/pointcloud", rr.Points3D(self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), \ - colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + rr.log( + "Semantic_memory/pointcloud", + rr.Points3D( + self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) # rr.log("Obstalce_map/2D_obs_map", rr.Image(obs.int() * 127 + exp.int() * 127)) else: - cv2.imwrite(self.log + '/debug_' + str(self.obs_count) + '.jpg', np.asarray(obs.int() * 127 + exp.int() * 127)) + cv2.imwrite( + self.log + "/debug_" + str(self.obs_count) + ".jpg", + np.asarray(obs.int() * 127 + exp.int() * 127), + ) # end_time = time.time() # print('image processing takes ' + str(end_time - start_time) + ' seconds.') def read_from_pickle(self, pickle_file_name, num_frames: int = -1): - print('Reading from ', pickle_file_name) - rr.init('Debug', spawn = True) + print("Reading from ", pickle_file_name) + rr.init("Debug", spawn=True) if isinstance(pickle_file_name, str): pickle_file_name = Path(pickle_file_name) assert pickle_file_name.exists(), f"No file found at {pickle_file_name}" with pickle_file_name.open("rb") as f: data = pickle.load(f) - for i, ( - camera_pose, - xyz, - rgb, - feats, - depth, - base_pose, - K, - world_xyz, - ) in enumerate( + for i, (camera_pose, xyz, rgb, feats, depth, base_pose, K, world_xyz,) in enumerate( zip( data["camera_poses"], data["xyz"], @@ -842,14 +999,18 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): self.voxel_map_localizer.voxel_pcd._rgb = data["combined_rgb"] self.voxel_map_localizer.voxel_pcd._obs_counts = data["obs_id"] self.voxel_map_localizer.voxel_pcd._entity_ids = data["entity_id"] - self.voxel_map_localizer.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() - self.voxel_map.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() + self.voxel_map_localizer.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() + self.voxel_map.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() def write_to_pickle(self): """Write out to a pickle file. 
This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" - if not os.path.exists('debug'): - os.mkdir('debug') - filename = 'debug/' + self.log + '.pkl' + if not os.path.exists("debug"): + os.mkdir("debug") + filename = "debug/" + self.log + ".pkl" data = {} data["camera_poses"] = [] data["camera_K"] = [] @@ -884,21 +1045,22 @@ def write_to_pickle(self): data["entity_id"] = self.voxel_map_localizer.voxel_pcd._entity_ids with open(filename, "wb") as f: pickle.dump(data, f) - print('write all data to', filename) + print("write all data to", filename) + # @hydra.main(version_base="1.2", config_path=".", config_name="config.yaml") # def main(cfg): # torch.manual_seed(1) # imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) - # imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) - # if not cfg.pickle_file_name is None: - # imageProcessor.read_from_pickle(cfg.pickle_file_name) - # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) - # if cfg.open_communication: - # while True: - # imageProcessor.recv_text() - # imageProcessor.read_from_pickle('env.pkl', -1) - # imageProcessor.write_to_pickle() +# imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) +# if not cfg.pickle_file_name is None: +# imageProcessor.read_from_pickle(cfg.pickle_file_name) +# print(imageProcessor.voxel_map_localizer.voxel_pcd._points) +# if cfg.open_communication: +# while True: +# imageProcessor.recv_text() +# imageProcessor.read_from_pickle('env.pkl', -1) +# imageProcessor.write_to_pickle() if __name__ == "__main__": main(None) diff --git a/src/stretch/motion/kinematics.py b/src/stretch/motion/kinematics.py index bd645fcdc..b22716511 100644 --- a/src/stretch/motion/kinematics.py +++ b/src/stretch/motion/kinematics.py @@ -567,7 +567,7 @@ def manip_ik( update_pb: bool = True, num_attempts: int = 1, verbose: bool = False, - node_name = None + node_name=None, ): """IK in manipulation mode. Takes in a 4x4 pose_query matrix in se(3) and initial configuration of the robot. 
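The node_name argument added to manip_ik above, and plumbed through to compute_ik in the pinocchio_ik_solver.py hunks that follow, lets the IK solver target an arbitrary URDF frame instead of the default end-effector frame. A rough usage sketch under stated assumptions: `solver` is an already-constructed PinocchioIKSolver for the Stretch URDF, `q0` is the robot's current joint configuration in the solver's control space (obtained elsewhere), and "link_straight_gripper" is a hypothetical frame name used only for illustration.

    import numpy as np

    pos = np.array([0.3, -0.2, 0.8])       # desired frame position in the base frame
    quat = np.array([0.0, 0.0, 0.0, 1.0])  # desired orientation quaternion

    # q0: current joint configuration (placeholder, read from the robot client).
    q, success, debug_info = solver.compute_ik(
        pos, quat, q0, num_attempts=1, verbose=False, node_name="link_straight_gripper"
    )
    if success:
        # With node_name set, the named frame rather than the default end-effector
        # frame is driven to the requested pose; q can then be sent to the robot.
        print("IK solution:", q)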
@@ -592,7 +592,7 @@ def manip_ik( raise NotImplementedError() q, success, debug_info = self.manip_ik_solver.compute_ik( - pos, quat, q0, num_attempts=num_attempts, verbose=verbose, node_name = node_name + pos, quat, q0, num_attempts=num_attempts, verbose=verbose, node_name=node_name ) if q is not None and success: diff --git a/src/stretch/motion/pinocchio_ik_solver.py b/src/stretch/motion/pinocchio_ik_solver.py index 656aa2229..d783f02cd 100644 --- a/src/stretch/motion/pinocchio_ik_solver.py +++ b/src/stretch/motion/pinocchio_ik_solver.py @@ -134,10 +134,16 @@ def _qmap_model2control(self, q_input: np.ndarray) -> np.ndarray: return q_out - def get_frame_pose(self, config: Union[np.ndarray, dict], node_a: str, node_b: str, ignore_missing_joints: bool = False): - ''' - Get a transformation matrix transforming from node_a frame to node_b frame - ''' + def get_frame_pose( + self, + config: Union[np.ndarray, dict], + node_a: str, + node_b: str, + ignore_missing_joints: bool = False, + ): + """ + Get a transformation matrix transforming from node_a frame to node_b frame + """ q_model = self._qmap_control2model(config, ignore_missing_joints=ignore_missing_joints) # print('q_model', q_model) pinocchio.forwardKinematics(self.model, self.data, q_model) @@ -194,7 +200,7 @@ def compute_ik( num_attempts: int = 1, verbose: bool = False, ignore_missing_joints: bool = False, - node_name = None + node_name=None, ) -> Tuple[np.ndarray, bool, dict]: """given end-effector position and quaternion, return joint values. @@ -221,7 +227,7 @@ def compute_ik( while True: pinocchio.forwardKinematics(self.model, self.data, q) if node_name is not None: - frame_idx = [f.name for f in self.model.frames].index(node_name) + frame_idx = [f.name for f in self.model.frames].index(node_name) else: frame_idx = self.ee_frame_idx pinocchio.updateFramePlacement(self.model, self.data, frame_idx) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 668ec5ca3..0939c1d6c 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -211,7 +211,7 @@ def __init__( vectors=[[1, 0, 0], [0, 1, 0], [0, 0, 1]], colors=[[255, 0, 0], [0, 255, 0], [0, 0, 255]], ), - static = True + static=True, ) self.bbox_colors_memory = {} @@ -241,10 +241,14 @@ def setup_blueprint(self, collapse_panels: bool): def log_head_camera(self, obs): """Log head camera pose and images""" rr.set_time_seconds("realtime", time.time()) - rr.log("world/head_camera/rgb", rr.Image(obs["rgb"]), static = True) - rr.log("world/head_camera/depth", rr.DepthImage(obs["depth"]), static = True) + rr.log("world/head_camera/rgb", rr.Image(obs["rgb"]), static=True) + rr.log("world/head_camera/depth", rr.DepthImage(obs["depth"]), static=True) rot, trans = decompose_homogeneous_matrix(obs["camera_pose"]) - rr.log("world/head_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static = True) + rr.log( + "world/head_camera", + rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), + static=True, + ) rr.log( "world/head_camera", rr.Pinhole( @@ -252,7 +256,7 @@ def log_head_camera(self, obs): image_from_camera=obs["camera_K"], image_plane_distance=0.15, ), - static = True + static=True, ) def log_robot_xyt(self, obs): @@ -268,10 +272,14 @@ def log_robot_xyt(self, obs): colors=[255, 0, 0, 255], ) - rr.log("world/robot/arrow", rb_arrow, static = True) - rr.log("world/robot/blob", rr.Points3D([0, 0, 0], colors=[255, 0, 0, 255], radii=0.13), static = True) + rr.log("world/robot/arrow", rb_arrow, 
static=True) + rr.log( + "world/robot/blob", + rr.Points3D([0, 0, 0], colors=[255, 0, 0, 255], radii=0.13), + static=True, + ) # rr.log("world/robot/arrow", rb_arrow) - + rr.log( "world/robot", rr.Transform3D( @@ -279,7 +287,7 @@ def log_robot_xyt(self, obs): rotation=rr.RotationAxisAngle(axis=[0, 0, 1], radians=theta), axis_length=0.7, ), - static = True + static=True, ) def log_ee_frame(self, obs): @@ -293,8 +301,10 @@ def log_ee_frame(self, obs): ee_arrow = rr.Arrows3D( origins=[0, 0, 0], vectors=[0.2, 0, 0], radii=0.02, labels="ee", colors=[0, 255, 0, 255] ) - rr.log("world/ee/arrow", ee_arrow, static = True) - rr.log("world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static = True) + rr.log("world/ee/arrow", ee_arrow, static=True) + rr.log( + "world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static=True + ) def log_ee_camera(self, servo): """Log end effector camera pose and images @@ -303,10 +313,14 @@ def log_ee_camera(self, servo): """ rr.set_time_seconds("realtime", time.time()) # EE Camera - rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb), static = True) - rr.log("world/ee_camera/depth", rr.DepthImage(servo.ee_depth), static = True) + rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb), static=True) + rr.log("world/ee_camera/depth", rr.DepthImage(servo.ee_depth), static=True) rot, trans = decompose_homogeneous_matrix(servo.ee_camera_pose) - rr.log("world/ee_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static = True) + rr.log( + "world/ee_camera", + rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), + static=True, + ) rr.log( "world/ee_camera", rr.Pinhole( @@ -314,7 +328,7 @@ def log_ee_camera(self, servo): image_from_camera=servo.ee_camera_K, image_plane_distance=0.15, ), - static = True + static=True, ) def log_robot_state(self, obs): @@ -322,7 +336,11 @@ def log_robot_state(self, obs): rr.set_time_seconds("realtime", time.time()) state = obs["joint"] for k in HelloStretchIdx.name_to_idx: - rr.log(f"robot_state/joint_pose/{k}", rr.Scalar(state[HelloStretchIdx.name_to_idx[k]]), static = True) + rr.log( + f"robot_state/joint_pose/{k}", + rr.Scalar(state[HelloStretchIdx.name_to_idx[k]]), + static=True, + ) def log_robot_transforms(self, obs): """ @@ -350,7 +368,7 @@ def update_voxel_map( rr.log( "world/point_cloud", rr.Points3D(positions=points, radii=np.ones(rgb.shape[0]) * 0.01, colors=np.int64(rgb)), - static = True + static=True, ) t1 = timeit.default_timer() @@ -378,7 +396,7 @@ def update_voxel_map( radii=np.ones(points.shape[0]) * obstacle_radius, colors=[255, 0, 0], ), - static = True + static=True, ) rr.log( "world/explored", @@ -387,7 +405,7 @@ def update_voxel_map( radii=np.ones(points.shape[0]) * explored_radius, colors=[255, 255, 255], ), - static = True + static=True, ) t6 = timeit.default_timer() @@ -426,7 +444,7 @@ def update_scene_graph( rr.log( f"world/{instance.id}_{name}", rr.Points3D(positions=point_cloud_rgb, colors=np.int64(pcd_rgb)), - static = True + static=True, ) half_sizes = [(b[0] - b[1]) / 2 for b in bbox_bounds] bounds.append(half_sizes) @@ -444,7 +462,7 @@ def update_scene_graph( radii=0.01, colors=colors, ), - static = True + static=True, ) t1 = timeit.default_timer() print("Time to log scene graph objects: ", t1 - t0) From 15b4fc293fd62ba57bc7aa84ec055bd43d2453bd Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 11:22:35 -0400 Subject: [PATCH 214/410] Run pre-commit hooks and fix formatting; many errors remain --- 
src/stretch/agent/zmq_client.py | 36 +- src/stretch/dynav/__init__.py | 11 +- src/stretch/dynav/communication_util.py | 55 +- src/stretch/dynav/llm_localizer.py | 270 +++++--- src/stretch/dynav/llm_server.py | 426 +++++++----- src/stretch/dynav/mapping_utils/__init__.py | 16 +- src/stretch/dynav/mapping_utils/a_star.py | 66 +- src/stretch/dynav/mapping_utils/voxel.py | 125 ++-- src/stretch/dynav/mapping_utils/voxel_map.py | 148 ++-- .../dynav/mapping_utils/voxelized_pcd.py | 153 +++-- src/stretch/dynav/ok_robot_hw/README.md | 27 +- src/stretch/dynav/ok_robot_hw/camera.py | 37 +- .../dynav/ok_robot_hw/global_parameters.py | 13 +- .../dynav/ok_robot_hw/image_publisher.py | 41 +- src/stretch/dynav/ok_robot_hw/robot.py | 233 ++++--- .../dynav/ok_robot_hw/utils/__init__.py | 13 +- .../ok_robot_hw/utils/communication_utils.py | 42 +- .../dynav/ok_robot_hw/utils/grasper_utils.py | 191 +++--- src/stretch/dynav/ok_robot_hw/utils/utils.py | 19 +- src/stretch/dynav/robot_agent_manip_mdp.py | 299 ++++---- src/stretch/dynav/run_manip.py | 43 +- src/stretch/dynav/run_ok_nav.py | 88 +-- src/stretch/dynav/scannet.py | 424 ++++++------ src/stretch/dynav/voxel_map_localizer.py | 189 ++++-- src/stretch/dynav/voxel_map_server.py | 636 +++++++++++------- src/stretch/motion/kinematics.py | 4 +- src/stretch/motion/pinocchio_ik_solver.py | 18 +- src/stretch/visualization/rerun.py | 60 +- 28 files changed, 2168 insertions(+), 1515 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index d2fd6958c..c03a99d03 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -18,6 +18,7 @@ import click import numpy as np import zmq +from scipy.spatial.transform import Rotation as R from termcolor import colored import stretch.motion.constants as constants @@ -29,15 +30,12 @@ from stretch.core.robot import AbstractRobotClient from stretch.motion import PlanResult, RobotModel from stretch.motion.kinematics import HelloStretchIdx, HelloStretchKinematics -from stretch.utils.geometry import angle_difference +from stretch.utils.geometry import angle_difference, posquat2sophus, sophus2posquat from stretch.utils.image import Camera from stretch.utils.memory import lookup_address from stretch.utils.point_cloud import show_point_cloud from stretch.visualization.rerun import RerunVsualizer -from scipy.spatial.transform import Rotation -from stretch.utils.geometry import posquat2sophus, sophus2posquat - # TODO: debug code - remove later if necessary # import faulthandler # faulthandler.enable() @@ -250,8 +248,8 @@ def get_joint_positions(self, timeout: float = 5.0) -> np.ndarray: def get_six_joints(self, timeout: float = 5.0) -> np.ndarray: """Get the six major joint positions""" - joint_positions = self.get_joint_positions(timeout = timeout) - return np.array(self._extract_joint_pos(joint_positions)) + joint_positions = self.get_joint_positions(timeout=timeout) + return np.array(self._extract_joint_pos(joint_positions)) def get_joint_velocities(self, timeout: float = 5.0) -> np.ndarray: """Get the current joint velocities""" @@ -308,10 +306,10 @@ def get_gripper_position(self): joint_state = self.get_joint_positions() return joint_state[HelloStretchIdx.GRIPPER] - def get_ee_pose(self, matrix=False, link_name=None, q = None): + def get_ee_pose(self, matrix=False, link_name=None, q=None): if q is None: q = self.get_joint_positions() - pos, quat = self._robot_model.manip_fk(q, node = link_name) + pos, quat = self._robot_model.manip_fk(q, node=link_name) if matrix: pose = 
posquat2sophus(pos, quat) @@ -329,7 +327,7 @@ def solve_ik( relative: bool = False, initial_cfg: np.ndarray = None, debug: bool = False, - node_name = None + node_name=None, ) -> Optional[np.ndarray]: """Solve inverse kinematics appropriately (or at least try to) and get the joint position that we will be moving to. @@ -369,7 +367,7 @@ def solve_ik( # Perform IK full_body_cfg, ik_success, ik_debug_info = self._robot_model.manip_ik( - (pos_ik_goal, quat_ik_goal), q0=initial_cfg, node_name = node_name + (pos_ik_goal, quat_ik_goal), q0=initial_cfg, node_name=node_name ) # Expected to return None if we did not get a solution @@ -422,11 +420,15 @@ def head_to( def look_front(self, blocking: bool = True, timeout: float = 10.0): """Let robot look to its front.""" - self.head_to(constants.look_front[0], constants.look_front[1], blocking = blocking, timeout = timeout) + self.head_to( + constants.look_front[0], constants.look_front[1], blocking=blocking, timeout=timeout + ) def look_at_ee(self, blocking: bool = True, timeout: float = 10.0): """Let robot look to its arm.""" - self.head_to(constants.look_at_ee[0], constants.look_at_ee[1], blocking = blocking, timeout = timeout) + self.head_to( + constants.look_at_ee[0], constants.look_at_ee[1], blocking=blocking, timeout=timeout + ) def arm_to( self, @@ -579,9 +581,7 @@ def navigate_to( self._rerun.update_nav_goal(xyt) self.send_action(next_action, timeout=timeout, verbose=verbose) - def set_velocity( - self, v: float, w: float - ): + def set_velocity(self, v: float, w: float): """Move to xyt in global coordinates or relative coordinates.""" next_action = {"v": v, "w": w} self.send_action(next_action) @@ -957,8 +957,8 @@ def get_observation(self): observation.camera_pose = self._obs.get("camera_pose", None) observation.seq_id = self._seq_id return observation - - def get_images(self, compute_xyz = False): + + def get_images(self, compute_xyz=False): obs = self.get_observation() if compute_xyz: return obs.rgb, obs.depth, obs.xyz @@ -983,7 +983,7 @@ def execute_trajectory( per_waypoint_timeout: float = 10.0, final_timeout: float = 10.0, relative: bool = False, - blocking: bool = False + blocking: bool = False, ): """Execute a multi-step trajectory; this is always blocking since it waits to reach each one in turn.""" diff --git a/src/stretch/dynav/__init__.py b/src/stretch/dynav/__init__.py index 3448d7994..770e2711e 100644 --- a/src/stretch/dynav/__init__.py +++ b/src/stretch/dynav/__init__.py @@ -1 +1,10 @@ -from .robot_agent_manip_mdp import RobotAgentMDP \ No newline at end of file +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +from .robot_agent_manip_mdp import RobotAgentMDP diff --git a/src/stretch/dynav/communication_util.py b/src/stretch/dynav/communication_util.py index 7680ed147..aa5d5b192 100644 --- a/src/stretch/dynav/communication_util.py +++ b/src/stretch/dynav/communication_util.py @@ -1,6 +1,16 @@ -import zmq -import numpy as np +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. 
Original +# license information maybe found below, if so. + import cv2 +import numpy as np +import zmq + def load_socket(port_number): context = zmq.Context() @@ -9,48 +19,58 @@ def load_socket(port_number): return socket + def send_array(socket, A, flags=0, copy=True, track=False): """send a numpy array with metadata""" A = np.array(A) md = dict( - dtype = str(A.dtype), - shape = A.shape, + dtype=str(A.dtype), + shape=A.shape, ) - socket.send_json(md, flags|zmq.SNDMORE) + socket.send_json(md, flags | zmq.SNDMORE) return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + def recv_array(socket, flags=0, copy=True, track=False): """recv a numpy array""" md = socket.recv_json(flags=flags) msg = socket.recv(flags=flags, copy=copy, track=track) - A = np.frombuffer(msg, dtype=md['dtype']) - return A.reshape(md['shape']) + A = np.frombuffer(msg, dtype=md["dtype"]) + return A.reshape(md["shape"]) + def send_rgb_img(socket, img): - img = img.astype(np.uint8) + img = img.astype(np.uint8) encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] - _, img_encoded = cv2.imencode('.jpg', img, encode_param) + _, img_encoded = cv2.imencode(".jpg", img, encode_param) socket.send(img_encoded.tobytes()) + def recv_rgb_img(socket): img = socket.recv() img = np.frombuffer(img, dtype=np.uint8) img = cv2.imdecode(img, cv2.IMREAD_COLOR) return img + def send_depth_img(socket, depth_img): depth_img = (depth_img * 1000).astype(np.uint16) - encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 3] # Compression level from 0 (no compression) to 9 (max compression) - _, depth_img_encoded = cv2.imencode('.png', depth_img, encode_param) + encode_param = [ + int(cv2.IMWRITE_PNG_COMPRESSION), + 3, + ] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode(".png", depth_img, encode_param) socket.send(depth_img_encoded.tobytes()) + def recv_depth_img(socket): depth_img = socket.recv() depth_img = np.frombuffer(depth_img, dtype=np.uint8) depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) - depth_img = (depth_img / 1000.) + depth_img = depth_img / 1000.0 return depth_img + def send_everything(socket, rgb, depth, intrinsics, pose): send_rgb_img(socket, rgb) socket.recv_string() @@ -61,13 +81,14 @@ def send_everything(socket, rgb, depth, intrinsics, pose): send_array(socket, pose) socket.recv_string() + def recv_everything(socket): rgb = recv_rgb_img(socket) - socket.send_string('') + socket.send_string("") depth = recv_depth_img(socket) - socket.send_string('') + socket.send_string("") intrinsics = recv_array(socket) - socket.send_string('') + socket.send_string("") pose = recv_array(socket) - socket.send_string('') - return rgb, depth, intrinsics, pose \ No newline at end of file + socket.send_string("") + return rgb, depth, intrinsics, pose diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py index 4dd767e20..d99168f5d 100644 --- a/src/stretch/dynav/llm_localizer.py +++ b/src/stretch/dynav/llm_localizer.py @@ -1,27 +1,32 @@ -from io import BytesIO -from concurrent.futures import ThreadPoolExecutor, as_completed -import time +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
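The communication helpers above implement a lock-step protocol: each image or array is answered with an empty-string acknowledgement before the next field goes out, so send_everything and recv_everything must be called as a matching pair, with the four fields always in the same order (rgb, depth, intrinsics, pose). A minimal receiving-side sketch, assuming load_socket returns a bound ZMQ socket as in this patch and that the sending side (not shown here) calls send_everything on its own connected socket; the port number is illustrative:

    # Hedged sketch of the receiving side of the send_everything / recv_everything pairing.
    from stretch.dynav.communication_util import load_socket, recv_everything

    img_socket = load_socket(5555)  # illustrative port; ImageProcessor defaults to img_port=5555
    rgb, depth, intrinsics, pose = recv_everything(img_socket)

    # The sender mirrors this with the same four fields in the same order:
    #     send_everything(socket, rgb, depth, intrinsics, pose)
    # Depth is quantized to uint16 millimeters for PNG transport, so values come back
    # with roughly 1 mm resolution after the division by 1000.0 on this side.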
+ +import base64 import os +import time +from collections import OrderedDict +from concurrent.futures import ThreadPoolExecutor, as_completed +from io import BytesIO +from typing import Optional -import torch +import google.generativeai as genai import numpy as np +import torch +from openai import OpenAI from PIL import Image - -from typing import Optional from torch import Tensor +from transformers import AutoProcessor, Owlv2ForObjectDetection -from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates from stretch.dynav.mapping_utils import VoxelizedPointcloud +from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates -from transformers import AutoProcessor -from transformers import Owlv2ForObjectDetection - -import google.generativeai as genai -from openai import OpenAI -import base64 -from collections import OrderedDict - -genai.configure(api_key=os.getenv('GOOGLE_API_KEY')) +genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) generation_config = genai.GenerationConfig(temperature=0) safety_settings = [ { @@ -46,9 +51,15 @@ }, ] + def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) - fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) inv_intrinsics = torch.zeros_like(intrinsics) inv_intrinsics[..., 0, 0] = 1.0 / fx inv_intrinsics[..., 1, 1] = 1.0 / fy @@ -57,6 +68,7 @@ def get_inv_intrinsics(intrinsics): inv_intrinsics[..., 2, 2] = 1.0 return inv_intrinsics + def get_xyz(depth, pose, intrinsics): """Returns the XYZ coordinates for a set of points. @@ -96,52 +108,57 @@ def get_xyz(depth, pose, intrinsics): xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) xyz = xyz * depth.flatten(1).unsqueeze(-1) xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] - + xyz = xyz.unflatten(1, (height, width)) return xyz -class LLM_Localizer(): - def __init__(self, voxel_map_wrapper = None, exist_model = 'gemini-1.5-pro', loc_model = 'owlv2', device = 'cuda'): +class LLM_Localizer: + def __init__( + self, voxel_map_wrapper=None, exist_model="gemini-1.5-pro", loc_model="owlv2", device="cuda" + ): self.voxel_map_wrapper = voxel_map_wrapper self.device = device self.voxel_pcd = VoxelizedPointcloud(voxel_size=0.2).to(self.device) self.existence_checking_model = exist_model - - self.api_key_1 = os.getenv('GOOGLE_API_KEY') - self.api_key_2 = os.getenv('GOOGLE_API_KEY_2') - self.api_key_3 = os.getenv('GOOGLE_API_KEY_3') - self.api_key = self.api_key_1 + self.api_key_1 = os.getenv("GOOGLE_API_KEY") + self.api_key_2 = os.getenv("GOOGLE_API_KEY_2") + self.api_key_3 = os.getenv("GOOGLE_API_KEY_3") + self.api_key = self.api_key_1 self.context_length = 60 self.count_threshold = 3 - if 'gpt' in self.existence_checking_model: + if "gpt" in self.existence_checking_model: self.max_img_per_request = 30 else: self.max_img_per_request = 200 + if exist_model == "gpt-4o": + print("WE ARE USING OPENAI GPT4o") + self.gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + elif exist_model == "gemini-1.5-pro": + print("WE ARE USING GEMINI 1.5 PRO") - if exist_model == 'gpt-4o': - print('WE ARE USING OPENAI GPT4o') - self.gpt_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY')) - elif exist_model == 'gemini-1.5-pro': - print('WE ARE USING GEMINI 1.5 PRO') - - elif exist_model == 'gemini-1.5-flash': - 
print('WE ARE USING GEMINI 1.5 FLASH') + elif exist_model == "gemini-1.5-flash": + print("WE ARE USING GEMINI 1.5 FLASH") else: - print('YOU ARE USING NOTHING!') + print("YOU ARE USING NOTHING!") self.location_checking_model = loc_model - if loc_model == 'owlv2': - self.exist_processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") - self.exist_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(self.device) - print('WE ARE USING OWLV2 FOR LOCALIZATION!') + if loc_model == "owlv2": + self.exist_processor = AutoProcessor.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ) + self.exist_model = Owlv2ForObjectDetection.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ).to(self.device) + print("WE ARE USING OWLV2 FOR LOCALIZATION!") else: - print('YOU ARE USING VOXEL MAP FOR LOCALIZATION!') - - def add(self, + print("YOU ARE USING VOXEL MAP FOR LOCALIZATION!") + + def add( + self, points: Tensor, features: Optional[Tensor], rgb: Optional[Tensor], @@ -155,35 +172,42 @@ def add(self, rgb = rgb.to(self.device) if weights is not None: weights = weights.to(self.device) - self.voxel_pcd.add(points = points, - features = features, - rgb = rgb, - weights = weights, - obs_count = obs_count) - - def compute_coord(self, text, image_info, threshold = 0.2): - rgb = image_info['image'] + self.voxel_pcd.add( + points=points, features=features, rgb=rgb, weights=weights, obs_count=obs_count + ) + + def compute_coord(self, text, image_info, threshold=0.2): + rgb = image_info["image"] inputs = self.exist_processor(text=text, images=rgb, return_tensors="pt") for input in inputs: - inputs[input] = inputs[input].to('cuda') - + inputs[input] = inputs[input].to("cuda") + with torch.no_grad(): outputs = self.exist_model(**inputs) - + target_sizes = torch.Tensor([rgb.size[::-1]]).to(self.device) results = self.exist_processor.image_processor.post_process_object_detection( outputs, threshold=threshold, target_sizes=target_sizes )[0] - depth = image_info['depth'] - xyzs = image_info['xyz'] + depth = image_info["depth"] + xyzs = image_info["xyz"] temp_lst = [] - for idx, (score, bbox) in enumerate(sorted(zip(results['scores'], results['boxes']), key=lambda x: x[0], reverse=True)): - + for idx, (score, bbox) in enumerate( + sorted(zip(results["scores"], results["boxes"]), key=lambda x: x[0], reverse=True) + ): + tl_x, tl_y, br_x, br_y = bbox w, h = depth.shape - tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) - if np.median(depth[tl_y: br_y, tl_x: br_x].reshape(-1)) < 3: - coordinate = torch.from_numpy(np.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), axis = 0)) + tl_x, tl_y, br_x, br_y = ( + int(max(0, tl_x.item())), + int(max(0, tl_y.item())), + int(min(h, br_x.item())), + int(min(w, br_y.item())), + ) + if np.median(depth[tl_y:br_y, tl_x:br_x].reshape(-1)) < 3: + coordinate = torch.from_numpy( + np.median(xyzs[tl_y:br_y, tl_x:br_x].reshape(-1, 3), axis=0) + ) # temp_lst.append(coordinate) return coordinate return None @@ -198,9 +222,13 @@ def owl_locater(self, A, encoded_image, timestamps_lst): image_info = encoded_image[i][-1] res = self.compute_coord(A, image_info, threshold=0.2) if res is not None: - debug_text = '#### - Obejct is detected in observations where instance' + str(i + 1) + ' comes from. **😃** Directly navigate to it.\n' + debug_text = ( + "#### - Obejct is detected in observations where instance" + + str(i + 1) + + " comes from. 
**😃** Directly navigate to it.\n" + ) return res, debug_text, i, None - debug_text = '#### - All instances are not the target! Maybe target object has not been observed yet. **😭**\n' + debug_text = "#### - All instances are not the target! Maybe target object has not been observed yet. **😭**\n" # if query_coord_lst != []: # query_coord_lst = np.array(query_coord_lst) @@ -218,28 +246,39 @@ def owl_locater(self, A, encoded_image, timestamps_lst): # centroid = largest_cluster_points.mean(axis=0) # return centroid, debug_text, None, None return None, debug_text, None, None - + def process_chunk(self, chunk, sys_prompt, user_prompt): for i in range(50): try: - if 'gpt' in self.existence_checking_model: + if "gpt" in self.existence_checking_model: start_time = time.time() - response = self.gpt_client.chat.completions.create( - model=self.existence_checking_model, - messages=[ - {"role": "system", "content": sys_prompt}, - {"role": "user", "content": user_prompt}, - {"role": "user", "content": chunk} + response = ( + self.gpt_client.chat.completions.create( + model=self.existence_checking_model, + messages=[ + {"role": "system", "content": sys_prompt}, + {"role": "user", "content": user_prompt}, + {"role": "user", "content": chunk}, ], temperature=0.0, - ).choices[0].message.content - + ) + .choices[0] + .message.content + ) + end_time = time.time() - print('GPT request cost:', end_time-start_time) + print("GPT request cost:", end_time - start_time) else: - model = genai.GenerativeModel(model_name=f"models/{self.existence_checking_model}-exp-0827", system_instruction=sys_prompt) + model = genai.GenerativeModel( + model_name=f"models/{self.existence_checking_model}-exp-0827", + system_instruction=sys_prompt, + ) # "models/{self.existence_checking_model}-exp-0827" - response = model.generate_content(chunk + [user_prompt], generation_config=generation_config, safety_settings=safety_settings).text + response = model.generate_content( + chunk + [user_prompt], + generation_config=generation_config, + safety_settings=safety_settings, + ).text # print("Assistant: ", response) return response except Exception as e: @@ -256,17 +295,17 @@ def process_chunk(self, chunk, sys_prompt, user_prompt): return "Execution Failed" def update_dict(self, A, timestamps_dict, content): - timestamps_lst = content.split('\n') + timestamps_lst = content.split("\n") for item in timestamps_lst: - if len(item) < 3 or ':' not in item: + if len(item) < 3 or ":" not in item: continue - key, value_str = item.split(':') + key, value_str = item.split(":") if A not in key: continue - if 'None' in value_str: + if "None" in value_str: value = None else: - value = list(map(int, value_str.replace(' ', '').split(','))) + value = list(map(int, value_str.replace(" ", "").split(","))) if key in timestamps_dict: if timestamps_dict[key] is None: timestamps_dict[key] = value @@ -277,7 +316,7 @@ def update_dict(self, A, timestamps_dict, content): def llm_locator(self, A, encoded_image): timestamps_dict = {} - + sys_prompt = f""" For object query I give, you need to find timestamps of images that the object is shown, without any unnecessary explanation or space. If the object never exist, please output the object name and the word "None" for timestamps. 
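update_dict above expects each response line to follow exactly the name: comma-separated-timestamps convention that this system prompt (continued in the next hunk, including the worked examples) asks the model to produce, with the literal word None marking objects that were never observed. A small worked sketch of that parsing on a made-up response, mirroring the logic of update_dict:

    # Hedged sketch: the response text is invented for illustration.
    response = "red bottle: 3,7,12\nwhite box: None"

    timestamps = {}
    for line in response.split("\n"):
        if ":" not in line:
            continue
        key, value_str = line.split(":")
        if "None" in value_str:
            timestamps[key] = None
        else:
            timestamps[key] = list(map(int, value_str.replace(" ", "").split(",")))

    print(timestamps)  # {'red bottle': [3, 7, 12], 'white box': None}

owl_locater then re-checks the referenced frames with OWLv2, newest first, and returns the first detection whose median depth is within range, so a usable 3D point can be read off the back-projected pixels.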
@@ -291,16 +330,26 @@ def llm_locator(self, A, encoded_image): white box: 2,4,10""" user_prompt = f"""The object you need to find is {A}""" - if 'gpt' in self.existence_checking_model: - content = [item for sublist in list(encoded_image.values()) for item in sublist[:2]][-self.context_length*2:] + if "gpt" in self.existence_checking_model: + content = [item for sublist in list(encoded_image.values()) for item in sublist[:2]][ + -self.context_length * 2 : + ] else: - content = [item for sublist in list(encoded_image.values()) for item in sublist[0]][-self.context_length*2:] - - content_chunks = [content[i:i + 2* self.max_img_per_request] for i in range(0, len(content), 2* self.max_img_per_request)] - + content = [item for sublist in list(encoded_image.values()) for item in sublist[0]][ + -self.context_length * 2 : + ] + + content_chunks = [ + content[i : i + 2 * self.max_img_per_request] + for i in range(0, len(content), 2 * self.max_img_per_request) + ] + with ThreadPoolExecutor(max_workers=5) as executor: - future_to_chunk = {executor.submit(self.process_chunk, chunk, sys_prompt, user_prompt): chunk for chunk in content_chunks} - + future_to_chunk = { + executor.submit(self.process_chunk, chunk, sys_prompt, user_prompt): chunk + for chunk in content_chunks + } + for future in as_completed(future_to_chunk): chunk = future_to_chunk[future] try: @@ -310,23 +359,22 @@ def llm_locator(self, A, encoded_image): except Exception as e: print(f"Exception occurred: {e}") if A not in timestamps_dict: - return None, 'debug_text', None, None + return None, "debug_text", None, None timestamps_lst = timestamps_dict[A] if timestamps_lst is None: - return None, 'debug_text', None, None + return None, "debug_text", None, None timestamps_lst = sorted(timestamps_lst, reverse=True) # return None return self.owl_locater(A, encoded_image, timestamps_lst) - - def localize_A(self, A, debug = True, return_debug = False): + def localize_A(self, A, debug=True, return_debug=False): encoded_image = OrderedDict() counts = torch.bincount(self.voxel_pcd._obs_counts) filtered_obs = (counts > self.count_threshold).nonzero(as_tuple=True)[0].tolist() filtered_obs = sorted(filtered_obs) - for obs_id in filtered_obs: + for obs_id in filtered_obs: obs_id -= 1 rgb = np.copy(self.voxel_map_wrapper.observations[obs_id].rgb.numpy()) depth = self.voxel_map_wrapper.observations[obs_id].depth @@ -337,28 +385,38 @@ def localize_A(self, A, debug = True, return_debug = False): rgb[depth > 2.5] = [0, 0, 0] - image = Image.fromarray(rgb.astype(np.uint8), mode='RGB') - if 'gemini' in self.existence_checking_model: - encoded_image[obs_id] = [[f"Following is the image took on timestamp {obs_id}: ", image], {'image':image, 'xyz': xyz, 'depth':depth}] - elif 'gpt' in self.existence_checking_model: + image = Image.fromarray(rgb.astype(np.uint8), mode="RGB") + if "gemini" in self.existence_checking_model: + encoded_image[obs_id] = [ + [f"Following is the image took on timestamp {obs_id}: ", image], + {"image": image, "xyz": xyz, "depth": depth}, + ] + elif "gpt" in self.existence_checking_model: buffered = BytesIO() image.save(buffered, format="PNG") img_bytes = buffered.getvalue() - base64_encoded = base64.b64encode(img_bytes).decode('utf-8') - encoded_image[obs_id] = [{"type": "text", "text": f"Following is the image took on timestamp {obs_id}: "}, - {"type": "image_url", "image_url": { - "url": f"data:image/png;base64,{base64_encoded}"} - }, {'image':image, 'xyz':xyz, 'depth':depth}] + base64_encoded = 
base64.b64encode(img_bytes).decode("utf-8") + encoded_image[obs_id] = [ + { + "type": "text", + "text": f"Following is the image took on timestamp {obs_id}: ", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{base64_encoded}"}, + }, + {"image": image, "xyz": xyz, "depth": depth}, + ] # print(obs_id) - + start_time = time.time() target_point, debug_text, obs, point = self.llm_locator(A, encoded_image) end_time = time.time() - print('It takes', end_time - start_time) + print("It takes", end_time - start_time) if not debug: return target_point elif not return_debug: return target_point, debug_text else: - return target_point, debug_text, obs, point \ No newline at end of file + return target_point, debug_text, obs, point diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py index de83a4a6d..eaec0c5c4 100644 --- a/src/stretch/dynav/llm_server.py +++ b/src/stretch/dynav/llm_server.py @@ -1,52 +1,69 @@ -import zmq +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. -from stretch.dynav.scannet import CLASS_LABELS_200 +import datetime +import os +import pickle +import threading +# import wget +import time +from collections import OrderedDict +from io import BytesIO +from pathlib import Path + +import clip import cv2 +import hydra import numpy as np +import open3d as o3d +import rerun as rr +import scipy import torch import torch.nn.functional as F import torchvision.transforms.functional as V - -import clip +import zmq +from matplotlib import pyplot as plt +from PIL import Image from torchvision import transforms +from transformers import AutoModel, AutoProcessor -import os -# import wget -import time - -import open3d as o3d - -from matplotlib import pyplot as plt -import pickle -from pathlib import Path -from stretch.dynav.llm_localizer import LLM_Localizer from stretch.core import get_parameters +from stretch.dynav.communication_util import ( + load_socket, + recv_array, + recv_depth_img, + recv_everything, + recv_rgb_img, + send_array, + send_depth_img, + send_everything, + send_rgb_img, +) +from stretch.dynav.llm_localizer import LLM_Localizer from stretch.dynav.mapping_utils import ( - SparseVoxelMap, - SparseVoxelMapNavigationSpace, AStar, - VoxelizedPointcloud + SparseVoxelMap, + SparseVoxelMapNavigationSpace, + VoxelizedPointcloud, ) +from stretch.dynav.scannet import CLASS_LABELS_200 -import datetime - -import threading -import scipy -import hydra - -from transformers import AutoProcessor, AutoModel -import rerun as rr - -from io import BytesIO -from PIL import Image - -from stretch.dynav.communication_util import load_socket, send_array, recv_array, send_rgb_img, recv_rgb_img, send_depth_img, recv_depth_img, send_everything, recv_everything -from collections import OrderedDict def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) - fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) inv_intrinsics = torch.zeros_like(intrinsics) inv_intrinsics[..., 0, 0] = 1.0 / fx inv_intrinsics[..., 1, 1] = 1.0 / fy @@ -55,6 +72,7 @@ def 
get_inv_intrinsics(intrinsics): inv_intrinsics[..., 2, 2] = 1.0 return inv_intrinsics + def get_xyz(depth, pose, intrinsics): """Returns the XYZ coordinates for a set of points. @@ -94,35 +112,37 @@ def get_xyz(depth, pose, intrinsics): xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) xyz = xyz * depth.flatten(1).unsqueeze(-1) xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] - + xyz = xyz.unflatten(1, (height, width)) return xyz + class ImageProcessor: - def __init__(self, - vision_method = 'pro_owl', - siglip = True, - device = 'cuda', - min_depth = 0.25, - max_depth = 2.5, - img_port = 5555, - text_port = 5556, - open_communication = True, - rerun = True, - static = True, - log = None + def __init__( + self, + vision_method="pro_owl", + siglip=True, + device="cuda", + min_depth=0.25, + max_depth=2.5, + img_port=5555, + text_port=5556, + open_communication=True, + rerun=True, + static=True, + log=None, ): self.static = static self.siglip = siglip current_datetime = datetime.datetime.now() if log is None: - self.log = 'debug_' + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + self.log = "debug_" + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") else: self.log = log self.rerun = rerun if self.rerun: - rr.init(self.log, spawn = True) + rr.init(self.log, spawn=True) # if self.static: # rr.init(self.log, spawn = False) # rr.connect('100.108.67.79:9876') @@ -137,14 +157,16 @@ def __init__(self, self.vision_method = vision_method # If cuda is not available, then device will be forced to be cpu if not torch.cuda.is_available(): - device = 'cpu' + device = "cpu" self.device = device self.create_obstacle_map() self.create_vision_model() - self.voxel_map_lock = threading.Lock() # Create a lock for synchronizing access to `self.voxel_map_localizer` - + self.voxel_map_lock = ( + threading.Lock() + ) # Create a lock for synchronizing access to `self.voxel_map_localizer` + if open_communication: self.img_socket = load_socket(img_port) self.text_socket = load_socket(text_port) @@ -152,7 +174,7 @@ def __init__(self, self.img_thread = threading.Thread(target=self._recv_image) self.img_thread.daemon = True self.img_thread.start() - + def create_obstacle_map(self): print("- Load parameters") parameters = get_parameters("dynav_config.yaml") @@ -162,29 +184,21 @@ def create_obstacle_map(self): local_radius=parameters["local_radius"], obs_min_height=parameters["obs_min_height"], obs_max_height=parameters["obs_max_height"], - obs_min_density = parameters["obs_min_density"], - exp_min_density = parameters["exp_min_density"], + obs_min_density=parameters["obs_min_density"], + exp_min_density=parameters["exp_min_density"], min_depth=self.min_depth, max_depth=self.max_depth, pad_obstacles=parameters["pad_obstacles"], - add_local_radius_points=parameters.get( - "add_local_radius_points", default=True - ), + add_local_radius_points=parameters.get("add_local_radius_points", default=True), remove_visited_from_obstacles=parameters.get( "remove_visited_from_obstacles", default=False ), smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), use_median_filter=parameters.get("filters/use_median_filter", False), median_filter_size=parameters.get("filters/median_filter_size", 5), - median_filter_max_error=parameters.get( - "filters/median_filter_max_error", 0.01 - ), - use_derivative_filter=parameters.get( - "filters/use_derivative_filter", False - ), - derivative_filter_threshold=parameters.get( - "filters/derivative_filter_threshold", 0.5 - ) + 
median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), + use_derivative_filter=parameters.get("filters/use_derivative_filter", False), + derivative_filter_threshold=parameters.get("filters/derivative_filter_threshold", 0.5), ) self.space = SparseVoxelMapNavigationSpace( self.voxel_map, @@ -200,42 +214,69 @@ def create_obstacle_map(self): self.planner = AStar(self.space) def create_vision_model(self): - if self.vision_method == 'gpt_owl': - self.voxel_map_localizer = LLM_Localizer(self.voxel_map, exist_model = 'gpt-4o', loc_model = 'owlv2', device = self.device) - elif self.vision_method == 'flash_owl': - self.voxel_map_localizer = LLM_Localizer(self.voxel_map, exist_model = 'gemini-1.5-flash', loc_model = 'owlv2', device = self.device) - elif self.vision_method == 'pro_owl': - self.voxel_map_localizer = LLM_Localizer(self.voxel_map, exist_model = 'gemini-1.5-pro', loc_model = 'owlv2', device = self.device) + if self.vision_method == "gpt_owl": + self.voxel_map_localizer = LLM_Localizer( + self.voxel_map, exist_model="gpt-4o", loc_model="owlv2", device=self.device + ) + elif self.vision_method == "flash_owl": + self.voxel_map_localizer = LLM_Localizer( + self.voxel_map, + exist_model="gemini-1.5-flash", + loc_model="owlv2", + device=self.device, + ) + elif self.vision_method == "pro_owl": + self.voxel_map_localizer = LLM_Localizer( + self.voxel_map, exist_model="gemini-1.5-pro", loc_model="owlv2", device=self.device + ) def process_text(self, text, start_pose): if self.rerun: - rr.log('/object', rr.Clear(recursive = True), static = self.static) - rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) - rr.log('/direction', rr.Clear(recursive = True), static = self.static) - rr.log('robot_monologue', rr.Clear(recursive = True), static = self.static) - rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) + rr.log("/object", rr.Clear(recursive=True), static=self.static) + rr.log("/robot_start_pose", rr.Clear(recursive=True), static=self.static) + rr.log("/direction", rr.Clear(recursive=True), static=self.static) + rr.log("robot_monologue", rr.Clear(recursive=True), static=self.static) + rr.log( + "/Past_observation_most_similar_to_text", + rr.Clear(recursive=True), + static=self.static, + ) if not self.static: - rr.connect('100.108.67.79:9876') - debug_text = '' - mode = 'navigation' + rr.connect("100.108.67.79:9876") + debug_text = "" + mode = "navigation" obs = None # Do visual grounding - if text is not None and text != '': + if text is not None and text != "": with self.voxel_map_lock: - localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) + localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A( + text, debug=True, return_debug=True + ) if localized_point is not None: - rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) + rr.log( + "/object", + rr.Points3D( + [localized_point[0], localized_point[1], 1.5], + colors=torch.Tensor([1, 0, 0]), + radii=0.1, + ), + static=self.static, + ) # Do Frontier based exploration - if text is None or text == '' or localized_point is None: - debug_text += '## Navigation fails, so robot starts exploring environments.\n' + if text is None or text == "" or localized_point is None: + debug_text += "## Navigation fails, so robot starts exploring environments.\n" localized_point = 
self.sample_frontier(start_pose, text) - mode = 'exploration' - rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) - print('\n', localized_point, '\n') + mode = "exploration" + rr.log( + "/object", + rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), + static=self.static, + ) + print("\n", localized_point, "\n") if localized_point is None: return [] - + if len(localized_point) == 2: localized_point = np.array([localized_point[0], localized_point[1], 0]) @@ -243,48 +284,52 @@ def process_text(self, text, start_pose): if self.rerun: buf = BytesIO() - plt.savefig(buf, format='png') + plt.savefig(buf, format="png") buf.seek(0) img = Image.open(buf) img = np.array(img) buf.close() - rr.log('2d_map', rr.Image(img), static = self.static) + rr.log("2d_map", rr.Image(img), static=self.static) else: - if text != '': - plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') + if text != "": + plt.savefig(self.log + "/debug_" + text + str(self.obs_count) + ".png") else: - plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') + plt.savefig(self.log + "/debug_exploration" + str(self.obs_count) + ".png") plt.clf() if self.rerun: - if text is not None and text != '': - debug_text = '### The goal is to navigate to ' + text + '.\n' + debug_text + if text is not None and text != "": + debug_text = "### The goal is to navigate to " + text + ".\n" + debug_text else: - debug_text = '### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n' - debug_text = '# Robot\'s monologue: \n' + debug_text - rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) + debug_text = "### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n" + debug_text = "# Robot's monologue: \n" + debug_text + rr.log( + "robot_monologue", + rr.TextDocument(debug_text, media_type=rr.MediaType.MARKDOWN), + static=self.static, + ) - if obs is not None and mode == 'navigation': + if obs is not None and mode == "navigation": rgb = self.voxel_map.observations[obs].rgb if not self.rerun: if isinstance(rgb, torch.Tensor): rgb = np.array(rgb) - cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) + cv2.imwrite(self.log + "/debug_" + text + ".png", rgb[:, :, [2, 1, 0]]) else: - rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) + rr.log("/Past_observation_most_similar_to_text", rr.Image(rgb), static=self.static) traj = [] waypoints = None if point is None: - print('Unable to find any target point, some exception might happen') + print("Unable to find any target point, some exception might happen") else: - print('Target point is', point) + print("Target point is", point) res = self.planner.plan(start_pose, point) if res.success: waypoints = [pt.state for pt in res.trajectory] - # If we are navigating to some object of interst, send (x, y, z) of + # If we are navigating to some object of interst, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation - finished = len(waypoints) <= 10 and mode == 'navigation' + finished = len(waypoints) <= 10 and mode == "navigation" if not finished: waypoints = waypoints[:8] traj = self.planner.clean_path_for_xy(waypoints) @@ -294,19 +339,33 @@ def process_text(self, text, start_pose): if 
isinstance(localized_point, torch.Tensor): localized_point = localized_point.tolist() traj.append(localized_point) - print('Planned trajectory:', traj) + print("Planned trajectory:", traj) else: - print('[FAILURE]', res.reason) - + print("[FAILURE]", res.reason) + if traj is not None: origins = [] vectors = [] for idx in range(len(traj)): if idx != len(traj) - 1: origins.append([traj[idx][0], traj[idx][1], 1.5]) - vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) - rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) - rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) + vectors.append( + [traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0] + ) + rr.log( + "/direction", + rr.Arrows3D( + origins=origins, vectors=vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05 + ), + static=self.static, + ) + rr.log( + "/robot_start_pose", + rr.Points3D( + [start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1 + ), + static=self.static, + ) # self.write_to_pickle() return traj @@ -317,34 +376,41 @@ def sample_navigation(self, start, point): plt.imshow(obstacles) if point is None: start_pt = self.planner.to_pt(start) - plt.scatter(start_pt[1], start_pt[0], s = 10) + plt.scatter(start_pt[1], start_pt[0], s=10) return None goal = self.space.sample_target_point(start, point, self.planner) print("point:", point, "goal:", goal) obstacles, explored = self.voxel_map.get_2d_map() start_pt = self.planner.to_pt(start) - plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') + plt.scatter(start_pt[1], start_pt[0], s=15, c="b") point_pt = self.planner.to_pt(point) - plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'g') + plt.scatter(point_pt[1], point_pt[0], s=15, c="g") if goal is not None: goal_pt = self.planner.to_pt(goal) - plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'r') + plt.scatter(goal_pt[1], goal_pt[0], s=10, c="r") return goal - def sample_frontier(self, start_pose = [0, 0, 0], text = None): + def sample_frontier(self, start_pose=[0, 0, 0], text=None): with self.voxel_map_lock: - if text is not None and text != '': - index, time_heuristics, alignments_heuristics, total_heuristics = self.space.sample_exploration(start_pose, self.planner, None, None, debug = False) + if text is not None and text != "": + ( + index, + time_heuristics, + alignments_heuristics, + total_heuristics, + ) = self.space.sample_exploration(start_pose, self.planner, None, None, debug=False) else: - index, time_heuristics, _, total_heuristics = self.space.sample_exploration(start_pose, self.planner, None, None, debug = False) + index, time_heuristics, _, total_heuristics = self.space.sample_exploration( + start_pose, self.planner, None, None, debug=False + ) alignments_heuristics = time_heuristics - + obstacles, explored = self.voxel_map.get_2d_map() plt.clf() plt.imshow(obstacles * 0.5 + explored * 0.5) - plt.scatter(index[1], index[0], s = 20, c = 'r') + plt.scatter(index[1], index[0], s=20, c="r") return self.voxel_map.grid_coords_to_xyt(torch.tensor([index[0], index[1]])) - + def _recv_image(self): while True: # data = recv_array(self.img_socket) @@ -352,11 +418,11 @@ def _recv_image(self): start_time = time.time() self.process_rgbd_images(rgb, depth, intrinsics, pose) process_time = time.time() - start_time - print('Image processing takes', process_time, 'seconds') - - def 
add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights = None, threshold = 0.95): + print("Image processing takes", process_time, "seconds") + + def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights=None, threshold=0.95): # Adding all points to voxelizedPointCloud is useless and expensive, we should exclude threshold of all points - selected_indices = torch.randperm(len(valid_xyz))[:int((1 - threshold) * len(valid_xyz))] + selected_indices = torch.randperm(len(valid_xyz))[: int((1 - threshold) * len(valid_xyz))] if len(selected_indices) == 0: return if valid_xyz is not None: @@ -368,11 +434,13 @@ def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights = None, thresh if weights is not None: weights = weights[selected_indices] with self.voxel_map_lock: - self.voxel_map_localizer.add(points = valid_xyz, - features = feature, - rgb = valid_rgb, - weights = weights, - obs_count = self.obs_count) + self.voxel_map_localizer.add( + points=valid_xyz, + features=feature, + rgb=valid_rgb, + weights=weights, + obs_count=self.obs_count, + ) def process_rgbd_images(self, rgb, depth, intrinsics, pose): if not os.path.exists(self.log): @@ -380,69 +448,76 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): self.obs_count += 1 world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) - cv2.imwrite('debug/rgb' + str(self.obs_count) + '.jpg', rgb[:, :, [2, 1, 0]]) + cv2.imwrite("debug/rgb" + str(self.obs_count) + ".jpg", rgb[:, :, [2, 1, 0]]) rgb, depth = torch.from_numpy(rgb), torch.from_numpy(depth) rgb = rgb.permute(2, 0, 1).to(torch.uint8) - median_depth = torch.from_numpy( - scipy.ndimage.median_filter(depth, size=5) - ) + median_depth = torch.from_numpy(scipy.ndimage.median_filter(depth, size=5)) median_filter_error = (depth - median_depth).abs() valid_depth = torch.logical_and(depth < self.max_depth, depth > self.min_depth) - valid_depth = ( - valid_depth - & (median_filter_error < 0.01).bool() - ) - + valid_depth = valid_depth & (median_filter_error < 0.01).bool() + with self.voxel_map_lock: - self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose), min_samples_clear = 20) - self.voxel_map.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) - - if '_owl' in self.vision_method: + self.voxel_map_localizer.voxel_pcd.clear_points( + depth, torch.from_numpy(intrinsics), torch.from_numpy(pose), min_samples_clear=20 + ) + self.voxel_map.voxel_pcd.clear_points( + depth, torch.from_numpy(intrinsics), torch.from_numpy(pose) + ) + + if "_owl" in self.vision_method: self.run_llm_owl(rgb, ~valid_depth, world_xyz) self.voxel_map.add( - camera_pose = torch.Tensor(pose), - rgb = torch.Tensor(rgb).permute(1, 2, 0), - depth = torch.Tensor(depth), - camera_K = torch.Tensor(intrinsics) + camera_pose=torch.Tensor(pose), + rgb=torch.Tensor(rgb).permute(1, 2, 0), + depth=torch.Tensor(depth), + camera_K=torch.Tensor(intrinsics), ) obs, exp = self.voxel_map.get_2d_map() if self.rerun: if self.voxel_map.voxel_pcd._points is not None: - rr.log("Obstalce_map/pointcloud", rr.Points3D(self.voxel_map.voxel_pcd._points.detach().cpu(), \ - colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + rr.log( + "Obstalce_map/pointcloud", + rr.Points3D( + self.voxel_map.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) if self.voxel_map_localizer.voxel_pcd._points is not None: - 
rr.log("Semantic_memory/pointcloud", rr.Points3D(self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), \ - colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + rr.log( + "Semantic_memory/pointcloud", + rr.Points3D( + self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) else: - cv2.imwrite(self.log + '/debug_' + str(self.obs_count) + '.jpg', np.asarray(obs.int() * 127 + exp.int() * 127)) - + cv2.imwrite( + self.log + "/debug_" + str(self.obs_count) + ".jpg", + np.asarray(obs.int() * 127 + exp.int() * 127), + ) + def run_llm_owl(self, rgb, mask, world_xyz): valid_xyz = world_xyz[~mask] valid_rgb = rgb.permute(1, 2, 0)[~mask] if len(valid_xyz) != 0: self.add_to_voxel_pcd(valid_xyz, None, valid_rgb) - + def read_from_pickle(self, pickle_file_name, num_frames: int = -1): - print('Reading from ', pickle_file_name) - rr.init('Debug', spawn = True) + print("Reading from ", pickle_file_name) + rr.init("Debug", spawn=True) if isinstance(pickle_file_name, str): pickle_file_name = Path(pickle_file_name) assert pickle_file_name.exists(), f"No file found at {pickle_file_name}" with pickle_file_name.open("rb") as f: data = pickle.load(f) - for i, ( - camera_pose, - xyz, - rgb, - feats, - depth, - base_pose, - K, - world_xyz, - ) in enumerate( + for i, (camera_pose, xyz, rgb, feats, depth, base_pose, K, world_xyz,) in enumerate( zip( data["camera_poses"], data["xyz"], @@ -483,15 +558,18 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): self.voxel_map_localizer.voxel_pcd._rgb = data["combined_rgb"] self.voxel_map_localizer.voxel_pcd._obs_counts = data["obs_id"] self.voxel_map_localizer.voxel_pcd._entity_ids = data["entity_id"] - self.voxel_map_localizer.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() - self.voxel_map.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() - + self.voxel_map_localizer.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() + self.voxel_map.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() def write_to_pickle(self): """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" - if not os.path.exists('debug'): - os.mkdir('debug') - filename = 'debug/' + self.log + '.pkl' + if not os.path.exists("debug"): + os.mkdir("debug") + filename = "debug/" + self.log + ".pkl" data = {} data["camera_poses"] = [] data["camera_K"] = [] @@ -526,4 +604,4 @@ def write_to_pickle(self): data["entity_id"] = self.voxel_map_localizer.voxel_pcd._entity_ids with open(filename, "wb") as f: pickle.dump(data, f) - print('write all data to', filename) + print("write all data to", filename) diff --git a/src/stretch/dynav/mapping_utils/__init__.py b/src/stretch/dynav/mapping_utils/__init__.py index 5b9355132..6003d865a 100644 --- a/src/stretch/dynav/mapping_utils/__init__.py +++ b/src/stretch/dynav/mapping_utils/__init__.py @@ -1,8 +1,18 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
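write_to_pickle dumps everything into a single pickle under debug/, named after the log, and read_from_pickle above shows which keys come back out: per-frame lists such as camera_poses and camera_K, plus the accumulated semantic-memory fields like combined_rgb, obs_id, and entity_id. A minimal offline inspection sketch, assuming such a file exists; the file name is an illustrative assumption:

    # Hedged sketch: the pickle path is an assumption; the keys follow write_to_pickle above.
    import pickle

    with open("debug/debug_2024-09-23_11-22-35.pkl", "rb") as f:
        data = pickle.load(f)

    print("keys:", sorted(data.keys()))
    print(len(data["camera_poses"]), "frames of camera poses")
    print(len(data["combined_rgb"]), "points in the accumulated semantic memory")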
+ +from .a_star import AStar +from .voxel import SparseVoxelMap +from .voxel_map import SparseVoxelMapNavigationSpace + # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .voxelized_pcd import VoxelizedPointcloud, scatter3d -from .voxel import SparseVoxelMap -from .voxel_map import SparseVoxelMapNavigationSpace -from .a_star import AStar diff --git a/src/stretch/dynav/mapping_utils/a_star.py b/src/stretch/dynav/mapping_utils/a_star.py index d7172df71..727c3a303 100644 --- a/src/stretch/dynav/mapping_utils/a_star.py +++ b/src/stretch/dynav/mapping_utils/a_star.py @@ -1,26 +1,39 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import heapq +import math import time from random import random -from typing import Callable, List, Optional, Tuple, Set +from typing import Callable, List, Optional, Set, Tuple import numpy as np +import torch -from stretch.motion import PlanResult from stretch.dynav.mapping_utils import SparseVoxelMapNavigationSpace +from stretch.motion import PlanResult -import heapq -import math -import torch -class Node(): +class Node: """Stores an individual spot in the tree""" def __init__(self, state): self.state = state + def neighbors(pt: Tuple[int, int]) -> List[Tuple[int, int]]: - return [(pt[0] + dx, pt[1] + dy) for dx in range(-1, 2) for dy in range(-1, 2) if (dx, dy) != (0, 0)] + return [ + (pt[0] + dx, pt[1] + dy) for dx in range(-1, 2) for dy in range(-1, 2) if (dx, dy) != (0, 0) + ] -class AStar(): + +class AStar: """Define RRT planning problem and parameters""" def __init__( @@ -30,7 +43,7 @@ def __init__( """Create RRT planner with configuration""" self.space = space self.reset() - + def compute_theta(self, cur_x, cur_y, end_x, end_y): theta = 0 if end_x == cur_x and end_y >= cur_y: @@ -41,7 +54,7 @@ def compute_theta(self, cur_x, cur_y, end_x, end_y): theta = np.arctan((end_y - cur_y) / (end_x - cur_x)) if end_x < cur_x: theta = theta + np.pi - # move theta into [-pi, pi] range, for this function specifically, + # move theta into [-pi, pi] range, for this function specifically, # (theta -= 2 * pi) or (theta += 2 * pi) is enough if theta > np.pi: theta = theta - 2 * np.pi @@ -67,7 +80,7 @@ def verify_path(self, path) -> bool: def point_is_occupied(self, x: int, y: int) -> bool: return not bool(self._navigable[x][y]) - + def to_pt(self, xy: Tuple[float, float]) -> Tuple[int, int]: xy = torch.tensor([xy[0], xy[1]]) pt = self.space.voxel_map.xy_to_grid_coords(xy) @@ -96,9 +109,12 @@ def compute_obstacle_punishment(self, a: Tuple[int, int], weight: int, avoid: in return obstacle_punishment # A* heuristic - def compute_heuristic(self, a: Tuple[int, int], b: Tuple[int, int], weight = 6, avoid = 3) -> float: - return self.compute_dis(a, b) + weight * self.compute_obstacle_punishment(a, weight, avoid)\ + def compute_heuristic(self, a: Tuple[int, int], b: Tuple[int, int], weight=6, avoid=3) -> float: + return ( + self.compute_dis(a, b) + + weight * self.compute_obstacle_punishment(a, weight, avoid) + self.compute_obstacle_punishment(b, weight, avoid) + ) def is_in_line_of_sight(self, start_pt: Tuple[int, int], end_pt: Tuple[int, int]) -> bool: """Checks if 
there is a line-of-sight between two points. @@ -150,7 +166,9 @@ def clean_path_for_xy(self, waypoints): waypoints = [self.to_xy(waypt) for waypt in waypts] traj = [] for i in range(len(waypoints) - 1): - theta = self.compute_theta(waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1]) + theta = self.compute_theta( + waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1] + ) traj.append([waypoints[i][0], waypoints[i][1], float(theta)]) traj.append([waypoints[-1][0], waypoints[-1][1], goal[-1]]) return traj @@ -177,7 +195,7 @@ def clean_path(self, path) -> List[Tuple[int, int]]: if self.is_in_line_of_sight(path[i][:2], path[j][:2]): break else: - j = i + 1 + j = i + 1 # Include the mid waypoint to avoid the collision # if j - i >= 2: # cleaned_path.append(path[(i + j) // 2]) @@ -185,7 +203,7 @@ def clean_path(self, path) -> List[Tuple[int, int]]: i = j return cleaned_path - def get_unoccupied_neighbor(self, pt: Tuple[int, int], goal_pt = None) -> Tuple[int, int]: + def get_unoccupied_neighbor(self, pt: Tuple[int, int], goal_pt=None) -> Tuple[int, int]: if not self.point_is_occupied(*pt): return pt @@ -199,7 +217,9 @@ def get_unoccupied_neighbor(self, pt: Tuple[int, int], goal_pt = None) -> Tuple[ for neighbor_pt in neighbor_pts: if not self.point_is_occupied(*neighbor_pt): return neighbor_pt - print("The robot might stand on a non navigable point, so check obstacle map and restart roslaunch") + print( + "The robot might stand on a non navigable point, so check obstacle map and restart roslaunch" + ) return None # raise ValueError("The robot might stand on a non navigable point, so check obstacle map and restart roslaunch") @@ -305,20 +325,22 @@ def plan(self, start, goal, verbose: bool = True) -> PlanResult: if not self.space.is_valid(goal): if verbose: print("[Planner] invalid goal") - return PlanResult(False, reason = "[Planner] invalid goal") + return PlanResult(False, reason="[Planner] invalid goal") # Add start to the tree # print('Start running A* ', time.time() - self.start_time, ' seconds after path planning starts') - waypoints = self.run_astar(start[:2], goal[:2]) + waypoints = self.run_astar(start[:2], goal[:2]) # print('Finish running A* ', time.time() - self.start_time, ' seconds after path planning starts') if waypoints is None: if verbose: print("A* fails, check obstacle map") - return PlanResult(False, reason = "A* fails, check obstacle map") + return PlanResult(False, reason="A* fails, check obstacle map") trajectory = [] for i in range(len(waypoints) - 1): - theta = self.compute_theta(waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1]) + theta = self.compute_theta( + waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1] + ) trajectory.append(Node([waypoints[i][0], waypoints[i][1], float(theta)])) trajectory.append(Node([waypoints[-1][0], waypoints[-1][1], goal[-1]])) # print('Finish computing theta ', time.time() - self.start_time, ' seconds after path planning starts') - return PlanResult(True, trajectory = trajectory) \ No newline at end of file + return PlanResult(True, trajectory=trajectory) diff --git a/src/stretch/dynav/mapping_utils/voxel.py b/src/stretch/dynav/mapping_utils/voxel.py index 9fdbd6334..625e3cc98 100644 --- a/src/stretch/dynav/mapping_utils/voxel.py +++ b/src/stretch/dynav/mapping_utils/voxel.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. 
+# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the @@ -5,36 +14,34 @@ import copy import logging import math +import os import pickle +import time from collections import namedtuple from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +import cv2 import numpy as np import open3d as open3d import scipy import skimage import torch +import torch.nn.functional as F +import torchvision.transforms.functional as V import trimesh +from scipy.ndimage import gaussian_filter, maximum_filter from torch import Tensor +from torchvision import transforms from stretch.core.interfaces import Observations +from stretch.dynav.mapping_utils import VoxelizedPointcloud, scatter3d from stretch.motion import Footprint, PlanResult, RobotModel from stretch.utils.data_tools.dict import update from stretch.utils.morphology import binary_dilation, binary_erosion, get_edges from stretch.utils.point_cloud import create_visualization_geometries, numpy_to_pcd from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates from stretch.utils.visualization import create_disk -from stretch.dynav.mapping_utils import VoxelizedPointcloud, scatter3d - -import torch.nn.functional as F -import torchvision.transforms.functional as V -from torchvision import transforms -import os -import cv2 -import time - -from scipy.ndimage import gaussian_filter, maximum_filter Frame = namedtuple( "Frame", @@ -129,7 +136,9 @@ def __init__( resolution(float): in meters, size of a voxel feature_dim(int): size of feature embeddings to capture per-voxel point """ - print('------------------------YOU ARE NOW RUNNING PEIQI VOXEL MAP CODES V3-----------------') + print( + "------------------------YOU ARE NOW RUNNING PEIQI VOXEL MAP CODES V3-----------------" + ) # TODO: We an use fastai.store_attr() to get rid of this boilerplate code self.resolution = resolution self.feature_dim = feature_dim @@ -248,11 +257,9 @@ def add_obs( depth = self.fix_data_type(obs.depth) xyz = self.fix_data_type(obs.xyz) camera_pose = self.fix_data_type(obs.camera_pose) - base_pose = torch.from_numpy( - np.array([obs.gps[0], obs.gps[1], obs.compass[0]]) - ).float() + base_pose = torch.from_numpy(np.array([obs.gps[0], obs.gps[1], obs.compass[0]])).float() K = self.fix_data_type(obs.camera_K) if camera_K is None else camera_K - + self.add( camera_pose=camera_pose, xyz=xyz, @@ -317,9 +324,7 @@ def add( assert ( feats.shape[-1] == self.feature_dim ), f"features must match voxel feature dimenstionality of {self.feature_dim}" - assert ( - xyz.shape[0] == feats.shape[0] - ), "features must be available for each point" + assert xyz.shape[0] == feats.shape[0], "features must be available for each point" else: pass if isinstance(xyz, np.ndarray): @@ -329,9 +334,7 @@ def add( if camera_K is not None: assert camera_K.ndim == 2, "camera intrinsics K must be a 3x3 matrix" assert ( - camera_pose.ndim == 2 - and camera_pose.shape[0] == 4 - and camera_pose.shape[1] == 4 + camera_pose.ndim == 2 and camera_pose.shape[0] == 4 and camera_pose.shape[1] == 4 ), "Camera pose must be a 4x4 matrix representing a pose in SE(3)" assert ( xyz_frame in VALID_FRAMES @@ -349,8 +352,7 @@ def add( if xyz is 
not None: if xyz_frame == "camera": full_world_xyz = ( - torch.cat([xyz, torch.ones_like(xyz[..., [0]])], dim=-1) - @ camera_pose.T + torch.cat([xyz, torch.ones_like(xyz[..., [0]])], dim=-1) @ camera_pose.T )[..., :3] elif xyz_frame == "world": full_world_xyz = xyz @@ -389,8 +391,7 @@ def add( if self.use_median_filter: valid_depth = ( - valid_depth - & (median_filter_error < self.median_filter_max_error).bool() + valid_depth & (median_filter_error < self.median_filter_max_error).bool() ) # Add to voxel grid @@ -401,7 +402,9 @@ def add( # TODO: weights could also be confidence, inv distance from camera, etc if world_xyz.nelement() > 0: - selected_indices = torch.randperm(len(world_xyz))[:int((1 - self.point_update_threshold) * len(world_xyz))] + selected_indices = torch.randperm(len(world_xyz))[ + : int((1 - self.point_update_threshold) * len(world_xyz)) + ] if len(selected_indices) == 0: return if world_xyz is not None: @@ -524,16 +527,7 @@ def read_from_pickle(self, filename: str, num_frames: int = -1): assert filename.exists(), f"No file found at {filename}" with filename.open("rb") as f: data = pickle.load(f) - for i, ( - camera_pose, - xyz, - rgb, - feats, - depth, - base_pose, - K, - world_xyz, - ) in enumerate( + for i, (camera_pose, xyz, rgb, feats, depth, base_pose, K, world_xyz,) in enumerate( zip( data["camera_poses"], data["xyz"], @@ -586,9 +580,7 @@ def recompute_map(self): **frame.info, ) - def get_2d_alignment_heuristics( - self, voxel_map_localizer, text, debug: bool = False - ): + def get_2d_alignment_heuristics(self, voxel_map_localizer, text, debug: bool = False): if voxel_map_localizer.voxel_pcd._points is None: return None # Convert metric measurements to discrete @@ -615,17 +607,22 @@ def get_2d_alignment_heuristics( alignment_heuristics = scatter3d(xyz, alignments, grid_size, "max") alignment_heuristics = torch.max(alignment_heuristics, dim=-1).values - alignment_heuristics = torch.from_numpy(maximum_filter(alignment_heuristics.numpy(), size = 7)) + alignment_heuristics = torch.from_numpy( + maximum_filter(alignment_heuristics.numpy(), size=7) + ) return alignment_heuristics - def get_2d_map( self, debug: bool = False, return_history_id: bool = False ) -> Tuple[np.ndarray, ...]: """Get 2d map with explored area and frontiers.""" # Is this already cached? 
If so we don't need to go to all this work - if self._map2d is not None and self._history_soft is not None and self._seq == self._2d_last_updated: + if ( + self._map2d is not None + and self._history_soft is not None + and self._seq == self._2d_last_updated + ): return self._map2d if not return_history_id else (*self._map2d, self._history_soft) # Convert metric measurements to discrete @@ -669,7 +666,7 @@ def get_2d_map( # history_ids = history_ids[:, :, min_height:max_height] history_soft = torch.max(history_ids, dim=-1).values - history_soft = torch.from_numpy(maximum_filter(history_soft.float().numpy(), size = 5)) + history_soft = torch.from_numpy(maximum_filter(history_soft.float().numpy(), size=5)) if self._remove_visited_from_obstacles: # Remove "visited" points containing observations of the robot @@ -700,11 +697,9 @@ def get_2d_map( if self.smooth_kernel_size > 0: # Opening and closing operations here on explore explored = binary_erosion( - binary_dilation( - explored.float().unsqueeze(0).unsqueeze(0), self.smooth_kernel - ), + binary_dilation(explored.float().unsqueeze(0).unsqueeze(0), self.smooth_kernel), self.smooth_kernel, - ) #[0, 0].bool() + ) # [0, 0].bool() explored = binary_dilation( binary_erosion(explored, self.smooth_kernel), self.smooth_kernel, @@ -761,16 +756,12 @@ def xy_to_grid_coords(self, xy: torch.Tensor) -> Optional[np.ndarray]: if isinstance(xy, np.ndarray): xy = torch.from_numpy(xy).float() grid_xy = (xy / self.grid_resolution) + self.grid_origin[:2] - if torch.any(grid_xy >= self._grid_size_t) or torch.any( - grid_xy < torch.zeros(2) - ): + if torch.any(grid_xy >= self._grid_size_t) or torch.any(grid_xy < torch.zeros(2)): return None else: return grid_xy.int() - def plan_to_grid_coords( - self, plan_result: PlanResult - ) -> Optional[List[torch.Tensor]]: + def plan_to_grid_coords(self, plan_result: PlanResult) -> Optional[List[torch.Tensor]]: """Convert a plan properly into grid coordinates""" if not plan_result.success: return None @@ -791,16 +782,12 @@ def grid_coords_to_xyt(self, grid_coords: np.ndarray) -> np.ndarray: res[:2] = self.grid_coords_to_xy(grid_coords) return res - def show( - self, backend: str = "open3d", **backend_kwargs - ) -> Tuple[np.ndarray, np.ndarray]: + def show(self, backend: str = "open3d", **backend_kwargs) -> Tuple[np.ndarray, np.ndarray]: """Display the aggregated point cloud.""" if backend == "open3d": return self._show_open3d(**backend_kwargs) else: - raise NotImplementedError( - f"Uknown backend {backend}, must be 'open3d' or 'pytorch3d" - ) + raise NotImplementedError(f"Uknown backend {backend}, must be 'open3d' or 'pytorch3d") def get_xyz_rgb(self) -> Tuple[torch.Tensor, torch.Tensor]: """Return xyz and rgb of the current map""" @@ -819,9 +806,7 @@ def sample_from_mask(self, mask: torch.Tensor) -> Optional[np.ndarray]: def xyt_is_safe(self, xyt: np.ndarray, robot: Optional[RobotModel] = None) -> bool: """Check to see if a given xyt position is known to be safe.""" if robot is not None: - raise NotImplementedError( - "not currently checking against robot base geometry" - ) + raise NotImplementedError("not currently checking against robot base geometry") obstacles, explored = self.get_2d_map() # Convert xy to grid coords grid_xy = self.xy_to_grid_coords(xyt[:2]) @@ -904,9 +889,7 @@ def _get_open3d_geometries( # Create a combined point cloud # pc_xyz, pc_rgb, pc_feats = self.get_data() points, _, _, rgb = self.voxel_pcd.get_pointcloud() - pcd = numpy_to_pcd( - points.detach().cpu().numpy(), (rgb / norm).detach().cpu().numpy() - 
) + pcd = numpy_to_pcd(points.detach().cpu().numpy(), (rgb / norm).detach().cpu().numpy()) if orig is None: orig = np.zeros(3) geoms = create_visualization_geometries(pcd=pcd, orig=orig) @@ -938,9 +921,7 @@ def _get_o3d_robot_footprint_geometry( """Get a 3d mesh cube for the footprint of the robot. But this does not work very well for whatever reason.""" # Define the dimensions of the cube if dimensions is None: - dimensions = np.array( - [0.2, 0.2, 0.2] - ) # Cube dimensions (length, width, height) + dimensions = np.array([0.2, 0.2, 0.2]) # Cube dimensions (length, width, height) x, y, theta = xyt # theta = theta - np.pi/2 @@ -964,9 +945,7 @@ def _get_o3d_robot_footprint_geometry( y_offset = -1 * dimensions[1] / 2 dx = (math.cos(theta) * x_offset) + (math.cos(theta - np.pi / 2) * y_offset) dy = (math.sin(theta + np.pi / 2) * y_offset) + (math.sin(theta) * x_offset) - translation_vector = np.array( - [x + dx, y + dy, 0] - ) # Apply offset based on theta + translation_vector = np.array([x + dx, y + dy, 0]) # Apply offset based on theta transformation_matrix = np.identity(4) transformation_matrix[:3, :3] = rotation_matrix transformation_matrix[:3, 3] = translation_vector @@ -990,9 +969,7 @@ def _show_open3d( """Show and return bounding box information and rgb color information from an explored point cloud. Uses open3d.""" # get geometries so we can use them - geoms = self._get_open3d_geometries( - orig, norm, xyt=xyt, footprint=footprint - ) + geoms = self._get_open3d_geometries(orig, norm, xyt=xyt, footprint=footprint) # Show the geometries of where we have explored open3d.visualization.draw_geometries(geoms) diff --git a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py index ed834a8de..9b0bb8316 100644 --- a/src/stretch/dynav/mapping_utils/voxel_map.py +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + # Copyright (c) Meta Platforms, Inc. and affiliates. 
# # This source code is licensed under the MIT license found in the @@ -23,6 +32,7 @@ get_edges, ) + class SparseVoxelMapNavigationSpace(XYT): """subclass for sampling XYT states from explored space""" @@ -40,7 +50,9 @@ def __init__( dilate_obstacle_size: int = 2, extend_mode: str = "separate", ): - print('------------------------YOU ARE NOW RUNNING PEIQI VOXEL NAVIGATION SPACE CODES-----------------') + print( + "------------------------YOU ARE NOW RUNNING PEIQI VOXEL NAVIGATION SPACE CODES-----------------" + ) self.step_size = step_size self.rotation_step_size = rotation_step_size self.voxel_map = voxel_map @@ -90,9 +102,7 @@ def draw_state_on_grid( img[x0:x1, y0:y1] += mask * weight return img - def create_collision_masks( - self, orientation_resolution: int, show_all: bool = False - ): + def create_collision_masks(self, orientation_resolution: int, show_all: bool = False): """Create a set of orientation masks Args: @@ -134,9 +144,7 @@ def extend(self, q0: np.ndarray, q1: np.ndarray) -> np.ndarray: separate: move then rotate joint: move and rotate all at once.""" assert len(q0) == 3, f"initial configuration must be 3d, was {q0}" - assert ( - len(q1) == 3 or len(q1) == 2 - ), f"final configuration can be 2d or 3d, was {q1}" + assert len(q1) == 3 or len(q1) == 2, f"final configuration can be 2d or 3d, was {q1}" if self.extend_mode == "separate": return self._extend_separate(q0, q1) elif self.extend_mode == "joint": @@ -145,15 +153,11 @@ def extend(self, q0: np.ndarray, q1: np.ndarray) -> np.ndarray: else: raise NotImplementedError(f"not supported: {self.extend_mode=}") - def _extend_separate( - self, q0: np.ndarray, q1: np.ndarray, xy_tol: float = 1e-8 - ) -> np.ndarray: + def _extend_separate(self, q0: np.ndarray, q1: np.ndarray, xy_tol: float = 1e-8) -> np.ndarray: """extend towards another configuration in this space. 
TODO: we can set the classes here, right now assuming still np.ndarray""" assert len(q0) == 3, f"initial configuration must be 3d, was {q0}" - assert ( - len(q1) == 3 or len(q1) == 2 - ), f"final configuration can be 2d or 3d, was {q1}" + assert len(q1) == 3 or len(q1) == 2, f"final configuration can be 2d or 3d, was {q1}" dxy = q1[:2] - q0[:2] step = dxy / np.linalg.norm(dxy + self.tolerance) * self.step_size xy = np.copy(q0[:2]) @@ -175,9 +179,7 @@ def _extend_separate( angle_diff = angle_difference(new_theta, cur_theta) while angle_diff > self.rotation_step_size: # Interpolate - cur_theta = interpolate_angles( - cur_theta, new_theta, self.rotation_step_size - ) + cur_theta = interpolate_angles(cur_theta, new_theta, self.rotation_step_size) # print("interp ang =", cur_theta, "from =", cur_theta, "to =", new_theta) yield np.array([xy[0], xy[1], cur_theta]) angle_diff = angle_difference(new_theta, cur_theta) @@ -221,9 +223,7 @@ def _get_theta_index(self, theta: float) -> int: theta += 2 * np.pi if theta >= 2 * np.pi: theta -= 2 * np.pi - assert ( - theta >= 0 and theta <= 2 * np.pi - ), "only angles between 0 and 2*PI allowed" + assert theta >= 0 and theta <= 2 * np.pi, "only angles between 0 and 2*PI allowed" theta_idx = np.round((theta / (2 * np.pi) * self._orientation_resolution) - 0.5) if theta_idx == self._orientation_resolution: theta_idx = 0 @@ -271,9 +271,7 @@ def is_valid( collision = torch.any(crop_obs & mask) - p_is_safe = ( - torch.sum((crop_exp & mask) | ~mask) / (mask.shape[0] * mask.shape[1]) - ).item() + p_is_safe = (torch.sum((crop_exp & mask) | ~mask) / (mask.shape[0] * mask.shape[1])).item() is_safe = p_is_safe >= is_safe_threshold if verbose: print(f"{collision=}, {is_safe=}, {p_is_safe=}, {is_safe_threshold=}") @@ -322,11 +320,7 @@ def compute_theta(self, cur_x, cur_y, end_x, end_y): return theta def sample_target_point( - self, - start: torch.Tensor, - point: torch.Tensor, - planner, - exploration: bool = False + self, start: torch.Tensor, point: torch.Tensor, planner, exploration: bool = False ) -> Optional[np.array]: """Sample a position near the mask and return. 
@@ -340,14 +334,14 @@ def sample_target_point( start_pt = planner.to_pt(start) reachable_points = planner.get_reachable_points(start_pt) if len(reachable_points) == 0: - print('No target point find, maybe no point is reachable') + print("No target point find, maybe no point is reachable") return None reachable_xs, reachable_ys = zip(*reachable_points) reachable_xs = torch.tensor(reachable_xs) reachable_ys = torch.tensor(reachable_ys) - reachable = torch.empty(obstacles.shape, dtype = torch.bool).fill_(False) + reachable = torch.empty(obstacles.shape, dtype=torch.bool).fill_(False) reachable[reachable_xs, reachable_ys] = True - + obstacles, explored = self.voxel_map.get_2d_map() # if self.dilate_obstacles_kernel is not None: # obstacles = binary_dilation( @@ -359,12 +353,15 @@ def sample_target_point( xs, ys = torch.where(reachable) if len(xs) < 1: - print('No target point find, maybe no point is reachable') + print("No target point find, maybe no point is reachable") return None - selected_targets = torch.stack([xs, ys], dim = -1) \ - [torch.linalg.norm( \ - (torch.stack([xs, ys], dim = -1) - torch.tensor([target_x, target_y])).float(), dim = -1 \ - ).topk(k = len(xs), largest = False).indices] + selected_targets = torch.stack([xs, ys], dim=-1)[ + torch.linalg.norm( + (torch.stack([xs, ys], dim=-1) - torch.tensor([target_x, target_y])).float(), dim=-1 + ) + .topk(k=len(xs), largest=False) + .indices + ] # TODO: was this: # expanded_mask = expanded_mask & less_explored & ~obstacles @@ -404,7 +401,7 @@ def sample_target_point( # target_is_valid = False if not target_is_valid: continue - + return np.array([selected_x, selected_y, theta]) return None @@ -441,17 +438,13 @@ def sample_near_mask( import matplotlib.pyplot as plt plt.imshow( - mask.int() * 20 - + expanded_mask.int() * 10 - + explored.int() - + obstacles.int() * 5 + mask.int() * 20 + expanded_mask.int() * 10 + explored.int() + obstacles.int() * 5 ) # import datetime # current_datetime = datetime.datetime.now() # formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S") # plt.savefig('debug_' + formatted_datetime + '.png') - # Where can the robot go? 
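+        # At this point `expanded_mask` marks the grid cells this sampler treats as
+        # candidate standing points: roughly, explored and obstacle-free cells lying in a
+        # dilated band around the target mask. The code below enumerates those cells,
+        # picks one at random, and, if `look_at_any_point` is set, finds the closest
+        # point on the original mask for the robot to face.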
valid_indices = torch.nonzero(expanded_mask, as_tuple=False) if valid_indices.size(0) == 0: @@ -467,9 +460,7 @@ def sample_near_mask( point_grid_coords = valid_indices[random_index] if look_at_any_point: - outside_point = find_closest_point_on_mask( - mask, point_grid_coords.float() - ) + outside_point = find_closest_point_on_mask(mask, point_grid_coords.float()) # convert back point = self.voxel_map.grid_coords_to_xy(point_grid_coords) @@ -530,10 +521,7 @@ def _get_kernel(self, size: int): return None if size not in self._kernels: kernel = torch.nn.Parameter( - torch.from_numpy(skimage.morphology.disk(size)) - .unsqueeze(0) - .unsqueeze(0) - .float(), + torch.from_numpy(skimage.morphology.disk(size)).unsqueeze(0).unsqueeze(0).float(), requires_grad=False, ) self._kernels[size] = kernel @@ -606,12 +594,12 @@ def sample_exploration( self, xyt, planner, - voxel_map_localizer = None, - text = None, - debug = False, + voxel_map_localizer=None, + text=None, + debug=False, ): - obstacles, explored, history_soft = self.voxel_map.get_2d_map(return_history_id = True) - if len(xyt) == 3: + obstacles, explored, history_soft = self.voxel_map.get_2d_map(return_history_id=True) + if len(xyt) == 3: xyt = xyt[:2] reachable_points = planner.get_reachable_points(planner.to_pt(xyt)) reachable_xs, reachable_ys = zip(*reachable_points) @@ -632,10 +620,14 @@ def sample_exploration( else: expanded_frontier = edges outside_frontier = expanded_frontier & ~reachable_map - time_heuristics = self._time_heuristic(history_soft, outside_frontier, debug = debug) + time_heuristics = self._time_heuristic(history_soft, outside_frontier, debug=debug) if voxel_map_localizer is not None: - alignments_heuristics = self.voxel_map.get_2d_alignment_heuristics(voxel_map_localizer, text) - alignments_heuristics = self._alignment_heuristic(alignments_heuristics, outside_frontier, debug = debug) + alignments_heuristics = self.voxel_map.get_2d_alignment_heuristics( + voxel_map_localizer, text + ) + alignments_heuristics = self._alignment_heuristic( + alignments_heuristics, outside_frontier, debug=debug + ) total_heuristics = time_heuristics + alignments_heuristics else: alignments_heuristics = None @@ -644,47 +636,61 @@ def sample_exploration( rounded_heuristics = np.ceil(total_heuristics * 100) / 100 max_heuristic = rounded_heuristics.max() indices = np.column_stack(np.where(rounded_heuristics == max_heuristic)) - closest_index = np.argmin(np.linalg.norm(indices - np.asarray(planner.to_pt([0, 0, 0])), axis = -1)) + closest_index = np.argmin( + np.linalg.norm(indices - np.asarray(planner.to_pt([0, 0, 0])), axis=-1) + ) index = indices[closest_index] # index = np.unravel_index(np.argmax(total_heuristics), total_heuristics.shape) # debug = True if debug: from matplotlib import pyplot as plt + plt.subplot(221) plt.imshow(obstacles.int() * 5 + outside_frontier.int() * 10) plt.subplot(222) plt.imshow(explored.int() * 5) plt.subplot(223) plt.imshow(total_heuristics) - plt.scatter(index[1], index[0], s = 15, c = 'g') + plt.scatter(index[1], index[0], s=15, c="g") plt.show() return index, time_heuristics, alignments_heuristics, total_heuristics - - def _alignment_heuristic(self, alignments, outside_frontier, alignment_smooth = 50, alignment_threshold = 0.12, debug = False): + + def _alignment_heuristic( + self, + alignments, + outside_frontier, + alignment_smooth=50, + alignment_threshold=0.12, + debug=False, + ): alignments = np.ma.masked_array(alignments, ~outside_frontier) - alignment_heuristics = 1 / (1 + np.exp(-alignment_smooth * 
(alignments - alignment_threshold))) + alignment_heuristics = 1 / ( + 1 + np.exp(-alignment_smooth * (alignments - alignment_threshold)) + ) index = np.unravel_index(np.argmax(alignment_heuristics), alignments.shape) if debug: plt.clf() - plt.title('alignment') + plt.title("alignment") plt.imshow(alignment_heuristics) - plt.scatter(index[1], index[0], s = 15, c = 'g') + plt.scatter(index[1], index[0], s=15, c="g") plt.show() return alignment_heuristics - def _time_heuristic(self, history_soft, outside_frontier, time_smooth = 0.1, time_threshold = 15, debug = False): + def _time_heuristic( + self, history_soft, outside_frontier, time_smooth=0.1, time_threshold=15, debug=False + ): history_soft = np.ma.masked_array(history_soft, ~outside_frontier) time_heuristics = history_soft.max() - history_soft - time_heuristics[history_soft < 1] = float('inf') + time_heuristics[history_soft < 1] = float("inf") time_heuristics = 1 / (1 + np.exp(-time_smooth * (time_heuristics - time_threshold))) index = np.unravel_index(np.argmax(time_heuristics), history_soft.shape) # return index # debug = True if debug: # plt.clf() - plt.title('time') + plt.title("time") plt.imshow(time_heuristics) - plt.scatter(index[1], index[0], s = 15, c = 'r') + plt.scatter(index[1], index[0], s=15, c="r") plt.show() return time_heuristics @@ -706,9 +712,7 @@ def sample_closest_frontier( debug(bool): show visualizations of frontiers step_dist(float): how far apart in geo dist these points should be """ - assert ( - len(xyt) == 2 or len(xyt) == 3 - ), f"xyt must be of size 2 or 3 instead of {len(xyt)}" + assert len(xyt) == 2 or len(xyt) == 3, f"xyt must be of size 2 or 3 instead of {len(xyt)}" frontier, outside_frontier, traversible = self.get_frontier( expand_size=expand_size, debug=debug @@ -755,9 +759,7 @@ def sample_closest_frontier( prev_dist = dist point_grid_coords = torch.FloatTensor([[x, y]]) - outside_point = find_closest_point_on_mask( - outside_frontier, point_grid_coords - ) + outside_point = find_closest_point_on_mask(outside_frontier, point_grid_coords) if outside_point is None: print( diff --git a/src/stretch/dynav/mapping_utils/voxelized_pcd.py b/src/stretch/dynav/mapping_utils/voxelized_pcd.py index 56ca55e4b..e3cb35a67 100644 --- a/src/stretch/dynav/mapping_utils/voxelized_pcd.py +++ b/src/stretch/dynav/mapping_utils/voxelized_pcd.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + """ This file implements VoxelizedPointcloud module in home-robot project (https://github.com/facebookresearch/home-robot). 
Adapted to be used in ok-robot's navigation voxel map: @@ -35,6 +44,7 @@ import numpy as np import torch from torch import Tensor + USE_TORCH_GEOMETRIC = True if USE_TORCH_GEOMETRIC: from torch_geometric.nn.pool.consecutive import consecutive_cluster @@ -55,7 +65,9 @@ def project_points(points_3d, K, pose): pose = torch.Tensor(pose) pose = pose.to(points_3d) # Convert points to homogeneous coordinates - points_3d_homogeneous = torch.hstack((points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d))) + points_3d_homogeneous = torch.hstack( + (points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d)) + ) # Transform points into camera coordinate system points_camera_homogeneous = torch.matmul(torch.linalg.inv(pose), points_3d_homogeneous.T).T @@ -73,7 +85,9 @@ def get_depth_values(points_3d, pose): if not isinstance(pose, torch.Tensor): pose = torch.Tensor(pose) pose = pose.to(points_3d) - points_3d_homogeneous = torch.hstack((points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d))) + points_3d_homogeneous = torch.hstack( + (points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d)) + ) # Transform points into camera coordinate system points_camera_homogeneous = torch.matmul(torch.linalg.inv(pose), points_3d_homogeneous.T).T @@ -136,7 +150,7 @@ def reset(self): self._maxs = self.dim_maxs self.obs_count = 1 - def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_samples_clear = None): + def clear_points(self, depth, intrinsics, pose, depth_is_valid=None, min_samples_clear=None): if self._points is not None: xys = project_points(self._points.detach().cpu(), intrinsics, pose).int() xys = xys[:, [1, 0]] @@ -146,37 +160,48 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_sampl # Some points are projected to (i, j) on image plane and i, j might be smaller than 0 or greater than image size # which will lead to Index Error. 
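+            # `valid_xys` below is only a clamped copy used for safe indexing: rows that
+            # fall outside the image are zeroed so the depth lookup never raises an
+            # IndexError, while the explicit bounds checks in `indices` still mark those
+            # points as "keep". A stored point is cleared only when it projects inside the
+            # frame at a usable range (roughly 0.01-2.0 m) and the new depth image sees at
+            # least that far at that pixel, i.e. the fresh observation supersedes the point.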
valid_xys = xys.clone() - valid_xys[torch.any(torch.stack([ - xys[:, 0] < 0, - xys[:, 0] >= H, - xys[:, 1] < 0, - xys[:, 1] >= W, - ], dim = 0), dim = 0)] = 0 + valid_xys[ + torch.any( + torch.stack( + [ + xys[:, 0] < 0, + xys[:, 0] >= H, + xys[:, 1] < 0, + xys[:, 1] >= W, + ], + dim=0, + ), + dim=0, + ) + ] = 0 indices = torch.any( torch.stack( [ # the points are projected outside image frame - xys[:, 0] < 0, xys[:, 0] >= H, - xys[:, 1] < 0, xys[:, 1] >= W, + xys[:, 0] < 0, + xys[:, 0] >= H, + xys[:, 1] < 0, + xys[:, 1] >= W, # the points are projected to the image frame but is blocked by some obstacles - depth[valid_xys[:, 0], valid_xys[:, 1]] < (proj_depth - 0.1), + depth[valid_xys[:, 0], valid_xys[:, 1]] < (proj_depth - 0.1), # the points are projected to the image frame but they are behind camera depth[valid_xys[:, 0], valid_xys[:, 1]] < 0.01, proj_depth < 0.01, # depth is too large # (~depth_is_valid)[valid_xys[:, 0], valid_xys[:, 1]], - proj_depth > 2.0 + proj_depth > 2.0, ], - dim = 0 + dim=0, ), - dim = 0) + dim=0, + ) # if self._entity_ids is not None: # removed_entities, removed_counts = torch.unique(self._entity_ids.detach().cpu()[~indices], return_counts = True) # total_counts = torch.bincount(self._entity_ids.detach().cpu()) # entities_to_be_removed = removed_entities[(removed_counts > total_counts[removed_entities] * 0.6) | (total_counts[removed_entities] - removed_counts < 5)] # indices = indices & ~torch.isin(self._entity_ids.detach().cpu(), entities_to_be_removed) - + # print('Clearing non valid points...') # print('Removing ' + str((~indices).sum().item()) + ' points.') indices = indices.to(self._points.device) @@ -184,7 +209,7 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_sampl if self._features is not None: self._features = self._features[indices] if self._weights is not None: - self._weights= self._weights[indices] + self._weights = self._weights[indices] if self._rgb is not None: self._rgb = self._rgb[indices] if self._obs_counts is not None: @@ -194,7 +219,13 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_sampl if self._entity_ids is not None and min_samples_clear is not None: dbscan = DBSCAN(eps=self.voxel_size * 4, min_samples=min_samples_clear) - cluster_vertices = torch.cat((self._points.detach().cpu(), self._entity_ids.detach().cpu().reshape(-1,1) * 1000), -1).numpy() + cluster_vertices = torch.cat( + ( + self._points.detach().cpu(), + self._entity_ids.detach().cpu().reshape(-1, 1) * 1000, + ), + -1, + ).numpy() clusters = dbscan.fit(cluster_vertices) labels = clusters.labels_ indices = labels != -1 @@ -202,7 +233,7 @@ def clear_points(self, depth, intrinsics, pose, depth_is_valid = None, min_sampl if self._features is not None: self._features = self._features[indices] if self._weights is not None: - self._weights= self._weights[indices] + self._weights = self._weights[indices] if self._rgb is not None: self._rgb = self._rgb[indices] if self._obs_counts is not None: @@ -217,7 +248,7 @@ def add( rgb: Optional[Tensor], weights: Optional[Tensor] = None, obs_count: Optional[int] = None, - entity_id: Optional[int] = None + entity_id: Optional[int] = None, ): """Add a feature pointcloud to the voxel grid. 
@@ -239,7 +270,7 @@ def add( else: obs_count = torch.ones_like(weights) * obs_count if entity_id is None: - entity_id = torch.ones_like(weights) * self.obs_count + entity_id = torch.ones_like(weights) * self.obs_count else: obs_count = torch.ones_like(weights) * entity_id self.obs_count += 1 @@ -263,21 +294,15 @@ def add( self._mins, self._maxs = pos_mins, pos_maxs # recompute_voxels = True else: - assert ( - self._maxs is not None - ), "How did self._mins get set without self._maxs?" + assert self._maxs is not None, "How did self._mins get set without self._maxs?" # recompute_voxels = torch.any(pos_mins < self._mins) or torch.any(self._maxs < pos_maxs) self._mins = torch.min(self._mins, pos_mins) self._maxs = torch.max(self._maxs, pos_maxs) if self._points is None: - assert ( - self._features is None - ), "How did self._points get unset while _features is set?" + assert self._features is None, "How did self._points get unset while _features is set?" # assert self._rgbs is None, "How did self._points get unset while _rgbs is set?" - assert ( - self._weights is None - ), "How did self._points get unset while _weights is set?" + assert self._weights is None, "How did self._points get unset while _weights is set?" all_points, all_features, all_weights, all_rgb = ( points, features, @@ -291,9 +316,7 @@ def add( all_points = torch.cat([self._points, points], dim=0) all_weights = torch.cat([self._weights, weights], dim=0) all_features = ( - torch.cat([self._features, features], dim=0) - if (features is not None) - else None + torch.cat([self._features, features], dim=0) if (features is not None) else None ) all_rgb = torch.cat([self._rgb, rgb], dim=0) if (rgb is not None) else None all_obs_counts = torch.cat([self._obs_counts, obs_count], dim=0) @@ -306,14 +329,21 @@ def add( cluster_voxel_idx, cluster_consecutive_idx, _ = voxelize( all_points, voxel_size=self.voxel_size, start=self._mins, end=self._maxs ) - self._points, self._features, self._weights, self._rgb, self._obs_counts, self._entity_ids = reduce_pointcloud( + ( + self._points, + self._features, + self._weights, + self._rgb, + self._obs_counts, + self._entity_ids, + ) = reduce_pointcloud( cluster_consecutive_idx, pos=all_points, features=all_features, weights=all_weights, rgbs=all_rgb, - obs_counts = all_obs_counts, - entity_ids = all_entity_ids, + obs_counts=all_obs_counts, + entity_ids=all_entity_ids, feature_reduce=self.feature_pool_method, ) self._obs_counts, self._entity_ids = self._obs_counts.int(), self._entity_ids.int() @@ -449,9 +479,7 @@ def voxelize( cluster_consecutive_idx (LongTensor): Packed idx -- contiguous in cluster ID. E.g. 
[0, 0, 2, 1, 1, 2] batch_sample: See https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/pool/max_pool.html """ - voxel_cluster = voxel_grid( - pos=pos, batch=batch, size=voxel_size, start=start, end=end - ) + voxel_cluster = voxel_grid(pos=pos, batch=batch, size=voxel_size, start=start, end=end) cluster_consecutive_idx, perm = consecutive_cluster(voxel_cluster) batch_sample = batch[perm] if batch is not None else None cluster_idx = voxel_cluster @@ -478,9 +506,7 @@ def scatter_weighted_mean( Tensor: Agggregated features, weighted by weights and normalized by weights_cluster """ assert dim == 0, "Dim != 0 not yet implemented" - feature_cluster = scatter( - features * weights[:, None], cluster, dim=dim, reduce="sum" - ) + feature_cluster = scatter(features * weights[:, None], cluster, dim=dim, reduce="sum") feature_cluster = feature_cluster / weights_cluster[:, None] return feature_cluster @@ -518,29 +544,32 @@ def reduce_pointcloud( weights = torch.ones_like(pos[..., 0]) weights_cluster = scatter(weights, voxel_cluster, dim=0, reduce="sum") - pos_cluster = scatter_weighted_mean( - pos, weights, voxel_cluster, weights_cluster, dim=0 - ) + pos_cluster = scatter_weighted_mean(pos, weights, voxel_cluster, weights_cluster, dim=0) if rgbs is not None: - rgb_cluster = scatter_weighted_mean( - rgbs, weights, voxel_cluster, weights_cluster, dim=0 - ) + rgb_cluster = scatter_weighted_mean(rgbs, weights, voxel_cluster, weights_cluster, dim=0) else: rgb_cluster = None - + if obs_counts is not None: - obs_count_cluster = scatter(obs_counts, voxel_cluster, dim = 0, reduce = "max") + obs_count_cluster = scatter(obs_counts, voxel_cluster, dim=0, reduce="max") else: obs_count_cluster = None if entity_ids is not None: - entity_ids_cluster = scatter(entity_ids, voxel_cluster, dim = 0, reduce = "max") + entity_ids_cluster = scatter(entity_ids, voxel_cluster, dim=0, reduce="max") else: entity_ids_cluster = None if features is None: - return pos_cluster, None, weights_cluster, rgb_cluster, obs_count_cluster, entity_ids_cluster + return ( + pos_cluster, + None, + weights_cluster, + rgb_cluster, + obs_count_cluster, + entity_ids_cluster, + ) if feature_reduce == "mean": feature_cluster = scatter_weighted_mean( @@ -549,17 +578,25 @@ def reduce_pointcloud( elif feature_reduce == "max": feature_cluster = scatter(features, voxel_cluster, dim=0, reduce="max") elif feature_reduce == "sum": - feature_cluster = scatter( - features * weights[:, None], voxel_cluster, dim=0, reduce="sum" - ) + feature_cluster = scatter(features * weights[:, None], voxel_cluster, dim=0, reduce="sum") else: raise NotImplementedError(f"Unknown feature reduction method {feature_reduce}") - return pos_cluster, feature_cluster, weights_cluster, rgb_cluster, obs_count_cluster, entity_ids_cluster + return ( + pos_cluster, + feature_cluster, + weights_cluster, + rgb_cluster, + obs_count_cluster, + entity_ids_cluster, + ) def scatter3d( - voxel_indices: Tensor, weights: Tensor, grid_dimensions: List[int], method: str = "mean", + voxel_indices: Tensor, + weights: Tensor, + grid_dimensions: List[int], + method: str = "mean", ) -> Tensor: """Scatter weights into a 3d voxel grid of the appropriate size. 
@@ -622,4 +659,4 @@ def drop_smallest_weight_points( if len(points_sorted[cutoff_idx:]) < min_points_after_drop: return orig_points # print(f"Reduced {len(orig_points)} -> {len(points)} -> {above_cutoff.sum()}") - return points_sorted[cutoff_idx:] \ No newline at end of file + return points_sorted[cutoff_idx:] diff --git a/src/stretch/dynav/ok_robot_hw/README.md b/src/stretch/dynav/ok_robot_hw/README.md index 3c773b67f..5e79b1f41 100644 --- a/src/stretch/dynav/ok_robot_hw/README.md +++ b/src/stretch/dynav/ok_robot_hw/README.md @@ -1,7 +1,9 @@ # Robot Side Code + Most of the heavy code will be runnning in the workstation and will communicate with the robot through sockets ## Preparation to run robot side codes + Our hardware codes heavily rely on robot controller provided by [home-robot](https://github.com/facebookresearch/home-robot). You need to install the home-robot on stretch robot following intructions provided by [home-robot-hw](https://github.com/facebookresearch/home-robot/blob/main/docs/install_robot.md) before running any robot side codes on robot. @@ -13,26 +15,29 @@ Once you finised installing home-robot, follow [these steps](../docs/robot-insta The success of OK-Robot system also depends on robot calibration and accurate urdf file, so make sure you follow [these calibration steps](../docs/robot-calibration.md) to obtain an accurate urdf file for your robot. ## Start home-robot + ``` stretch_robot_home.py roslaunch home_robot_hw startup_stretch_hector_slam.launch ``` ## Start Robot Control + ``` python run.py -x1 [x1] -y1 [y1] -x2 [x2] -y2 [y2] -ip [your workstation ip] ``` -* **[x1, y1]** - Co-ordinated of tape on which the base of the robot is on -* **[x2, y2]** - Co-ordinates of the secondary tape. -* **ip** - Your workstation ip, the robot will try to communicate with this ip -* **np** - Navigation port number, the robot will listen to this port number to get planned navigation paths (default: 5555) -* **mp** - Manipulation port number, the robot will listen to this port number to get estimated manipulation pose (default: 5556) +- **\[x1, y1\]** - Co-ordinated of tape on which the base of the robot is on +- **\[x2, y2\]** - Co-ordinates of the secondary tape. +- **ip** - Your workstation ip, the robot will try to communicate with this ip +- **np** - Navigation port number, the robot will listen to this port number to get planned navigation paths (default: 5555) +- **mp** - Manipulation port number, the robot will listen to this port number to get estimated manipulation pose (default: 5556) + +After running run.py it will go through 4 states in each cycle. -After running run.py it will go through 4 states in each cycle. -* Picking Navigation -* Manipulation -* Placing Navigation -* Placing +- Picking Navigation +- Manipulation +- Placing Navigation +- Placing -For each navigation stage it asks A [Object Name], B [Near by Object Name] \ No newline at end of file +For each navigation stage it asks A \[Object Name\], B \[Near by Object Name\] diff --git a/src/stretch/dynav/ok_robot_hw/camera.py b/src/stretch/dynav/ok_robot_hw/camera.py index 09def2415..947cf9a57 100644 --- a/src/stretch/dynav/ok_robot_hw/camera.py +++ b/src/stretch/dynav/ok_robot_hw/camera.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. 
Original +# license information maybe found below, if so. + ## License: Apache 2.0. See LICENSE file in root directory. ## Copyright(c) 2017 Intel Corporation. All Rights Reserved. @@ -5,10 +14,11 @@ ## Align Depth to Color ## ##################################################### -import numpy as np import cv2 -import open3d as o3d import matplotlib.pyplot as plt +import numpy as np +import open3d as o3d + class RealSenseCamera: def __init__(self, robot): @@ -16,7 +26,7 @@ def __init__(self, robot): self.depth_scale = 0.001 # Camera intrinsics - intrinsics = self.robot.get_camera_K() + intrinsics = self.robot.get_camera_K() self.fy = intrinsics[0, 0] self.fx = intrinsics[1, 1] self.cy = intrinsics[0, 2] @@ -28,34 +38,37 @@ def __init__(self, robot): def capture_image(self, visualize=False): self.rgb_image, self.depth_image, self.points = self.robot.get_images(compute_xyz=True) - self.rgb_image = np.rot90(self.rgb_image, k = 1)[:, :, [2, 1, 0]] - self.depth_image = np.rot90(self.depth_image, k = 1) - self.points = np.rot90(self.points, k = 1) + self.rgb_image = np.rot90(self.rgb_image, k=1)[:, :, [2, 1, 0]] + self.depth_image = np.rot90(self.depth_image, k=1) + self.points = np.rot90(self.points, k=1) cv2.imwrite("./images/input.jpg", np.rot90(self.rgb_image, k=-1)) # cv2.imwrite("depth.jpg", np.rot90(self.depth_image, k=-1)/np.max(self.depth_image)) self.rgb_image = cv2.cvtColor(self.rgb_image, cv2.COLOR_BGR2RGB) - + if visualize: - fig, ax = plt.subplots(1, 2, figsize=(10,5)) - timer = fig.canvas.new_timer(interval = 5000) #creating a timer object and setting an interval of 3000 milliseconds - timer.add_callback(lambda : plt.close()) + fig, ax = plt.subplots(1, 2, figsize=(10, 5)) + timer = fig.canvas.new_timer( + interval=5000 + ) # creating a timer object and setting an interval of 3000 milliseconds + timer.add_callback(lambda: plt.close()) ax[0].imshow(np.rot90(self.rgb_image, k=-1)) ax[0].set_title("Color Image") ax[1].imshow(np.rot90(self.depth_image, k=-1)) ax[1].set_title("Depth Image") - + plt.savefig("./images/rgb_dpt.png") plt.pause(3) plt.close() - + return self.rgb_image, self.depth_image, self.points def pixel2d_to_point3d(self, ix, iy): return self.points[iy, ix][[1, 0, 2]] + if __name__ == "__main__": camera = RealSenseCamera() camera.capture_image() diff --git a/src/stretch/dynav/ok_robot_hw/global_parameters.py b/src/stretch/dynav/ok_robot_hw/global_parameters.py index 98faca81e..574e04959 100644 --- a/src/stretch/dynav/ok_robot_hw/global_parameters.py +++ b/src/stretch/dynav/ok_robot_hw/global_parameters.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
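+# Shared constants for the OK-Robot hardware scripts: base position/yaw tolerances,
+# the gripper fingertip link names used as end-effector reference frames, and the
+# default (INIT_*) lift/arm/wrist/head values the manipulation code starts from.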
+ POS_TOL = 0.1 YAW_TOL = 0.2 @@ -5,8 +14,8 @@ GRIPPER_FINGERTIP_LEFT_NODE = "link_gripper_fingertip_left" GRIPPER_FINGERTIP_RIGHT_NODE = "link_gripper_fingertip_right" -INIT_LIFT_POS = 0.45 -INIT_WRIST_PITCH = -1.57 +INIT_LIFT_POS = 0.45 +INIT_WRIST_PITCH = -1.57 INIT_ARM_POS = 0 INIT_WRIST_ROLL = 0 INIT_WRIST_YAW = 0 diff --git a/src/stretch/dynav/ok_robot_hw/image_publisher.py b/src/stretch/dynav/ok_robot_hw/image_publisher.py index ab9064a30..6afc54a47 100644 --- a/src/stretch/dynav/ok_robot_hw/image_publisher.py +++ b/src/stretch/dynav/ok_robot_hw/image_publisher.py @@ -1,11 +1,27 @@ -import zmq +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + import numpy as np +import zmq from PIL import Image as PILImage -from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array, send_depth_img, recv_depth_img, send_rgb_img, recv_rgb_img +from stretch.dynav.ok_robot_hw.utils.communication_utils import ( + recv_array, + recv_depth_img, + recv_rgb_img, + send_array, + send_depth_img, + send_rgb_img, +) -class ImagePublisher(): +class ImagePublisher: def __init__(self, camera, socket): self.camera = camera self.socket = socket @@ -24,12 +40,23 @@ def publish_image(self, text, mode, head_tilt=-1): send_rgb_img(self.socket, rotated_image) print(self.socket.recv_string()) send_depth_img(self.socket, rotated_depth) - print(self.socket.recv_string()) + print(self.socket.recv_string()) # send_array(self.socket, rotated_image) # print(self.socket.recv_string()) # send_array(self.socket, rotated_depth) - # print(self.socket.recv_string()) - send_array(self.socket, np.array([self.camera.fy, self.camera.fx, self.camera.cy, self.camera.cx, int(head_tilt*100)])) + # print(self.socket.recv_string()) + send_array( + self.socket, + np.array( + [ + self.camera.fy, + self.camera.fx, + self.camera.cy, + self.camera.cx, + int(head_tilt * 100), + ] + ), + ) print(self.socket.recv_string()) ## Sending Object text and Manipulation mode @@ -55,5 +82,5 @@ def publish_image(self, text, mode, head_tilt=-1): print(translation) print("rotation: ") print(rotation) - print(self.socket.recv_string()) + print(self.socket.recv_string()) return translation, rotation, depth, width, retry diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index d3aa1e1a8..87e5edc3b 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -1,56 +1,75 @@ -import numpy as np -import sys +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
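+"""Hardware-side robot wrapper used by the OK-Robot manipulation code.
+
+`HelloRobot` wraps an already-connected Stretch client: it mirrors joint state into a
+pinocchio-friendly dictionary, exposes position commands for the base, lift, arm, wrist,
+head and gripper, and turns desired end-effector poses into full-body configurations via
+the client's IK solver.
+
+Rough usage sketch (the `robot` argument is assumed to be a constructed zmq robot
+client; the numbers are illustrative only):
+
+    hello_robot = HelloRobot(robot)
+    hello_robot.move_to_position(lift_pos=0.45, wrist_pitch=-1.57)
+    # Move the gripper 10 cm along its local z axis, keeping its orientation.
+    hello_robot.move_to_pose([0.0, 0.0, 0.1], [0.0, 0.0, 0.0], [1])
+"""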
+ +import math import os +import random +import sys +import time + +import numpy as np +import pinocchio as pin # from urdf_parser_py.urdf import URDF from scipy.spatial.transform import Rotation as R -import math -import time -import random -import os -from stretch.dynav.ok_robot_hw.utils import transform_joint_array from stretch.dynav.ok_robot_hw.global_parameters import * - -import pinocchio as pin +from stretch.dynav.ok_robot_hw.utils import transform_joint_array from stretch.motion.kinematics import HelloStretchIdx OVERRIDE_STATES = {} + class HelloRobot: def __init__( self, robot, - stretch_client_urdf_file = 'src/stretch/config/urdf', - gripper_threshold = 7.0, - stretch_gripper_max = 0.64, - stretch_gripper_min = 0, - end_link = 'link_gripper_s3_body' + stretch_client_urdf_file="src/stretch/config/urdf", + gripper_threshold=7.0, + stretch_gripper_max=0.64, + stretch_gripper_min=0, + end_link="link_gripper_s3_body", ): self.STRETCH_GRIPPER_MAX = stretch_gripper_max self.STRETCH_GRIPPER_MIN = stretch_gripper_min - self.urdf_path = os.path.join(stretch_client_urdf_file, 'stretch.urdf') - self.joints_pin = {'joint_fake':0} - + self.urdf_path = os.path.join(stretch_client_urdf_file, "stretch.urdf") + self.joints_pin = {"joint_fake": 0} + self.GRIPPER_THRESHOLD = gripper_threshold print("hello robot starting") self.head_joint_list = ["joint_fake", "joint_head_pan", "joint_head_tilt"] - self.init_joint_list = ["joint_fake","joint_lift","3","2","1" ,"0","joint_wrist_yaw","joint_wrist_pitch","joint_wrist_roll", "joint_gripper_finger_left"] - - # end_link is the frame of reference node - self.end_link = end_link + self.init_joint_list = [ + "joint_fake", + "joint_lift", + "3", + "2", + "1", + "0", + "joint_wrist_yaw", + "joint_wrist_pitch", + "joint_wrist_roll", + "joint_gripper_finger_left", + ] + + # end_link is the frame of reference node + self.end_link = end_link self.set_end_link(end_link) - + # Initialize StretchClient controller self.robot = robot self.robot.switch_to_manipulation_mode() # time.sleep(2) # Constraining the robots movement - self.clamp = lambda n, minn, maxn: max(min(maxn, n), minn) + self.clamp = lambda n, minn, maxn: max(min(maxn, n), minn) self.pan, self.tilt = self.robot.get_pan_tilt() - def set_end_link(self, link): if link == GRIPPER_FINGERTIP_LEFT_NODE or GRIPPER_FINGERTIP_RIGHT_NODE: @@ -58,44 +77,48 @@ def set_end_link(self, link): else: self.joint_list = self.init_joint_list[:-1] - def get_joints(self): """ - Returns all the joint names and values involved in forward kinematics of head and gripper + Returns all the joint names and values involved in forward kinematics of head and gripper """ ## Names of all 13 joints - joint_names = self.init_joint_list + ["joint_gripper_finger_right"] + self.head_joint_list[1:] + joint_names = ( + self.init_joint_list + ["joint_gripper_finger_right"] + self.head_joint_list[1:] + ) self.updateJoints() joint_values = list(self.joints.values()) + [0] + list(self.head_joints.values())[1:] return joint_names, joint_values def move_to_position( - self, - lift_pos = None, - arm_pos = None, - base_trans = 0.0, - wrist_yaw = None, - wrist_pitch = None, - wrist_roll = None, - gripper_pos = None, - base_theta = None, - head_tilt = None, - head_pan = None, - blocking = True, + self, + lift_pos=None, + arm_pos=None, + base_trans=0.0, + wrist_yaw=None, + wrist_pitch=None, + wrist_roll=None, + gripper_pos=None, + base_theta=None, + head_tilt=None, + head_pan=None, + blocking=True, ): """ - Moves the robots, base, arm, lift, wrist and head 
to a desired position. + Moves the robots, base, arm, lift, wrist and head to a desired position. """ if base_theta is not None: - self.robot.navigate_to([0, 0, base_theta], relative = True, blocking = True) + self.robot.navigate_to([0, 0, base_theta], relative=True, blocking=True) return - + # Base, arm and lift state update target_state = self.robot.get_six_joints() if not gripper_pos is None: - self.CURRENT_STATE = gripper_pos*(self.STRETCH_GRIPPER_MAX-self.STRETCH_GRIPPER_MIN)+self.STRETCH_GRIPPER_MIN - self.robot.gripper_to(self.CURRENT_STATE, blocking = blocking) + self.CURRENT_STATE = ( + gripper_pos * (self.STRETCH_GRIPPER_MAX - self.STRETCH_GRIPPER_MIN) + + self.STRETCH_GRIPPER_MIN + ) + self.robot.gripper_to(self.CURRENT_STATE, blocking=blocking) if not arm_pos is None: target_state[2] = arm_pos if not lift_pos is None: @@ -110,12 +133,12 @@ def move_to_position( if not wrist_pitch is None: target_state[4] = min(wrist_pitch, 0.1) if not wrist_roll is None: - target_state[5] = wrist_roll - + target_state[5] = wrist_roll + # Actual Movement # print('Target Position', target_state) # print('pan tilt before', self.robot.get_pan_tilt()) - self.robot.arm_to(target_state, blocking = blocking, head = np.array([self.pan, self.tilt])) + self.robot.arm_to(target_state, blocking=blocking, head=np.array([self.pan, self.tilt])) # print('pan tilt after', self.robot.get_pan_tilt()) # print('Actual location', self.robot.get_six_joints()) @@ -129,77 +152,77 @@ def move_to_position( if not head_pan is None: target_head_pan = head_pan self.pan = head_pan - self.robot.head_to(head_tilt = target_head_tilt, head_pan = target_head_pan, blocking = blocking) + self.robot.head_to(head_tilt=target_head_tilt, head_pan=target_head_pan, blocking=blocking) # self.pan, self.tilt = self.robot.get_pan_tilt() - #time.sleep(0.7) + # time.sleep(0.7) def pickup(self, width): """ - Code for grasping the object - Gripper closes gradually until it encounters resistence + Code for grasping the object + Gripper closes gradually until it encounters resistence """ next_gripper_pos = width while True: - self.robot.gripper_to(max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.2), blocking = True) + self.robot.gripper_to( + max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.2), blocking=True + ) curr_gripper_pose = self.robot.get_gripper_position() # print('Robot means to move gripper to', next_gripper_pos * self.STRETCH_GRIPPER_MAX) # print('Robot actually moves gripper to', curr_gripper_pose) if next_gripper_pos == -1: break - + if next_gripper_pos > 0: next_gripper_pos -= 0.35 - else: + else: next_gripper_pos = -1 def updateJoints(self): """ - update all the current poisitions of joints + update all the current poisitions of joints """ state = self.robot.get_six_joints() origin_dist = state[0] # Head Joints pan, tilt = self.robot.get_pan_tilt() - - self.joints_pin['joint_fake'] = origin_dist - self.joints_pin['joint_lift'] = state[1] + + self.joints_pin["joint_fake"] = origin_dist + self.joints_pin["joint_lift"] = state[1] armPos = state[2] - self.joints_pin['joint_arm_l3'] = armPos / 4.0 - self.joints_pin['joint_arm_l2'] = armPos / 4.0 - self.joints_pin['joint_arm_l1'] = armPos / 4.0 - self.joints_pin['joint_arm_l0'] = armPos / 4.0 - self.joints_pin['joint_wrist_yaw'] = state[3] - self.joints_pin['joint_wrist_roll'] = state[5] - self.joints_pin['joint_wrist_pitch'] = OVERRIDE_STATES.get('wrist_pitch', state[4]) - self.joints_pin['joint_gripper_finger_left'] = 0 - - self.joints_pin['joint_head_pan'] = pan - 
self.joints_pin['joint_head_tilt'] = tilt - - # following function is used to move the robot to a desired joint configuration + self.joints_pin["joint_arm_l3"] = armPos / 4.0 + self.joints_pin["joint_arm_l2"] = armPos / 4.0 + self.joints_pin["joint_arm_l1"] = armPos / 4.0 + self.joints_pin["joint_arm_l0"] = armPos / 4.0 + self.joints_pin["joint_wrist_yaw"] = state[3] + self.joints_pin["joint_wrist_roll"] = state[5] + self.joints_pin["joint_wrist_pitch"] = OVERRIDE_STATES.get("wrist_pitch", state[4]) + self.joints_pin["joint_gripper_finger_left"] = 0 + + self.joints_pin["joint_head_pan"] = pan + self.joints_pin["joint_head_tilt"] = tilt + + # following function is used to move the robot to a desired joint configuration def move_to_joints(self, joints, gripper, mode=0): """ - Given the desrired joints movement this fucntion will the joints accordingly + Given the desrired joints movement this fucntion will the joints accordingly """ state = self.robot.get_six_joints() # clamp rotational joints between -1.57 to 1.57 - joints['joint_wrist_pitch'] = (joints['joint_wrist_pitch'] + 1.57) % 3.14 - 1.57 - joints['joint_wrist_yaw'] = (joints['joint_wrist_yaw'] + 1.57) % 3.14 - 1.57 - joints['joint_wrist_roll'] = (joints['joint_wrist_roll'] + 1.57) % 3.14 - 1.57 - joints['joint_wrist_pitch'] = self.clamp(joints['joint_wrist_pitch'], -1.57, 0.1) + joints["joint_wrist_pitch"] = (joints["joint_wrist_pitch"] + 1.57) % 3.14 - 1.57 + joints["joint_wrist_yaw"] = (joints["joint_wrist_yaw"] + 1.57) % 3.14 - 1.57 + joints["joint_wrist_roll"] = (joints["joint_wrist_roll"] + 1.57) % 3.14 - 1.57 + joints["joint_wrist_pitch"] = self.clamp(joints["joint_wrist_pitch"], -1.57, 0.1) target_state = [ - joints['joint_fake'], - joints['joint_lift'], - joints['3'] + - joints['2'] + - joints['1'] + - joints['0'], - joints['joint_wrist_yaw'], - joints['joint_wrist_pitch'], - joints['joint_wrist_roll']] - + joints["joint_fake"], + joints["joint_lift"], + joints["3"] + joints["2"] + joints["1"] + joints["0"], + joints["joint_wrist_yaw"], + joints["joint_wrist_pitch"], + joints["joint_wrist_roll"], + ] + # print('pan tilt before', self.robot.get_pan_tilt()) # Moving only the lift first @@ -207,52 +230,54 @@ def move_to_joints(self, joints, gripper, mode=0): target1 = state target1[0] = target_state[0] target1[1] = min(1.1, target_state[1] + 0.2) - self.robot.arm_to(target1, blocking = True, head = np.array([self.pan, self.tilt])) + self.robot.arm_to(target1, blocking=True, head=np.array([self.pan, self.tilt])) # print('pan tilt after', self.robot.get_pan_tilt()) # print('pan tilt before', self.robot.get_pan_tilt()) - self.robot.arm_to(target_state, blocking = True, head = np.array([self.pan, self.tilt])) - self.robot.head_to(head_tilt = self.tilt, head_pan = self.pan, blocking = True) + self.robot.arm_to(target_state, blocking=True, head=np.array([self.pan, self.tilt])) + self.robot.head_to(head_tilt=self.tilt, head_pan=self.pan, blocking=True) # print('pan tilt after', self.robot.get_pan_tilt()) # print(f"current state {self.robot.get_six_joints()}") # print(f"target state {target_state}") # time.sleep(1) - #NOTE: below code is to fix the pitch drift issue in current hello-robot. Remove it if there is no pitch drift issue - OVERRIDE_STATES['wrist_pitch'] = joints['joint_wrist_pitch'] - + # NOTE: below code is to fix the pitch drift issue in current hello-robot. 
Remove it if there is no pitch drift issue + OVERRIDE_STATES["wrist_pitch"] = joints["joint_wrist_pitch"] + def get_joint_transform(self, node1, node2): - ''' - This function takes two nodes from a robot URDF file as input and - outputs the coordinate frame of node2 relative to the coordinate frame of node1. + """ + This function takes two nodes from a robot URDF file as input and + outputs the coordinate frame of node2 relative to the coordinate frame of node1. - Mainly used for transforming co-ordinates from camera frame to gripper frame. - ''' + Mainly used for transforming co-ordinates from camera frame to gripper frame. + """ # return frame_transform, frame2, frame1 self.updateJoints() frame_pin = self.robot.get_frame_pose(self.joints_pin, node1, node2) return frame_pin - + def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode=0): """ - Function to move the gripper to a desired translation and rotation + Function to move the gripper to a desired translation and rotation """ translation = [translation_tensor[0], translation_tensor[1], translation_tensor[2]] rotation = rotational_tensor - + self.updateJoints() q = self.robot.get_joint_positions() - q[HelloStretchIdx.WRIST_PITCH] = OVERRIDE_STATES.get('wrist_pitch', q[HelloStretchIdx.WRIST_PITCH]) - pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link, q = q) + q[HelloStretchIdx.WRIST_PITCH] = OVERRIDE_STATES.get( + "wrist_pitch", q[HelloStretchIdx.WRIST_PITCH] + ) + pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link, q=q) pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] pin_curr_pose = pin.SE3(pin_rotation, pin_translation) - rot_matrix = R.from_euler('xyz', rotation, degrees=False).as_matrix() + rot_matrix = R.from_euler("xyz", rotation, degrees=False).as_matrix() pin_del_pose = pin.SE3(np.array(rot_matrix), np.array(translation)) pin_goal_pose_new = pin_curr_pose * pin_del_pose @@ -262,7 +287,7 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode # print(f"final pos and quat {final_pos}\n {final_quat}") full_body_cfg = self.robot.solve_ik( - final_pos, final_quat, False, None, False, node_name = self.end_link + final_pos, final_quat, False, None, False, node_name=self.end_link ) if full_body_cfg is None: print("Warning: Cannot find an IK solution for desired EE pose!") diff --git a/src/stretch/dynav/ok_robot_hw/utils/__init__.py b/src/stretch/dynav/ok_robot_hw/utils/__init__.py index c7a2ffae2..5baa346ae 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/__init__.py +++ b/src/stretch/dynav/ok_robot_hw/utils/__init__.py @@ -1,3 +1,12 @@ -from .grasper_utils import * +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + from .communication_utils import * -from .utils import * \ No newline at end of file +from .grasper_utils import * +from .utils import * diff --git a/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py b/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py index ad2396b70..6e9775237 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py @@ -1,47 +1,65 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. 
+# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import cv2 import numpy as np import zmq -import cv2 + # use zmq to send a numpy array def send_array(socket, A, flags=0, copy=True, track=False): """send a numpy array with metadata""" A = np.array(A) md = dict( - dtype = str(A.dtype), - shape = A.shape, + dtype=str(A.dtype), + shape=A.shape, ) - socket.send_json(md, flags|zmq.SNDMORE) + socket.send_json(md, flags | zmq.SNDMORE) return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + # use zmq to receive a numpy array def recv_array(socket, flags=0, copy=True, track=False): """recv a numpy array""" md = socket.recv_json(flags=flags) msg = socket.recv(flags=flags, copy=copy, track=track) - A = np.frombuffer(msg, dtype=md['dtype']) - return A.reshape(md['shape']) + A = np.frombuffer(msg, dtype=md["dtype"]) + return A.reshape(md["shape"]) + def send_rgb_img(socket, img): - img = img.astype(np.uint8) + img = img.astype(np.uint8) encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] - _, img_encoded = cv2.imencode('.jpg', img, encode_param) + _, img_encoded = cv2.imencode(".jpg", img, encode_param) socket.send(img_encoded.tobytes()) + def recv_rgb_img(socket): img = socket.recv() img = np.frombuffer(img, dtype=np.uint8) img = cv2.imdecode(img, cv2.IMREAD_COLOR) return img + def send_depth_img(socket, depth_img): depth_img = (depth_img * 1000).astype(np.uint16) - encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 3] # Compression level from 0 (no compression) to 9 (max compression) - _, depth_img_encoded = cv2.imencode('.png', depth_img, encode_param) + encode_param = [ + int(cv2.IMWRITE_PNG_COMPRESSION), + 3, + ] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode(".png", depth_img, encode_param) socket.send(depth_img_encoded.tobytes()) + def recv_depth_img(socket): depth_img = socket.recv() depth_img = np.frombuffer(depth_img, dtype=np.uint8) depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) - depth_img = (depth_img / 1000.) - return depth_img \ No newline at end of file + depth_img = depth_img / 1000.0 + return depth_img diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index f27de8c02..3f1ceae4c 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -1,15 +1,26 @@ -import time +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
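For reference, send_array/recv_array in communication_utils.py above implement a two-frame ZMQ protocol: a JSON metadata frame (dtype and shape) sent with SNDMORE, followed by the raw contiguous buffer. A minimal round trip over an in-process REQ/REP pair looks like the sketch below; the inproc endpoint name is illustrative (the real sockets connect over tcp://), and the import assumes the stretch package from this patch is on the path.

import numpy as np
import zmq

from stretch.dynav.ok_robot_hw.utils.communication_utils import recv_array, send_array

ctx = zmq.Context.instance()
rep = ctx.socket(zmq.REP)
rep.bind("inproc://array_demo")
req = ctx.socket(zmq.REQ)
req.connect("inproc://array_demo")

# REQ side: JSON metadata frame (dtype, shape) + contiguous buffer frame.
send_array(req, np.arange(6, dtype=np.float32).reshape(2, 3))

# REP side: rebuild the array from the two frames, then acknowledge so the
# REQ/REP state machine ends in a clean state.
A = recv_array(rep)
rep.send_string("ok")
req.recv_string()
print(A.shape, A.dtype)  # (2, 3) float32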
+ import math +import time + import numpy as np import pinocchio as pin -from stretch.dynav.ok_robot_hw.image_publisher import ImagePublisher from stretch.dynav.ok_robot_hw.global_parameters import * +from stretch.dynav.ok_robot_hw.image_publisher import ImagePublisher from stretch.dynav.ok_robot_hw.utils.utils import apply_se3_transform + def capture_and_process_image(camera, mode, obj, socket, hello_robot): - - print('Currently in ' + mode + ' mode and the robot is about to manipulate ' + obj + '.') + + print("Currently in " + mode + " mode and the robot is about to manipulate " + obj + ".") image_publisher = ImagePublisher(camera, socket) @@ -20,31 +31,33 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): head_tilt = INIT_HEAD_TILT head_pan = INIT_HEAD_PAN - while(retry_flag): - translation, rotation, depth, width, retry_flag = image_publisher.publish_image(obj, mode, head_tilt=head_tilt) + while retry_flag: + translation, rotation, depth, width, retry_flag = image_publisher.publish_image( + obj, mode, head_tilt=head_tilt + ) print(f"retry flag : {retry_flag}") - if (retry_flag == 1): + if retry_flag == 1: base_trans = translation[0] - head_tilt += (rotation[0]) + head_tilt += rotation[0] - hello_robot.move_to_position(base_trans=base_trans, - head_pan=head_pan, - head_tilt=head_tilt) + hello_robot.move_to_position( + base_trans=base_trans, head_pan=head_pan, head_tilt=head_tilt + ) time.sleep(1) - - elif (retry_flag !=0 and side_retries == 3): + + elif retry_flag != 0 and side_retries == 3: print("Tried in all angles but couldn't succed") time.sleep(1) return None, None, None, None - elif (side_retries == 2 and tilt_retries == 3): + elif side_retries == 2 and tilt_retries == 3: hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) side_retries = 3 elif retry_flag == 2: - if (tilt_retries == 3): - if (side_retries == 0): + if tilt_retries == 3: + if side_retries == 0: hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) side_retries = 1 else: @@ -53,8 +66,9 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): tilt_retries = 1 else: print(f"retrying with head tilt : {head_tilt + head_tilt_angles[tilt_retries]}") - hello_robot.move_to_position(head_pan=head_pan, - head_tilt=head_tilt + head_tilt_angles[tilt_retries]) + hello_robot.move_to_position( + head_pan=head_pan, head_tilt=head_tilt + head_tilt_angles[tilt_retries] + ) tilt_retries += 1 time.sleep(1) @@ -69,11 +83,9 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): def move_to_point(robot, point, base_node, gripper_node, move_mode=1, pitch_rotation=0): """ - Function for moving the gripper to a specific point + Function for moving the gripper to a specific point """ - rotation = np.array([[1, 0, 0], - [0, 1, 0], - [0, 0, 1]]) + rotation = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) dest_frame = pin.SE3(rotation, point) transform = robot.get_joint_transform(base_node, gripper_node) @@ -84,38 +96,60 @@ def move_to_point(robot, point, base_node, gripper_node, move_mode=1, pitch_rota transformed_frame.translation[2] -= 0.2 robot.move_to_pose( - [transformed_frame.translation[0], transformed_frame.translation[1], transformed_frame.translation[2]], - [pitch_rotation, 0, 0], - [1], - move_mode=move_mode - ) + [ + transformed_frame.translation[0], + transformed_frame.translation[1], + transformed_frame.translation[2], + ], + [pitch_rotation, 0, 0], + [1], + move_mode=move_mode, + ) -def pickup(robot, rotation, translation, base_node, gripper_node, 
gripper_height = 0.03, gripper_depth=0.03, gripper_width = 1): + +def pickup( + robot, + rotation, + translation, + base_node, + gripper_node, + gripper_height=0.03, + gripper_depth=0.03, + gripper_width=1, +): """ - rotation: Relative rotation of gripper pose w.r.t camera - translation: Relative translation of gripper pose w.r.t camera - base_node: Camera Node + rotation: Relative rotation of gripper pose w.r.t camera + translation: Relative translation of gripper pose w.r.t camera + base_node: Camera Node - Supports home robot top down grasping as well + Supports home robot top down grasping as well - Graping trajectory steps - 1. Rotation of gripper - 2. Lift the gripper - 3. Move the base such gripper in line with the grasp - 4. Gradually Move the gripper to the desired position + Graping trajectory steps + 1. Rotation of gripper + 2. Lift the gripper + 3. Move the base such gripper in line with the grasp + 4. Gradually Move the gripper to the desired position """ # Transforming the final point from Model camera frame to robot camera frame pin_point = np.array([-translation[1], -translation[0], translation[2]]) # Rotation from Camera frame to Model frame - rotation1_bottom_mat = np.array([[0.0000000, -1.0000000, 0.0000000], - [-1.0000000, 0.0000000, 0.0000000], - [0.0000000, 0.0000000, 1.0000000]]) + rotation1_bottom_mat = np.array( + [ + [0.0000000, -1.0000000, 0.0000000], + [-1.0000000, 0.0000000, 0.0000000], + [0.0000000, 0.0000000, 1.0000000], + ] + ) # Rotation from model frame to pose frame - rotation1_mat = np.array([[rotation[0][0], rotation[0][1], rotation[0][2]], - [rotation[1][0], rotation[1][1], rotation[1][2]], - [rotation[2][0], rotation[2][1], rotation[2][2]]]) + rotation1_mat = np.array( + [ + [rotation[0][0], rotation[0][1], rotation[0][2]], + [rotation[1][0], rotation[1][1], rotation[1][2]], + [rotation[2][0], rotation[2][1], rotation[2][2]], + ] + ) # Rotation from camera frame to pose frame pin_rotation = np.dot(rotation1_bottom_mat, rotation1_mat) @@ -133,29 +167,26 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # print(f"pin_transformed frame {pin_transformed_frame}") # Lifting the arm to high position as part of pregrasping position - print('pan, tilt before', robot.robot.get_pan_tilt()) - robot.move_to_position(gripper_pos = gripper_width) - robot.move_to_position(lift_pos = 1.05, head_pan = None, head_tilt = None) - print('pan, tilt after', robot.robot.get_pan_tilt()) + print("pan, tilt before", robot.robot.get_pan_tilt()) + robot.move_to_position(gripper_pos=gripper_width) + robot.move_to_position(lift_pos=1.05, head_pan=None, head_tilt=None) + print("pan, tilt after", robot.robot.get_pan_tilt()) # Rotation for aligning Robot gripper frame to Model gripper frame - rotation2_top_mat = np.array([[0, 0, 1], - [1, 0, 0], - [0, -1, 0]]) - + rotation2_top_mat = np.array([[0, 0, 1], [1, 0, 0], [0, -1, 0]]) # final Rotation of gripper to hold the objet pin_final_rotation = np.dot(pin_transformed_frame.rotation, rotation2_top_mat) print(f"pin final rotation {pin_final_rotation}") rpy_angles = pin.rpy.matrixToRpy(pin_final_rotation) - print('pan, tilt before', robot.robot.get_pan_tilt()) + print("pan, tilt before", robot.robot.get_pan_tilt()) robot.move_to_pose( - [0, 0, 0], - [rpy_angles[0], rpy_angles[1], rpy_angles[2]], - [1], - ) - print('pan, tilt after', robot.robot.get_pan_tilt()) + [0, 0, 0], + [rpy_angles[0], rpy_angles[1], rpy_angles[2]], + [1], + ) + print("pan, tilt after", robot.robot.get_pan_tilt()) # Final grasping 
point relative to camera pin_cam2gripper_transform = robot.get_joint_transform(base_node, gripper_node) @@ -163,27 +194,29 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # print(f"pin transformed point1 {pin_transformed_point1}") # Final grasping point relative to base - pin_cam2base_transform = robot.get_joint_transform(base_node, 'base_link') + pin_cam2base_transform = robot.get_joint_transform(base_node, "base_link") pin_base_point = apply_se3_transform(pin_cam2base_transform, pin_point) # print(f"pin base point {pin_base_point}") - diff_value = (0.226 - gripper_depth - gripper_height) # 0.228 is the distance between link_Straight_gripper node and the gripper tip - pin_transformed_point1[2] -= (diff_value) - ref_diff = (diff_value) + diff_value = ( + 0.226 - gripper_depth - gripper_height + ) # 0.228 is the distance between link_Straight_gripper node and the gripper tip + pin_transformed_point1[2] -= diff_value + ref_diff = diff_value # Moving gripper to a point that is 0.2m away from the pose center in the line of gripper - print('pan, tilt before', robot.robot.get_pan_tilt()) + print("pan, tilt before", robot.robot.get_pan_tilt()) robot.move_to_pose( [pin_transformed_point1[0], pin_transformed_point1[1], pin_transformed_point1[2] - 0.2], [0, 0, 0], [1], - move_mode = 1 + move_mode=1, ) - print('pan, tilt after', robot.robot.get_pan_tilt()) + print("pan, tilt after", robot.robot.get_pan_tilt()) # Z-Axis of link_straight_gripper points in line of gripper # So, the z co-ordiante of point w.r.t gripper gives the distance of point from gripper - pin_base2gripper_transform = robot.get_joint_transform('base_link', gripper_node) + pin_base2gripper_transform = robot.get_joint_transform("base_link", gripper_node) pin_transformed_point2 = apply_se3_transform(pin_base2gripper_transform, pin_base_point) curr_diff = pin_transformed_point2[2] @@ -194,31 +227,23 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height dist = diff - 0.08 state = robot.robot.get_six_joints() state[1] += 0.02 - robot.robot.arm_to(state, blocking = True) - robot.move_to_pose( - [0, 0, dist], - [0, 0, 0], - [1] - ) + robot.robot.arm_to(state, blocking=True) + robot.move_to_pose([0, 0, dist], [0, 0, 0], [1]) diff = diff - dist - + while diff > 0.01: dist = min(0.03, diff) - robot.move_to_pose( - [0, 0, dist], - [0, 0, 0], - [1] - ) + robot.move_to_pose([0, 0, dist], [0, 0, 0], [1]) diff = diff - dist - + # Now the gripper reached the grasping point and starts picking procedure robot.pickup(gripper_width) # Lifts the arm - robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1] + 0.2, 1.1)) + robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1] + 0.2, 1.1)) # Tucks the gripper so that while moving to place it wont collide with any obstacles - robot.move_to_position(arm_pos = 0.01) - robot.move_to_position(wrist_pitch = 0.0) - robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.9), wrist_yaw = 2.5) - robot.move_to_position(lift_pos = min(robot.robot.get_six_joints()[1], 0.55)) + robot.move_to_position(arm_pos=0.01) + robot.move_to_position(wrist_pitch=0.0) + robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1], 0.9), wrist_yaw=2.5) + robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1], 0.55)) diff --git a/src/stretch/dynav/ok_robot_hw/utils/utils.py b/src/stretch/dynav/ok_robot_hw/utils/utils.py index 1c60e52ee..27ace6fb0 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/utils.py +++ 
b/src/stretch/dynav/ok_robot_hw/utils/utils.py @@ -1,5 +1,15 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + import numpy as np + def apply_se3_transform(se3_obj, point): homogeneous_point = np.append(point.flatten(), 1) print(homogeneous_point) @@ -8,14 +18,15 @@ def apply_se3_transform(se3_obj, point): return transformed_point + def transform_joint_array(joint_array): n = len(joint_array) new_joint_array = [] - for i in range(n+3): + for i in range(n + 3): if i < 2: new_joint_array.append(joint_array[i]) elif i < 6: - new_joint_array.append(joint_array[2]/4.0) + new_joint_array.append(joint_array[2] / 4.0) else: - new_joint_array.append(joint_array[i-3]) - return np.array(new_joint_array) \ No newline at end of file + new_joint_array.append(joint_array[i - 3]) + return np.array(new_joint_array) diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 858b1c83e..ec74828c4 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the @@ -6,32 +15,32 @@ import datetime import os import pickle +import threading import time +from multiprocessing import Process from pathlib import Path from typing import Any, Dict, List, Optional, Tuple +import cv2 import numpy as np import torch - -from stretch.core.parameters import Parameters, get_parameters -from stretch.agent import RobotClient - import zmq - from matplotlib import pyplot as plt +from stretch.agent import RobotClient +from stretch.core.parameters import Parameters, get_parameters +from stretch.dynav.ok_robot_hw.camera import RealSenseCamera from stretch.dynav.ok_robot_hw.global_parameters import * from stretch.dynav.ok_robot_hw.robot import HelloRobot as Manipulation_Wrapper -from stretch.dynav.ok_robot_hw.camera import RealSenseCamera -from stretch.dynav.ok_robot_hw.utils.grasper_utils import pickup, move_to_point, capture_and_process_image -from stretch.dynav.ok_robot_hw.utils.communication_utils import send_array, recv_array +from stretch.dynav.ok_robot_hw.utils.communication_utils import recv_array, send_array +from stretch.dynav.ok_robot_hw.utils.grasper_utils import ( + capture_and_process_image, + move_to_point, + pickup, +) # from stretch.dynav.llm_server import ImageProcessor -import cv2 - -import threading -from multiprocessing import Process class RobotAgentMDP: """Basic demo code. 
Collects everything that we need to make this work.""" @@ -49,9 +58,9 @@ def __init__( re: int = 1, env_num: int = 1, test_num: int = 1, - method: str = 'dynamem' + method: str = "dynamem", ): - print('------------------------YOU ARE NOW RUNNING PEIQI CODES V3-----------------') + print("------------------------YOU ARE NOW RUNNING PEIQI CODES V3-----------------") self.re = re if isinstance(parameters, Dict): self.parameters = Parameters(**parameters) @@ -67,7 +76,9 @@ def __init__( stretch_gripper_max = 0.64 end_link = "link_gripper_s3_body" self.transform_node = end_link - self.manip_wrapper = Manipulation_Wrapper(self.robot, stretch_gripper_max = stretch_gripper_max, end_link = end_link) + self.manip_wrapper = Manipulation_Wrapper( + self.robot, stretch_gripper_max=stretch_gripper_max, end_link=end_link + ) self.robot.move_to_nav_posture() self.normalize_embeddings = True @@ -75,21 +86,27 @@ def __init__( self.rot_err_threshold = 0.4 self.obs_count = 0 self.obs_history = [] - self.guarantee_instance_is_reachable = ( - parameters.guarantee_instance_is_reachable - ) + self.guarantee_instance_is_reachable = parameters.guarantee_instance_is_reachable - self.image_sender = ImageSender(ip = ip, image_port = image_port, text_port = text_port, manip_port = manip_port) - if method == 'dynamem': + self.image_sender = ImageSender( + ip=ip, image_port=image_port, text_port=text_port, manip_port=manip_port + ) + if method == "dynamem": from stretch.dynav.voxel_map_server import ImageProcessor - self.image_processor = ImageProcessor(rerun = True, static = False, log = 'env' + str(env_num) + '_' + str(test_num)) - elif method == 'mllm': + + self.image_processor = ImageProcessor( + rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) + ) + elif method == "mllm": from stretch.dynav.llm_server import ImageProcessor - self.image_processor = ImageProcessor(rerun = True, static = False, log = 'env' + str(env_num) + '_' + str(test_num)) + + self.image_processor = ImageProcessor( + rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) + ) self.look_around_times = [] self.execute_times = [] - + timestamp = f"{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}" # self.head_lock = threading.Lock() @@ -99,18 +116,18 @@ def look_around(self): for pan in [0.4, -0.4, -1.2]: for tilt in [-0.6]: start_time = time.time() - self.robot.head_to(pan, tilt, blocking = True) + self.robot.head_to(pan, tilt, blocking=True) end_time = time.time() - print('moving head takes ', end_time - start_time, 'seconds.') + print("moving head takes ", end_time - start_time, "seconds.") self.update() def rotate_in_place(self): print("*" * 10, "Rotate in place", "*" * 10) xyt = self.robot.get_base_pose() - self.robot.head_to(head_pan = 0, head_tilt = -0.6, blocking = True) + self.robot.head_to(head_pan=0, head_tilt=-0.6, blocking=True) for i in range(8): xyt[2] += 2 * np.pi / 8 - self.robot.navigate_to(xyt, blocking = True) + self.robot.navigate_to(xyt, blocking=True) self.update() def update(self): @@ -138,14 +155,14 @@ def execute_action( start = self.robot.get_base_pose() # print(" Start:", start) - # res = self.image_sender.query_text(text, start) + # res = self.image_sender.query_text(text, start) res = self.image_processor.process_text(text, start) - if len(res) == 0 and text != '' and text is not None: - res = self.image_processor.process_text('', start) + if len(res) == 0 and text != "" and text is not None: + res = self.image_processor.process_text("", start) look_around_finish = time.time() 
look_around_take = look_around_finish - start_time - print('Path planning takes ', look_around_take, ' seconds.') + print("Path planning takes ", look_around_take, " seconds.") # self.look_around_times.append(look_around_take) # print(self.look_around_times) # print(sum(self.look_around_times) / len(self.look_around_times)) @@ -159,12 +176,12 @@ def execute_action( res[:-2], pos_err_threshold=self.pos_err_threshold, rot_err_threshold=self.rot_err_threshold, - blocking = True + blocking=True, ) execution_finish = time.time() execution_take = execution_finish - look_around_finish - print('Executing action takes ', execution_take, ' seconds.') + print("Executing action takes ", execution_take, " seconds.") self.execute_times.append(execution_take) print(self.execute_times) print(sum(self.execute_times) / len(self.execute_times)) @@ -175,12 +192,12 @@ def execute_action( res, pos_err_threshold=self.pos_err_threshold, rot_err_threshold=self.rot_err_threshold, - blocking = False + blocking=False, ) execution_finish = time.time() execution_take = execution_finish - look_around_finish - print('Executing action takes ', execution_take, ' seconds.') + print("Executing action takes ", execution_take, " seconds.") self.execute_times.append(execution_take) print(self.execute_times) print(sum(self.execute_times) / len(self.execute_times)) @@ -190,9 +207,7 @@ def execute_action( print("Failed. Try again!") return None, None - def run_exploration( - self - ): + def run_exploration(self): """Go through exploration. We use the voxel_grid map created by our collector to sample free space, and then use our motion planner (RRT for now) to get there. At the end, we plan back to (0,0,0). Args: @@ -203,119 +218,132 @@ def run_exploration( return False return True - def navigate(self, text, max_step = 10): + def navigate(self, text, max_step=10): finished = False step = 0 end_point = None while not finished and step < max_step: - print('*' * 20, step, '*' * 20) + print("*" * 20, step, "*" * 20) step += 1 - finished, end_point = self.execute_action(text) + finished, end_point = self.execute_action(text) if finished is None: print("Navigation failed! The path might be blocked!") return None return end_point - def place(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): - ''' - An API for running placing. By calling this API, human will ask the robot to place whatever it holds - onto objects specified by text queries A - - hello_robot: a wrapper for home-robot StretchClient controller - - socoket: we use this to communicate with workstation to get estimated gripper pose - - text: queries specifying target object - - transform node: node name for coordinate systems of target gripper pose (usually the coordinate system on the robot gripper) - - base node: node name for coordinate systems of estimated gipper poses given by anygrasp - ''' + def place(self, text, init_tilt=INIT_HEAD_TILT, base_node=TOP_CAMERA_NODE): + """ + An API for running placing. 
By calling this API, human will ask the robot to place whatever it holds + onto objects specified by text queries A + - hello_robot: a wrapper for home-robot StretchClient controller + - socoket: we use this to communicate with workstation to get estimated gripper pose + - text: queries specifying target object + - transform node: node name for coordinate systems of target gripper pose (usually the coordinate system on the robot gripper) + - base node: node name for coordinate systems of estimated gipper poses given by anygrasp + """ self.robot.switch_to_manipulation_mode() self.robot.look_at_ee() - self.manip_wrapper.move_to_position( - head_pan=INIT_HEAD_PAN, - head_tilt=init_tilt) + self.manip_wrapper.move_to_position(head_pan=INIT_HEAD_PAN, head_tilt=init_tilt) camera = RealSenseCamera(self.robot) rotation, translation = capture_and_process_image( - camera = camera, - mode = 'place', - obj = text, - socket = self.image_sender.manip_socket, - hello_robot = self.manip_wrapper) + camera=camera, + mode="place", + obj=text, + socket=self.image_sender.manip_socket, + hello_robot=self.manip_wrapper, + ) if rotation is None: return False # lift arm to the top before the robot extends the arm, prepare the pre-placing gripper pose self.manip_wrapper.move_to_position(lift_pos=1.05) - self.manip_wrapper.move_to_position(wrist_yaw=0, - wrist_pitch=0) + self.manip_wrapper.move_to_position(wrist_yaw=0, wrist_pitch=0) # Placing the object move_to_point(self.manip_wrapper, translation, base_node, self.transform_node, move_mode=0) - self.manip_wrapper.move_to_position(gripper_pos=1, blocking = True) + self.manip_wrapper.move_to_position(gripper_pos=1, blocking=True) # Lift the arm a little bit, and rotate the wrist roll of the robot in case the object attached on the gripper - self.manip_wrapper.move_to_position(lift_pos = min(self.manip_wrapper.robot.get_six_joints()[1] + 0.3, 1.1)) - self.manip_wrapper.move_to_position(wrist_roll = 2.5, blocking = True) - self.manip_wrapper.move_to_position(wrist_roll = -2.5, blocking = True) + self.manip_wrapper.move_to_position( + lift_pos=min(self.manip_wrapper.robot.get_six_joints()[1] + 0.3, 1.1) + ) + self.manip_wrapper.move_to_position(wrist_roll=2.5, blocking=True) + self.manip_wrapper.move_to_position(wrist_roll=-2.5, blocking=True) # Wait for some time and shrink the arm back - self.manip_wrapper.move_to_position(gripper_pos=1, - lift_pos = 1.05, - arm_pos = 0) + self.manip_wrapper.move_to_position(gripper_pos=1, lift_pos=1.05, arm_pos=0) self.manip_wrapper.move_to_position(wrist_pitch=-1.57) # Shift the base back to the original point as we are certain that orginal point is navigable in navigation obstacle map - self.manip_wrapper.move_to_position(base_trans = -self.manip_wrapper.robot.get_six_joints()[0]) + self.manip_wrapper.move_to_position( + base_trans=-self.manip_wrapper.robot.get_six_joints()[0] + ) return True - def manipulate(self, text, init_tilt = INIT_HEAD_TILT, base_node = TOP_CAMERA_NODE): - ''' - An API for running manipulation. 
By calling this API, human will ask the robot to pick up objects - specified by text queries A - - hello_robot: a wrapper for home-robot StretchClient controller - - socoket: we use this to communicate with workstation to get estimated gripper pose - - text: queries specifying target object - - transform node: node name for coordinate systems of target gripper pose (usually the coordinate system on the robot gripper) - - base node: node name for coordinate systems of estimated gipper poses given by anygrasp - ''' + def manipulate(self, text, init_tilt=INIT_HEAD_TILT, base_node=TOP_CAMERA_NODE): + """ + An API for running manipulation. By calling this API, human will ask the robot to pick up objects + specified by text queries A + - hello_robot: a wrapper for home-robot StretchClient controller + - socoket: we use this to communicate with workstation to get estimated gripper pose + - text: queries specifying target object + - transform node: node name for coordinate systems of target gripper pose (usually the coordinate system on the robot gripper) + - base node: node name for coordinate systems of estimated gipper poses given by anygrasp + """ self.robot.switch_to_manipulation_mode() self.robot.look_at_ee() gripper_pos = 1 - self.manip_wrapper.move_to_position(arm_pos=INIT_ARM_POS, - head_pan=INIT_HEAD_PAN, - head_tilt=init_tilt, - gripper_pos = gripper_pos, - lift_pos=INIT_LIFT_POS, - wrist_pitch = INIT_WRIST_PITCH, - wrist_roll = INIT_WRIST_ROLL, - wrist_yaw = INIT_WRIST_YAW) + self.manip_wrapper.move_to_position( + arm_pos=INIT_ARM_POS, + head_pan=INIT_HEAD_PAN, + head_tilt=init_tilt, + gripper_pos=gripper_pos, + lift_pos=INIT_LIFT_POS, + wrist_pitch=INIT_WRIST_PITCH, + wrist_roll=INIT_WRIST_ROLL, + wrist_yaw=INIT_WRIST_YAW, + ) camera = RealSenseCamera(self.robot) rotation, translation, depth, width = capture_and_process_image( - camera = camera, - mode = 'pick', - obj = text, - socket = self.image_sender.manip_socket, - hello_robot = self.manip_wrapper) - + camera=camera, + mode="pick", + obj=text, + socket=self.image_sender.manip_socket, + hello_robot=self.manip_wrapper, + ) + if rotation is None: return False - + if width < 0.05 and self.re == 3: gripper_width = 0.45 - elif width < 0.075 and self.re == 3: + elif width < 0.075 and self.re == 3: gripper_width = 0.6 else: gripper_width = 1 - if input('Do you want to do this manipulation? Y or N ') != 'N': - pickup(self.manip_wrapper, rotation, translation, base_node, self.transform_node, gripper_depth = depth, gripper_width = gripper_width) - + if input("Do you want to do this manipulation? 
Y or N ") != "N": + pickup( + self.manip_wrapper, + rotation, + translation, + base_node, + self.transform_node, + gripper_depth=depth, + gripper_width=gripper_width, + ) + # Shift the base back to the original point as we are certain that orginal point is navigable in navigation obstacle map - self.manip_wrapper.move_to_position(base_trans = -self.manip_wrapper.robot.get_six_joints()[0]) + self.manip_wrapper.move_to_position( + base_trans=-self.manip_wrapper.robot.get_six_joints()[0] + ) return True @@ -323,48 +351,58 @@ def save(self): with self.image_processor.voxel_map_lock: self.image_processor.write_to_pickle() + def send_array(socket, A, flags=0, copy=True, track=False): """send a numpy array with metadata""" A = np.array(A) md = dict( - dtype = str(A.dtype), - shape = A.shape, + dtype=str(A.dtype), + shape=A.shape, ) - socket.send_json(md, flags|zmq.SNDMORE) + socket.send_json(md, flags | zmq.SNDMORE) return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + def recv_array(socket, flags=0, copy=True, track=False): """recv a numpy array""" md = socket.recv_json(flags=flags) msg = socket.recv(flags=flags, copy=copy, track=track) - A = np.frombuffer(msg, dtype=md['dtype']) - return A.reshape(md['shape']) + A = np.frombuffer(msg, dtype=md["dtype"]) + return A.reshape(md["shape"]) + def send_rgb_img(socket, img): - img = img.astype(np.uint8) + img = img.astype(np.uint8) encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] - _, img_encoded = cv2.imencode('.jpg', img, encode_param) + _, img_encoded = cv2.imencode(".jpg", img, encode_param) socket.send(img_encoded.tobytes()) + def recv_rgb_img(socket): img = socket.recv() img = np.frombuffer(img, dtype=np.uint8) img = cv2.imdecode(img, cv2.IMREAD_COLOR) return img + def send_depth_img(socket, depth_img): depth_img = (depth_img * 1000).astype(np.uint16) - encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 3] # Compression level from 0 (no compression) to 9 (max compression) - _, depth_img_encoded = cv2.imencode('.png', depth_img, encode_param) + encode_param = [ + int(cv2.IMWRITE_PNG_COMPRESSION), + 3, + ] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode(".png", depth_img, encode_param) socket.send(depth_img_encoded.tobytes()) + def recv_depth_img(socket): depth_img = socket.recv() depth_img = np.frombuffer(depth_img, dtype=np.uint8) depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) - depth_img = (depth_img / 1000.) 
+ depth_img = depth_img / 1000.0 return depth_img + def send_everything(socket, rgb, depth, intrinsics, pose): send_rgb_img(socket, rgb) socket.recv_string() @@ -375,44 +413,47 @@ def send_everything(socket, rgb, depth, intrinsics, pose): send_array(socket, pose) socket.recv_string() + def recv_everything(socket): rgb = recv_rgb_img(socket) - socket.send_string('') + socket.send_string("") depth = recv_depth_img(socket) - socket.send_string('') + socket.send_string("") intrinsics = recv_array(socket) - socket.send_string('') + socket.send_string("") pose = recv_array(socket) - socket.send_string('') + socket.send_string("") return rgb, depth, intrinsics, pose + class ImageSender: - def __init__(self, - stop_and_photo = False, - ip = '100.108.67.79', - image_port = 5560, - text_port = 5561, - manip_port = 5557, - color_name = "/camera/color", - depth_name = "/camera/aligned_depth_to_color", - camera_name = "/camera_pose", - slop_time_seconds = 0.05, - queue_size = 100, + def __init__( + self, + stop_and_photo=False, + ip="100.108.67.79", + image_port=5560, + text_port=5561, + manip_port=5557, + color_name="/camera/color", + depth_name="/camera/aligned_depth_to_color", + camera_name="/camera_pose", + slop_time_seconds=0.05, + queue_size=100, ): context = zmq.Context() self.img_socket = context.socket(zmq.REQ) - self.img_socket.connect('tcp://' + str(ip) + ':' + str(image_port)) + self.img_socket.connect("tcp://" + str(ip) + ":" + str(image_port)) self.text_socket = context.socket(zmq.REQ) - self.text_socket.connect('tcp://' + str(ip) + ':' + str(text_port)) + self.text_socket.connect("tcp://" + str(ip) + ":" + str(text_port)) self.manip_socket = context.socket(zmq.REQ) - self.manip_socket.connect('tcp://' + str(ip) + ':' + str(manip_port)) + self.manip_socket.connect("tcp://" + str(ip) + ":" + str(manip_port)) def query_text(self, text, start): self.text_socket.send_string(text) self.text_socket.recv_string() send_array(self.text_socket, start) return recv_array(self.text_socket) - + def send_images(self, obs): rgb = obs.rgb depth = obs.depth diff --git a/src/stretch/dynav/run_manip.py b/src/stretch/dynav/run_manip.py index 3f63d4a37..35ae069de 100644 --- a/src/stretch/dynav/run_manip.py +++ b/src/stretch/dynav/run_manip.py @@ -1,3 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + # Copyright (c) Meta Platforms, Inc. and affiliates. 
# # This source code is licensed under the MIT license found in the @@ -6,23 +15,25 @@ import click import numpy as np +from stretch.agent import RobotClient + # Mapping and perception from stretch.core.parameters import Parameters, get_parameters from stretch.dynav import RobotAgentMDP -from stretch.agent import RobotClient def compute_tilt(camera_xyz, target_xyz): - ''' - a util function for computing robot head tilts so the robot can look at the target object after navigation - - camera_xyz: estimated (x, y, z) coordinates of camera - - target_xyz: estimated (x, y, z) coordinates of the target object - ''' + """ + a util function for computing robot head tilts so the robot can look at the target object after navigation + - camera_xyz: estimated (x, y, z) coordinates of camera + - target_xyz: estimated (x, y, z) coordinates of the target object + """ vector = camera_xyz - target_xyz return -np.arctan2(vector[2], np.linalg.norm(vector[:2])) + @click.command() -@click.option("--ip", default='100.108.67.79', type=str) +@click.option("--ip", default="100.108.67.79", type=str) @click.option("--manual-wait", default=False, is_flag=True) @click.option("--random-goals", default=False, is_flag=True) @click.option("--explore-iter", default=-1) @@ -49,7 +60,7 @@ def main( random_goals(bool): randomly sample frontier goals instead of looking for closest """ click.echo("Will connect to a Stretch robot and collect a short trajectory.") - robot = RobotClient(robot_ip = "100.79.44.11") + robot = RobotClient(robot_ip="100.79.44.11") print("- Load parameters") parameters = get_parameters("dynav_config.yaml") @@ -58,18 +69,16 @@ def main( parameters["exploration_steps"] = explore_iter print("- Start robot agent with data collection") - demo = RobotAgentMDP( - robot, parameters, ip = ip, re = re - ) + demo = RobotAgentMDP(robot, parameters, ip=ip, re=re) - while input('STOP? Y/N') != 'Y': - if input('You want to run manipulation: y/n') == 'y': - text = input('Enter object name: ') + while input("STOP? Y/N") != "Y": + if input("You want to run manipulation: y/n") == "y": + text = input("Enter object name: ") theta = -0.6 demo.manipulate(text, theta) - - if input('You want to run placing: y/n') == 'y': - text = input('Enter receptacle name: ') + + if input("You want to run placing: y/n") == "y": + text = input("Enter receptacle name: ") theta = -0.6 demo.place(text, theta) diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 5417a6a73..756c3f9ba 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -1,30 +1,41 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import threading +import time + # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
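compute_tilt above turns the camera-to-target offset into a head tilt: the vertical component of (camera - target) over the horizontal distance, negated so that looking down gives a negative angle. A quick numeric check, repeating the two-line helper so it runs standalone; the camera and target coordinates are illustrative.

import numpy as np


def compute_tilt(camera_xyz, target_xyz):
    # vertical offset over horizontal distance, negated (looking down = negative tilt)
    vector = camera_xyz - target_xyz
    return -np.arctan2(vector[2], np.linalg.norm(vector[:2]))


camera_xyz = np.array([0.0, 0.0, 1.3])  # head camera roughly 1.3 m up (made-up value)
target_xyz = np.array([1.0, 0.0, 0.5])  # object 1 m ahead, 0.5 m off the ground
print(compute_tilt(camera_xyz, target_xyz))  # ~ -0.675 rad, i.e. tilted down about 39 degrees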
import click +import cv2 import matplotlib.pyplot as plt import numpy as np from PIL import Image +from stretch.agent import RobotClient + # Mapping and perception from stretch.core.parameters import Parameters, get_parameters from stretch.dynav import RobotAgentMDP # Chat and UI tools from stretch.utils.point_cloud import numpy_to_pcd, show_point_cloud -from stretch.agent import RobotClient -import cv2 -import time -import threading def compute_tilt(camera_xyz, target_xyz): - ''' - a util function for computing robot head tilts so the robot can look at the target object after navigation - - camera_xyz: estimated (x, y, z) coordinates of camera - - target_xyz: estimated (x, y, z) coordinates of the target object - ''' + """ + a util function for computing robot head tilts so the robot can look at the target object after navigation + - camera_xyz: estimated (x, y, z) coordinates of camera + - target_xyz: estimated (x, y, z) coordinates of the target object + """ if not isinstance(camera_xyz, np.ndarray): camera_xyz = np.array(camera_xyz) if not isinstance(target_xyz, np.ndarray): @@ -32,13 +43,14 @@ def compute_tilt(camera_xyz, target_xyz): vector = camera_xyz - target_xyz return -np.arctan2(vector[2], np.linalg.norm(vector[:2])) + @click.command() -@click.option("--ip", default='100.108.67.79', type=str) +@click.option("--ip", default="100.108.67.79", type=str) @click.option("--manual-wait", default=False, is_flag=True) @click.option("--random-goals", default=False, is_flag=True) @click.option("--explore-iter", default=-1) @click.option("--re", default=1, type=int) -@click.option("--method", default='dynamem', type=str) +@click.option("--method", default="dynamem", type=str) @click.option("--env", default=1, type=int) @click.option("--test", default=1, type=int) @click.option( @@ -53,7 +65,7 @@ def main( navigate_home: bool = False, explore_iter: int = 5, re: int = 1, - method: str = 'dynamem', + method: str = "dynamem", env: int = 1, test: int = 1, input_path: str = None, @@ -66,7 +78,7 @@ def main( random_goals(bool): randomly sample frontier goals instead of looking for closest """ click.echo("Will connect to a Stretch robot and collect a short trajectory.") - robot = RobotClient(robot_ip = "100.79.44.11") + robot = RobotClient(robot_ip="100.79.44.11") print("- Load parameters") parameters = get_parameters("dynav_config.yaml") @@ -75,12 +87,10 @@ def main( parameters["exploration_steps"] = explore_iter object_to_find, location_to_place = None, None robot.move_to_nav_posture() - robot.set_velocity(v = 30., w = 15.) + robot.set_velocity(v=30.0, w=15.0) print("- Start robot agent with data collection") - demo = RobotAgentMDP( - robot, parameters, ip = ip, re = re, env_num = env, test_num = test, method = method - ) + demo = RobotAgentMDP(robot, parameters, ip=ip, re=re, env_num=env, test_num=test, method=method) if input_path is None: demo.rotate_in_place() @@ -102,38 +112,38 @@ def main( # img_thread.start() while True: - mode = input('select mode? E/N/S') - if mode == 'S': + mode = input("select mode? E/N/S") + if mode == "S": demo.image_processor.write_to_pickle() break - if mode == 'E': + if mode == "E": robot.switch_to_navigation_mode() for epoch in range(explore_iter): - print('\n', 'Exploration epoch ', epoch, '\n') + print("\n", "Exploration epoch ", epoch, "\n") if not demo.run_exploration(): - print('Exploration failed! Quitting!') + print("Exploration failed! 
Quitting!") continue else: text = None point = None - if input('You want to run manipulation: y/n') != 'n': + if input("You want to run manipulation: y/n") != "n": robot.move_to_nav_posture() robot.switch_to_navigation_mode() - text = input('Enter object name: ') + text = input("Enter object name: ") point = demo.navigate(text) if point is None: - print('Navigation Failure!') - cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) + print("Navigation Failure!") + cv2.imwrite(text + ".jpg", robot.get_observation().rgb[:, :, [2, 1, 0]]) - if input('You want to run manipulation: y/n') != 'n': + if input("You want to run manipulation: y/n") != "n": robot.switch_to_navigation_mode() xyt = robot.get_base_pose() xyt[2] = xyt[2] + np.pi / 2 - robot.navigate_to(xyt, blocking = True) + robot.navigate_to(xyt, blocking=True) robot.switch_to_manipulation_mode() if text is None: - text = input('Enter object name: ') + text = input("Enter object name: ") camera_xyz = robot.get_head_pose()[:3, 3] if point is not None: theta = compute_tilt(camera_xyz, point) @@ -141,26 +151,26 @@ def main( theta = -0.6 demo.manipulate(text, theta) robot.look_front() - + text = None point = None - if input('You want to run placing: y/n') != 'n': + if input("You want to run placing: y/n") != "n": robot.switch_to_navigation_mode() - text = input('Enter receptacle name: ') + text = input("Enter receptacle name: ") point = demo.navigate(text) if point is None: - print('Navigation Failure') - cv2.imwrite(text + '.jpg', robot.get_observation().rgb[:, :, [2, 1, 0]]) - - if input('You want to run placing: y/n') != 'n': + print("Navigation Failure") + cv2.imwrite(text + ".jpg", robot.get_observation().rgb[:, :, [2, 1, 0]]) + + if input("You want to run placing: y/n") != "n": robot.switch_to_navigation_mode() xyt = robot.get_base_pose() xyt[2] = xyt[2] + np.pi / 2 - robot.navigate_to(xyt, blocking = True) - + robot.navigate_to(xyt, blocking=True) + robot.switch_to_manipulation_mode() if text is None: - text = input('Enter receptacle name: ') + text = input("Enter receptacle name: ") camera_xyz = robot.get_head_pose()[:3, 3] if point is not None: theta = compute_tilt(camera_xyz, point) diff --git a/src/stretch/dynav/scannet.py b/src/stretch/dynav/scannet.py index 2fe92aaae..6d16e1a9f 100644 --- a/src/stretch/dynav/scannet.py +++ b/src/stretch/dynav/scannet.py @@ -1,209 +1,217 @@ -CLASS_LABELS_200 =( - 'wall', - 'background', - 'object', - 'range hood', - 'door', - 'stove', - 'bathroom stall', - 'divider', - 'clothes dryer', - 'file cabinet', - 'cabinet', - 'bathroom cabinet', - 'blackboard', - 'curtain', - 'blinds', - 'tv', - 'mattress', - 'bathtub', - 'glass', - 'shower', - 'power outlet', - 'water dispenser', - 'bulletin board', - 'printer', - 'windowsill', - 'bookshelf', - 'dresser', - 'storage organizer', - 'fireplace', - 'fire alarm', - 'piano', - 'shelf', - 'kitchen counter', - 'washing machine', - 'stairs', - 'paper cutter', - 'sink', - 'window', - 'refrigerator', - 'counter', - 'closet', - 'bathroom vanity', - 'radiator', - 'vent', - 'kitchen cabinet', - 'water cooler', - 'doorframe', - 'closet door', - 'table', +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
- 'shower head', - 'spray', - 'inhaler', - 'guitar case', - 'plunger', - 'toilet paper dispenser', - 'adapter', - 'soy sauce', - 'pipe', - 'bottle', - 'scale', - 'paper towel', - 'paper towel roll', - 'scissors', - 'tape', - 'chopsticks', - 'case of water bottles', - 'hand sanitizer', - 'laptop', - 'alcohol disinfection', - 'keyboard', - 'coffee maker', - 'light', - 'toaster', - 'stuffed animal', - 'toilet seat cover dispenser', - 'ironing board', - 'fire extinguisher', - 'fruit', - 'container', - 'bag', - 'oven', - 'body wash', - 'bucket', - 'cd case', - 'tray', - 'bowl', - 'speaker', - 'crate', - 'projector', - 'book', - 'school bag', - 'laundry detergent', - 'clothes', - 'candle', - 'basket', - 'face wash', - 'notebook', - 'purse', - 'trash bin', - 'paper bag', - 'package', - 'disinfecting wipes', - 'recycling bin', - 'headphones', - 'mouse', - 'shower gel', - 'dustpan', - 'cup', - 'vacuum cleaner', - 'dish rack', - 'coffee kettle', - 'plants', - 'rag', - 'can', - 'cushion', - 'monitor', - 'fan', - 'tube', - 'box', - 'ball', - 'bicycle', - 'guitar', - 'trash can', - 'hand sanitizers', - 'paper towel dispenser', - 'whiteboard', - 'bin', - 'potted plant', - 'tennis', - 'soap dish', - 'structure', - 'calendar', - 'dumbbell', - 'fish oil', - 'ottoman', - 'stool', - 'hand wash', - 'lamp', - 'toaster oven', - 'music stand', - 'water bottle', - 'clock', - 'charger', - 'picture', - 'bascketball', - 'microwave', - 'screwdriver', - 'rack', - 'apple', - 'suitcase', - 'ladder', - 'ping pong ball', - 'dishwasher', - 'storage container', - 'toilet paper holder', - 'coat rack', - 'soap dispenser', - 'banana', - 'toilet paper', - 'mug', - 'marker pen', - 'hat', - 'aerosol', - 'luggage', - 'poster', - 'bed', - 'cart', - 'light switch', - 'backpack', - 'power strip', - 'baseball', - 'mustard', - 'water pitcher', - 'couch', - 'beverage', - 'toy', - 'salt', - 'plant', - 'pillow', - 'broom', - 'pepper', - 'muffins', - 'multivitamin', - 'towel', - 'storage bin', - 'nightstand', - 'telephone', - 'tissue box', - 'hair dryer', - 'mirror', - 'sign', - 'plate', - 'tripod', - 'chair', - 'plastic bag', - 'umbrella', - 'paper', - 'laundry hamper', - 'food', - 'jacket', - 'computer tower', - 'keyboard piano', - 'person', - 'machine', - 'projector screen', - 'shoe', -) \ No newline at end of file +CLASS_LABELS_200 = ( + "wall", + "background", + "object", + "range hood", + "door", + "stove", + "bathroom stall", + "divider", + "clothes dryer", + "file cabinet", + "cabinet", + "bathroom cabinet", + "blackboard", + "curtain", + "blinds", + "tv", + "mattress", + "bathtub", + "glass", + "shower", + "power outlet", + "water dispenser", + "bulletin board", + "printer", + "windowsill", + "bookshelf", + "dresser", + "storage organizer", + "fireplace", + "fire alarm", + "piano", + "shelf", + "kitchen counter", + "washing machine", + "stairs", + "paper cutter", + "sink", + "window", + "refrigerator", + "counter", + "closet", + "bathroom vanity", + "radiator", + "vent", + "kitchen cabinet", + "water cooler", + "doorframe", + "closet door", + "table", + "shower head", + "spray", + "inhaler", + "guitar case", + "plunger", + "toilet paper dispenser", + "adapter", + "soy sauce", + "pipe", + "bottle", + "scale", + "paper towel", + "paper towel roll", + "scissors", + "tape", + "chopsticks", + "case of water bottles", + "hand sanitizer", + "laptop", + "alcohol disinfection", + "keyboard", + "coffee maker", + "light", + "toaster", + "stuffed animal", + "toilet seat cover dispenser", + "ironing board", + "fire extinguisher", + "fruit", + 
"container", + "bag", + "oven", + "body wash", + "bucket", + "cd case", + "tray", + "bowl", + "speaker", + "crate", + "projector", + "book", + "school bag", + "laundry detergent", + "clothes", + "candle", + "basket", + "face wash", + "notebook", + "purse", + "trash bin", + "paper bag", + "package", + "disinfecting wipes", + "recycling bin", + "headphones", + "mouse", + "shower gel", + "dustpan", + "cup", + "vacuum cleaner", + "dish rack", + "coffee kettle", + "plants", + "rag", + "can", + "cushion", + "monitor", + "fan", + "tube", + "box", + "ball", + "bicycle", + "guitar", + "trash can", + "hand sanitizers", + "paper towel dispenser", + "whiteboard", + "bin", + "potted plant", + "tennis", + "soap dish", + "structure", + "calendar", + "dumbbell", + "fish oil", + "ottoman", + "stool", + "hand wash", + "lamp", + "toaster oven", + "music stand", + "water bottle", + "clock", + "charger", + "picture", + "bascketball", + "microwave", + "screwdriver", + "rack", + "apple", + "suitcase", + "ladder", + "ping pong ball", + "dishwasher", + "storage container", + "toilet paper holder", + "coat rack", + "soap dispenser", + "banana", + "toilet paper", + "mug", + "marker pen", + "hat", + "aerosol", + "luggage", + "poster", + "bed", + "cart", + "light switch", + "backpack", + "power strip", + "baseball", + "mustard", + "water pitcher", + "couch", + "beverage", + "toy", + "salt", + "plant", + "pillow", + "broom", + "pepper", + "muffins", + "multivitamin", + "towel", + "storage bin", + "nightstand", + "telephone", + "tissue box", + "hair dryer", + "mirror", + "sign", + "plate", + "tripod", + "chair", + "plastic bag", + "umbrella", + "paper", + "laundry hamper", + "food", + "jacket", + "computer tower", + "keyboard piano", + "person", + "machine", + "projector screen", + "shoe", +) diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index e35a3cbe9..6bd42a1bb 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -1,32 +1,40 @@ -import numpy as np +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
-import cv2 +import math +from typing import List, Optional, Tuple, Union +import clip +import cv2 +import numpy as np import torch import torch.nn.functional as F import torchvision.transforms as T -from torchvision.transforms.functional import InterpolationMode - -import clip - -from stretch.dynav.mapping_utils import VoxelizedPointcloud - -from typing import List, Optional, Tuple, Union +from PIL import Image +from sklearn.cluster import DBSCAN from torch import Tensor - -from transformers import AutoProcessor, AutoModel +from torchvision.transforms.functional import InterpolationMode # from ultralytics import YOLOWorld -from transformers import Owlv2Processor, Owlv2ForObjectDetection +from transformers import AutoModel, AutoProcessor, Owlv2ForObjectDetection, Owlv2Processor -import math -from PIL import Image +from stretch.dynav.mapping_utils import VoxelizedPointcloud -from sklearn.cluster import DBSCAN def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) - fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) inv_intrinsics = torch.zeros_like(intrinsics) inv_intrinsics[..., 0, 0] = 1.0 / fx inv_intrinsics[..., 1, 1] = 1.0 / fy @@ -35,6 +43,7 @@ def get_inv_intrinsics(intrinsics): inv_intrinsics[..., 2, 2] = 1.0 return inv_intrinsics + def get_xyz(depth, pose, intrinsics): """Returns the XYZ coordinates for a set of points. @@ -74,14 +83,23 @@ def get_xyz(depth, pose, intrinsics): xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) xyz = xyz * depth.flatten(1).unsqueeze(-1) xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] - + xyz = xyz.unflatten(1, (height, width)) return xyz -class VoxelMapLocalizer(): - def __init__(self, voxel_map_wrapper = None, exist_model = True, clip_model = None, processor = None, device = 'cuda', siglip = True): - print('Localizer V3') + +class VoxelMapLocalizer: + def __init__( + self, + voxel_map_wrapper=None, + exist_model=True, + clip_model=None, + processor=None, + device="cuda", + siglip=True, + ): + print("Localizer V3") self.voxel_map_wrapper = voxel_map_wrapper self.device = device # self.clip_model, self.preprocessor = clip.load(model_config, device=device) @@ -92,21 +110,27 @@ def __init__(self, voxel_map_wrapper = None, exist_model = True, clip_model = No self.clip_model, self.preprocessor = clip.load("ViT-B/16", device=self.device) self.clip_model.eval() else: - self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to(self.device) + self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to( + self.device + ) self.preprocessor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384") self.clip_model.eval() - self.voxel_pcd = VoxelizedPointcloud(voxel_size = 0.05).to(self.device) + self.voxel_pcd = VoxelizedPointcloud(voxel_size=0.05).to(self.device) # self.exist_model = YOLOWorld("yolov8l-worldv2.pt") self.existence_checking_model = exist_model if exist_model: - print('WE ARE USING OWLV2!') - self.exist_processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") - self.exist_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(self.device) + print("WE ARE USING OWLV2!") + self.exist_processor = AutoProcessor.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ) + 
self.exist_model = Owlv2ForObjectDetection.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ).to(self.device) else: - print('YOU ARE USING NOTHING!') - + print("YOU ARE USING NOTHING!") - def add(self, + def add( + self, points: Tensor, features: Optional[Tensor], rgb: Optional[Tensor], @@ -123,15 +147,13 @@ def add(self, weights = weights.to(self.device) # if weight_decay is not None and self.voxel_pcd._weights is not None: # self.voxel_pcd._weights *= weight_decay - self.voxel_pcd.add(points = points, - features = features, - rgb = rgb, - weights = weights, - obs_count = obs_count) + self.voxel_pcd.add( + points=points, features=features, rgb=rgb, weights=weights, obs_count=obs_count + ) def calculate_clip_and_st_embeddings_for_queries(self, queries): if isinstance(queries, str): - queries = [queries] + queries = [queries] if self.siglip: inputs = self.preprocessor(text=queries, padding="max_length", return_tensors="pt") for input in inputs: @@ -144,7 +166,7 @@ def calculate_clip_and_st_embeddings_for_queries(self, queries): # all_clip_tokens = self.clip_model.encode_text(text) all_clip_tokens = F.normalize(all_clip_tokens, p=2, dim=-1) return all_clip_tokens - + def find_alignment_over_model(self, queries): clip_text_tokens = self.calculate_clip_and_st_embeddings_for_queries(queries) points, features, weights, _ = self.voxel_pcd.get_pointcloud() @@ -152,21 +174,21 @@ def find_alignment_over_model(self, queries): return None features = F.normalize(features, p=2, dim=-1) point_alignments = clip_text_tokens.float() @ features.float().T - + # print(point_alignments.shape) return point_alignments def find_alignment_for_A(self, A): points, features, _, _ = self.voxel_pcd.get_pointcloud() alignments = self.find_alignment_over_model(A).cpu() - return points[alignments.argmax(dim = -1)].detach().cpu() - + return points[alignments.argmax(dim=-1)].detach().cpu() + def find_obs_id_for_A(self, A): obs_counts = self.voxel_pcd._obs_counts alignments = self.find_alignment_over_model(A).cpu() - return obs_counts[alignments.argmax(dim = -1)].detach().cpu() + return obs_counts[alignments.argmax(dim=-1)].detach().cpu() - def compute_coord(self, text, obs_id, threshold = 0.2): + def compute_coord(self, text, obs_id, threshold=0.2): # print(obs_id, len(self.voxel_map_wrapper.observations)) if obs_id <= 0 or obs_id > len(self.voxel_map_wrapper.observations): return None @@ -177,13 +199,15 @@ def compute_coord(self, text, obs_id, threshold = 0.2): xyzs = get_xyz(depth, pose, K)[0] # rgb[depth >= 2.5] = 0 rgb = rgb.permute(2, 0, 1).to(torch.uint8) - inputs = self.exist_processor(text=[['a photo of a ' + text]], images=rgb, return_tensors="pt") + inputs = self.exist_processor( + text=[["a photo of a " + text]], images=rgb, return_tensors="pt" + ) for input in inputs: - inputs[input] = inputs[input].to('cuda') - + inputs[input] = inputs[input].to("cuda") + with torch.no_grad(): outputs = self.exist_model(**inputs) - + target_sizes = torch.Tensor([rgb.size()[-2:]]).to(self.device) results = self.exist_processor.image_processor.post_process_object_detection( outputs, threshold=threshold, target_sizes=target_sizes @@ -194,30 +218,37 @@ def compute_coord(self, text, obs_id, threshold = 0.2): # w, h = depth.shape # tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) # return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values - for idx, (score, bbox) in enumerate(sorted(zip(results['scores'], results['boxes']), 
key=lambda x: x[0], reverse=True)): - + for idx, (score, bbox) in enumerate( + sorted(zip(results["scores"], results["boxes"]), key=lambda x: x[0], reverse=True) + ): + tl_x, tl_y, br_x, br_y = bbox w, h = depth.shape - tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) - - if torch.median(depth[tl_y: br_y, tl_x: br_x].reshape(-1)) < 3: - return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values + tl_x, tl_y, br_x, br_y = ( + int(max(0, tl_x.item())), + int(max(0, tl_y.item())), + int(min(h, br_x.item())), + int(min(w, br_y.item())), + ) + + if torch.median(depth[tl_y:br_y, tl_x:br_x].reshape(-1)) < 3: + return torch.median(xyzs[tl_y:br_y, tl_x:br_x].reshape(-1, 3), dim=0).values return None - def verify_point(self, A, point, distance_threshold = 0.1, similarity_threshold = 0.14): + def verify_point(self, A, point, distance_threshold=0.1, similarity_threshold=0.14): if isinstance(point, np.ndarray): point = torch.from_numpy(point) points, _, _, _ = self.voxel_pcd.get_pointcloud() - distances = torch.linalg.norm(point - points.detach().cpu(), dim = -1) + distances = torch.linalg.norm(point - points.detach().cpu(), dim=-1) if torch.min(distances) > distance_threshold: - print('Points are so far from other points!') + print("Points are so far from other points!") return False alignments = self.find_alignment_over_model(A).detach().cpu()[0] if torch.max(alignments[distances <= distance_threshold]) < similarity_threshold: - print('Points close the the point are not similar to the text!') - return torch.max(alignments[distances < distance_threshold]) >= similarity_threshold + print("Points close the the point are not similar to the text!") + return torch.max(alignments[distances < distance_threshold]) >= similarity_threshold - def localize_A(self, A, debug = True, return_debug = False): + def localize_A(self, A, debug=True, return_debug=False): # centroids, extends, similarity_max_list, points, obs_max_list, debug_text = self.find_clusters_for_A(A, return_obs_counts = True, debug = debug) # if len(centroids) == 0: # if not debug: @@ -244,7 +275,7 @@ def localize_A(self, A, debug = True, return_debug = False): # debug_text += '#### - Instance ' + str(idx + 1) + ' has high cosine similarity (' + str(round(similarity, 3)) + '). **😃** Directly navigate to it.\n' # break - + # debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' # if target_point is None: @@ -259,17 +290,19 @@ def localize_A(self, A, debug = True, return_debug = False): alignments = self.find_alignment_over_model(A).cpu() # alignments = alignments[0, points[:, -1] >= 0.1] # points = points[points[:, -1] >= 0.1] - point = points[alignments.argmax(dim = -1)].detach().cpu().squeeze() + point = points[alignments.argmax(dim=-1)].detach().cpu().squeeze() obs_counts = self.voxel_pcd._obs_counts - image_id = obs_counts[alignments.argmax(dim = -1)].detach().cpu() - debug_text = '' + image_id = obs_counts[alignments.argmax(dim=-1)].detach().cpu() + debug_text = "" target_point = None res = self.compute_coord(A, image_id) # res = None if res is not None: target_point = res - debug_text += '#### - Obejct is detected in observations . **😃** Directly navigate to it.\n' + debug_text += ( + "#### - Obejct is detected in observations . **😃** Directly navigate to it.\n" + ) else: # debug_text += '#### - Directly ignore this instance is the target. 
**😞** \n' if self.siglip: @@ -279,9 +312,11 @@ def localize_A(self, A, debug = True, return_debug = False): if cosine_similarity_check: target_point = point - debug_text += '#### - The point has high cosine similarity. **😃** Directly navigate to it.\n' + debug_text += ( + "#### - The point has high cosine similarity. **😃** Directly navigate to it.\n" + ) else: - debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' + debug_text += "#### - Cannot verify whether this instance is the target. **😞** \n" # print('target_point', target_point) if not debug: return target_point @@ -290,9 +325,9 @@ def localize_A(self, A, debug = True, return_debug = False): else: return target_point, debug_text, image_id, point - def find_clusters_for_A(self, A, return_obs_counts = False, debug = False, turning_point = None): + def find_clusters_for_A(self, A, return_obs_counts=False, debug=False, turning_point=None): - debug_text = '' + debug_text = "" points, features, _, _ = self.voxel_pcd.get_pointcloud() alignments = self.find_alignment_over_model(A).cpu().reshape(-1).detach().numpy() @@ -306,7 +341,9 @@ def find_clusters_for_A(self, A, return_obs_counts = False, debug = False, turni points = points[mask] if len(points) == 0: - debug_text += '### - No instance found! Maybe target object has not been observed yet. **😭**\n' + debug_text += ( + "### - No instance found! Maybe target object has not been observed yet. **😭**\n" + ) output = [[], [], [], []] if return_obs_counts: @@ -318,20 +355,26 @@ def find_clusters_for_A(self, A, return_obs_counts = False, debug = False, turni else: if return_obs_counts: obs_ids = self.voxel_pcd._obs_counts.detach().cpu().numpy()[mask] - centroids, extends, similarity_max_list, points, obs_max_list = find_clusters(points.detach().cpu().numpy(), alignments, obs = obs_ids) + centroids, extends, similarity_max_list, points, obs_max_list = find_clusters( + points.detach().cpu().numpy(), alignments, obs=obs_ids + ) output = [centroids, extends, similarity_max_list, points, obs_max_list] else: - centroids, extends, similarity_max_list, points = find_clusters(points.detach().cpu().numpy(), alignments, obs = None) + centroids, extends, similarity_max_list, points = find_clusters( + points.detach().cpu().numpy(), alignments, obs=None + ) output = [centroids, extends, similarity_max_list, points] - debug_text += '### - Found ' + str(len(centroids)) + ' instances that might be target object.\n' + debug_text += ( + "### - Found " + str(len(centroids)) + " instances that might be target object.\n" + ) if debug: output.append(debug_text) - + return output -def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs = None): +def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs=None): # Calculate the number of top values directly top_positions = vertices # top_values = probability_over_all_points[top_indices].flatten() @@ -347,7 +390,7 @@ def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs = None): similarity_list = [] points = [] obs_max_list = [] - + for cluster_id in set(labels): if cluster_id == -1: # Ignore noise continue @@ -377,4 +420,4 @@ def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs = None): if obs is not None: return centroids, extends, similarity_list, points, obs_max_list else: - return centroids, extends, similarity_list, points \ No newline at end of file + return centroids, extends, similarity_list, points diff --git a/src/stretch/dynav/voxel_map_server.py 
b/src/stretch/dynav/voxel_map_server.py index 6ce7dbb1b..5437a0d5f 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -1,59 +1,77 @@ -import zmq +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. -from stretch.dynav.scannet import CLASS_LABELS_200 +import datetime +import os +import pickle +import threading + +# import wget +import time +from io import BytesIO +from pathlib import Path +import clip import cv2 +import hydra import numpy as np +import open3d as o3d +import rerun as rr +import scipy import torch + +# from stretch.utils.morphology import get_edges import torch.nn.functional as F import torchvision.transforms.functional as V -# from segment_anything import sam_model_registry, SamPredictor -# from transformers import AutoProcessor, OwlViTForObjectDetection -from ultralytics import YOLO, SAM, YOLOWorld +import wget +import zmq +from matplotlib import pyplot as plt +from PIL import Image from sam2.build_sam import build_sam2 from sam2.sam2_image_predictor import SAM2ImagePredictor - -import clip from torchvision import transforms +from transformers import AutoModel, AutoProcessor -import os -# import wget -import time - -import open3d as o3d +# from segment_anything import sam_model_registry, SamPredictor +# from transformers import AutoProcessor, OwlViTForObjectDetection +from ultralytics import SAM, YOLO, YOLOWorld -from matplotlib import pyplot as plt -import pickle -from pathlib import Path -import wget -from stretch.dynav.voxel_map_localizer import VoxelMapLocalizer from stretch.core import get_parameters +from stretch.dynav.communication_util import ( + load_socket, + recv_array, + recv_depth_img, + recv_everything, + recv_rgb_img, + send_array, + send_depth_img, + send_everything, + send_rgb_img, +) from stretch.dynav.mapping_utils import ( - SparseVoxelMap, - SparseVoxelMapNavigationSpace, AStar, - VoxelizedPointcloud + SparseVoxelMap, + SparseVoxelMapNavigationSpace, + VoxelizedPointcloud, ) +from stretch.dynav.scannet import CLASS_LABELS_200 +from stretch.dynav.voxel_map_localizer import VoxelMapLocalizer -import datetime - -import threading -import scipy -import hydra - -from transformers import AutoProcessor, AutoModel -import rerun as rr - -from io import BytesIO -from PIL import Image - -from stretch.dynav.communication_util import load_socket, send_array, recv_array, send_rgb_img, recv_rgb_img, send_depth_img, recv_depth_img, send_everything, recv_everything -# from stretch.utils.morphology import get_edges -import torch.nn.functional as F def get_inv_intrinsics(intrinsics): # return intrinsics.double().inverse().to(intrinsics) - fx, fy, ppx, ppy = intrinsics[..., 0, 0], intrinsics[..., 1, 1], intrinsics[..., 0, 2], intrinsics[..., 1, 2] + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) inv_intrinsics = torch.zeros_like(intrinsics) inv_intrinsics[..., 0, 0] = 1.0 / fx inv_intrinsics[..., 1, 1] = 1.0 / fy @@ -62,6 +80,7 @@ def get_inv_intrinsics(intrinsics): inv_intrinsics[..., 2, 2] = 1.0 return inv_intrinsics + def get_xyz(depth, pose, intrinsics): """Returns the XYZ coordinates for a set of points. 
@@ -101,40 +120,42 @@ def get_xyz(depth, pose, intrinsics): xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) xyz = xyz * depth.flatten(1).unsqueeze(-1) xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] - + xyz = xyz.unflatten(1, (height, width)) return xyz + class ImageProcessor: - def __init__(self, - vision_method = 'mask*lip', - siglip = True, - device = 'cuda', - min_depth = 0.25, - max_depth = 2.5, - img_port = 5558, - text_port = 5556, - open_communication = True, - rerun = True, - static = True, - log = None, - image_shape = (450, 350) + def __init__( + self, + vision_method="mask*lip", + siglip=True, + device="cuda", + min_depth=0.25, + max_depth=2.5, + img_port=5558, + text_port=5556, + open_communication=True, + rerun=True, + static=True, + log=None, + image_shape=(450, 350), ): self.static = static self.siglip = siglip current_datetime = datetime.datetime.now() if log is None: - self.log = 'debug_' + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + self.log = "debug_" + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") else: self.log = log self.rerun = rerun if self.rerun: if self.static: - rr.init(self.log, spawn = False) - rr.connect('100.108.67.79:9876') + rr.init(self.log, spawn=False) + rr.connect("100.108.67.79:9876") else: - rr.init(self.log, spawn = True) + rr.init(self.log, spawn=True) self.min_depth = min_depth self.max_depth = max_depth self.obs_count = 0 @@ -144,17 +165,19 @@ def __init__(self, self.vision_method = vision_method # If cuda is not available, then device will be forced to be cpu if not torch.cuda.is_available(): - device = 'cpu' + device = "cpu" self.device = device self.create_obstacle_map() self.create_vision_model() - self.voxel_map_lock = threading.Lock() # Create a lock for synchronizing access to `self.voxel_map_localizer` + self.voxel_map_lock = ( + threading.Lock() + ) # Create a lock for synchronizing access to `self.voxel_map_localizer` self.traj = None self.image_shape = image_shape - + if open_communication: self.img_socket = load_socket(img_port) self.text_socket = load_socket(text_port) @@ -162,7 +185,7 @@ def __init__(self, self.img_thread = threading.Thread(target=self._recv_image) self.img_thread.daemon = True self.img_thread.start() - + def create_obstacle_map(self): print("- Load parameters") parameters = get_parameters("dynav_config.yaml") @@ -172,29 +195,21 @@ def create_obstacle_map(self): local_radius=parameters["local_radius"], obs_min_height=parameters["obs_min_height"], obs_max_height=parameters["obs_max_height"], - obs_min_density = parameters["obs_min_density"], - exp_min_density = parameters["exp_min_density"], + obs_min_density=parameters["obs_min_density"], + exp_min_density=parameters["exp_min_density"], min_depth=self.min_depth, max_depth=self.max_depth, pad_obstacles=parameters["pad_obstacles"], - add_local_radius_points=parameters.get( - "add_local_radius_points", default=True - ), + add_local_radius_points=parameters.get("add_local_radius_points", default=True), remove_visited_from_obstacles=parameters.get( "remove_visited_from_obstacles", default=False ), smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), use_median_filter=parameters.get("filters/use_median_filter", False), median_filter_size=parameters.get("filters/median_filter_size", 5), - median_filter_max_error=parameters.get( - "filters/median_filter_max_error", 0.01 - ), - use_derivative_filter=parameters.get( - "filters/use_derivative_filter", False - ), - 
derivative_filter_threshold=parameters.get( - "filters/derivative_filter_threshold", 0.5 - ) + median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), + use_derivative_filter=parameters.get("filters/use_derivative_filter", False), + derivative_filter_threshold=parameters.get("filters/derivative_filter_threshold", 0.5), ) self.space = SparseVoxelMapNavigationSpace( self.voxel_map, @@ -215,64 +230,98 @@ def create_vision_model(self): self.clip_model, self.clip_preprocess = clip.load("ViT-B/16", device=self.device) self.clip_model.eval() else: - self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to(self.device) + self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to( + self.device + ) self.clip_preprocess = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384") self.clip_model.eval() - if self.vision_method != 'mask*lip': + if self.vision_method != "mask*lip": sam_checkpoint = f"./sam2_hiera_small.pt" sam_config = "sam2_hiera_s.yaml" if not os.path.exists(sam_checkpoint): - wget.download('https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt', out = sam_checkpoint) - sam2_model = build_sam2(sam_config, sam_checkpoint, device=self.device, apply_postprocessing=False) + wget.download( + "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt", + out=sam_checkpoint, + ) + sam2_model = build_sam2( + sam_config, sam_checkpoint, device=self.device, apply_postprocessing=False + ) self.mask_predictor = SAM2ImagePredictor(sam2_model) - self.yolo_model = YOLOWorld('yolov8s-worldv2.pt') + self.yolo_model = YOLOWorld("yolov8s-worldv2.pt") self.texts = CLASS_LABELS_200 self.yolo_model.set_classes(self.texts) - self.voxel_map_localizer = VoxelMapLocalizer(self.voxel_map, clip_model = self.clip_model, processor = self.clip_preprocess, device = self.device, siglip = self.siglip) + self.voxel_map_localizer = VoxelMapLocalizer( + self.voxel_map, + clip_model=self.clip_model, + processor=self.clip_preprocess, + device=self.device, + siglip=self.siglip, + ) def process_text(self, text, start_pose): if self.rerun: - rr.log('/object', rr.Clear(recursive = True), static = self.static) - rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) - rr.log('/direction', rr.Clear(recursive = True), static = self.static) - rr.log('robot_monologue', rr.Clear(recursive = True), static = self.static) - rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) + rr.log("/object", rr.Clear(recursive=True), static=self.static) + rr.log("/robot_start_pose", rr.Clear(recursive=True), static=self.static) + rr.log("/direction", rr.Clear(recursive=True), static=self.static) + rr.log("robot_monologue", rr.Clear(recursive=True), static=self.static) + rr.log( + "/Past_observation_most_similar_to_text", + rr.Clear(recursive=True), + static=self.static, + ) if not self.static: - rr.connect('100.108.67.79:9876') - debug_text = '' - mode = 'navigation' + rr.connect("100.108.67.79:9876") + debug_text = "" + mode = "navigation" obs = None localized_point = None waypoints = None - - if text is not None and text != '' and self.traj is not None: - print('saved traj', self.traj) + + if text is not None and text != "" and self.traj is not None: + print("saved traj", self.traj) traj_target_point = self.traj[-1] if self.voxel_map_localizer.verify_point(text, traj_target_point): localized_point = traj_target_point - debug_text += '## Last visual 
grounding results looks fine so directly use it.\n' + debug_text += "## Last visual grounding results looks fine so directly use it.\n" # if self.planner.verify_path(self.traj[:-2]): # waypoints = self.traj[:-2] # debug_text += '## Last path planning results looks fine so directly use it.\n' if waypoints is None: # Do visual grounding - if text is not None and text != '' and localized_point is None: + if text is not None and text != "" and localized_point is None: with self.voxel_map_lock: - localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) + ( + localized_point, + debug_text, + obs, + pointcloud, + ) = self.voxel_map_localizer.localize_A(text, debug=True, return_debug=True) if localized_point is not None: - rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) + rr.log( + "/object", + rr.Points3D( + [localized_point[0], localized_point[1], 1.5], + colors=torch.Tensor([1, 0, 0]), + radii=0.1, + ), + static=self.static, + ) # Do Frontier based exploration - if text is None or text == '' or localized_point is None: - debug_text += '## Navigation fails, so robot starts exploring environments.\n' + if text is None or text == "" or localized_point is None: + debug_text += "## Navigation fails, so robot starts exploring environments.\n" localized_point = self.sample_frontier(start_pose, text) - mode = 'exploration' - rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) - print('\n', localized_point, '\n') + mode = "exploration" + rr.log( + "/object", + rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), + static=self.static, + ) + print("\n", localized_point, "\n") if localized_point is None: return [] - + if len(localized_point) == 2: localized_point = np.array([localized_point[0], localized_point[1], 0]) @@ -280,44 +329,46 @@ def process_text(self, text, start_pose): if self.rerun: buf = BytesIO() - plt.savefig(buf, format='png') + plt.savefig(buf, format="png") buf.seek(0) img = Image.open(buf) img = np.array(img) buf.close() - rr.log('2d_map', rr.Image(img), static = self.static) + rr.log("2d_map", rr.Image(img), static=self.static) else: - if text != '': - plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') + if text != "": + plt.savefig(self.log + "/debug_" + text + str(self.obs_count) + ".png") else: - plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') + plt.savefig(self.log + "/debug_exploration" + str(self.obs_count) + ".png") plt.clf() - if obs is not None and mode == 'navigation': + if obs is not None and mode == "navigation": rgb = self.voxel_map.observations[obs - 1].rgb if not self.rerun: if isinstance(rgb, torch.Tensor): rgb = np.array(rgb) - cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) + cv2.imwrite(self.log + "/debug_" + text + ".png", rgb[:, :, [2, 1, 0]]) else: - rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) + rr.log( + "/Past_observation_most_similar_to_text", rr.Image(rgb), static=self.static + ) waypoints = None if point is None: - print('Unable to find any target point, some exception might happen') + print("Unable to find any target point, some exception might happen") else: - print('Target point is', point) + print("Target point is", point) res = self.planner.plan(start_pose, point) if res.success: waypoints = [pt.state for pt in 
res.trajectory] else: waypoints = None - print('[FAILURE]', res.reason) - # If we are navigating to some object of interst, send (x, y, z) of + print("[FAILURE]", res.reason) + # If we are navigating to some object of interst, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation traj = [] if waypoints is not None: - finished = len(waypoints) <= 10 and mode == 'navigation' + finished = len(waypoints) <= 10 and mode == "navigation" if finished: self.traj = None else: @@ -330,31 +381,47 @@ def process_text(self, text, start_pose): if isinstance(localized_point, torch.Tensor): localized_point = localized_point.tolist() traj.append(localized_point) - print('Planned trajectory:', traj) + print("Planned trajectory:", traj) if self.rerun: - if text is not None and text != '': - debug_text = '### The goal is to navigate to ' + text + '.\n' + debug_text + if text is not None and text != "": + debug_text = "### The goal is to navigate to " + text + ".\n" + debug_text else: - debug_text = '### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n' - debug_text = '# Robot\'s monologue: \n' + debug_text - rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) - + debug_text = "### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n" + debug_text = "# Robot's monologue: \n" + debug_text + rr.log( + "robot_monologue", + rr.TextDocument(debug_text, media_type=rr.MediaType.MARKDOWN), + static=self.static, + ) + if traj is not None: origins = [] vectors = [] for idx in range(len(traj)): if idx != len(traj) - 1: origins.append([traj[idx][0], traj[idx][1], 1.5]) - vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) - rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.1), static = self.static) - rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) + vectors.append( + [traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0] + ) + rr.log( + "/direction", + rr.Arrows3D( + origins=origins, vectors=vectors, colors=torch.Tensor([0, 1, 0]), radii=0.1 + ), + static=self.static, + ) + rr.log( + "/robot_start_pose", + rr.Points3D( + [start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1 + ), + static=self.static, + ) # self.write_to_pickle() return traj - - # def process_text(self, text, start_pose): # if self.rerun: # # if not self.static: @@ -366,7 +433,7 @@ def process_text(self, text, start_pose): # rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = self.static) # if not self.static: # rr.connect('100.108.67.79:9876') - + # debug_text = '' # mode = 'navigation' # obs = None @@ -386,7 +453,7 @@ def process_text(self, text, start_pose): # if localized_point is None: # return [] - + # if len(localized_point) == 2: # localized_point = np.array([localized_point[0], localized_point[1], 0]) @@ -435,7 +502,7 @@ def process_text(self, text, start_pose): # res = self.planner.plan(start_pose, point) # if res.success: # waypoints = [pt.state for pt in res.trajectory] - # # If we are navigating to some object of interst, send (x, y, z) of + # # If we are navigating to some object of interst, send (x, y, z) of # # the 
object so that we can make sure the robot looks at the object after navigation # print(waypoints) # # finished = (len(waypoints) <= 4 or torch.linalg.norm(torch.Tensor(point)[:2] - torch.Tensor(start_pose[:2])) > 0.8) and mode == 'navigation' @@ -453,7 +520,7 @@ def process_text(self, text, start_pose): # print('Planned trajectory:', traj) # else: # print('[FAILURE]', res.reason) - + # if traj is not None: # origins = [] # vectors = [] @@ -463,11 +530,11 @@ def process_text(self, text, start_pose): # vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) # rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) # rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) - + # self.write_to_pickle() # return traj - def sample_navigation(self, start, point, mode = 'navigation'): + def sample_navigation(self, start, point, mode="navigation"): # plt.clf() obstacles, _ = self.voxel_map.get_2d_map() plt.imshow(obstacles) @@ -475,50 +542,60 @@ def sample_navigation(self, start, point, mode = 'navigation'): start_pt = self.planner.to_pt(start) # plt.scatter(start_pt[1], start_pt[0], s = 10) return None - goal = self.space.sample_target_point(start, point, self.planner, exploration = mode != 'navigation') + goal = self.space.sample_target_point( + start, point, self.planner, exploration=mode != "navigation" + ) print("point:", point, "goal:", goal) obstacles, explored = self.voxel_map.get_2d_map() start_pt = self.planner.to_pt(start) - plt.scatter(start_pt[1], start_pt[0], s = 15, c = 'b') + plt.scatter(start_pt[1], start_pt[0], s=15, c="b") point_pt = self.planner.to_pt(point) - plt.scatter(point_pt[1], point_pt[0], s = 15, c = 'r') + plt.scatter(point_pt[1], point_pt[0], s=15, c="r") if goal is not None: goal_pt = self.planner.to_pt(goal) - plt.scatter(goal_pt[1], goal_pt[0], s = 10, c = 'g') + plt.scatter(goal_pt[1], goal_pt[0], s=10, c="g") return goal - def sample_frontier(self, start_pose = [0, 0, 0], text = None): + def sample_frontier(self, start_pose=[0, 0, 0], text=None): with self.voxel_map_lock: - if text is not None and text != '': - index, time_heuristics, alignments_heuristics, total_heuristics = self.space.sample_exploration(start_pose, self.planner, self.voxel_map_localizer, text, debug = False) + if text is not None and text != "": + ( + index, + time_heuristics, + alignments_heuristics, + total_heuristics, + ) = self.space.sample_exploration( + start_pose, self.planner, self.voxel_map_localizer, text, debug=False + ) else: - index, time_heuristics, _, total_heuristics = self.space.sample_exploration(start_pose, self.planner, None, None, debug = False) + index, time_heuristics, _, total_heuristics = self.space.sample_exploration( + start_pose, self.planner, None, None, debug=False + ) alignments_heuristics = time_heuristics - + obstacles, explored = self.voxel_map.get_2d_map() # plt.clf() # plt.imshow(obstacles * 0.5 + explored * 0.5) # plt.scatter(index[1], index[0], s = 20, c = 'r') return self.voxel_map.grid_coords_to_xyt(torch.tensor([index[0], index[1]])) - def _recv_image(self): while True: # data = recv_array(self.img_socket) rgb, depth, intrinsics, pose = recv_everything(self.img_socket) - print('Image received') + print("Image received") start_time = time.time() self.process_rgbd_images(rgb, depth, intrinsics, pose) process_time = time.time() - start_time - print('Image 
processing takes', process_time, 'seconds') - print('processing took ' + str(process_time) + ' seconds') + print("Image processing takes", process_time, "seconds") + print("processing took " + str(process_time) + " seconds") def forward_one_block(self, resblocks, x): q, k, v = None, None, None y = resblocks.ln_1(x) y = F.linear(y, resblocks.attn.in_proj_weight, resblocks.attn.in_proj_bias) N, L, C = y.shape - y = y.view(N, L, 3, C//3).permute(2, 0, 1, 3).reshape(3*N, L, C//3) + y = y.view(N, L, 3, C // 3).permute(2, 0, 1, 3).reshape(3 * N, L, C // 3) y = F.linear(y, resblocks.attn.out_proj.weight, resblocks.attn.out_proj.bias) q, k, v = y.tensor_split(3, dim=0) v += x @@ -530,7 +607,7 @@ def forward_one_block_siglip(self, resblocks, x): q, k, v = None, None, None x = F.linear(x, resblocks.in_proj_weight, resblocks.in_proj_bias) N, L, C = x.shape - x = x.view(N, L, 3, C//3).permute(2, 0, 1, 3).reshape(3*N, L, C//3) + x = x.view(N, L, 3, C // 3).permute(2, 0, 1, 3).reshape(3 * N, L, C // 3) x = F.linear(x, resblocks.out_proj.weight, resblocks.out_proj.bias) q, k, v = x.tensor_split(3, dim=0) @@ -539,14 +616,16 @@ def forward_one_block_siglip(self, resblocks, x): def extract_mask_clip_features(self, x, image_shape): if self.siglip: with torch.no_grad(): - output = self.clip_model.vision_model(x['pixel_values'], output_hidden_states = True) + output = self.clip_model.vision_model(x["pixel_values"], output_hidden_states=True) feat = output.last_hidden_state feat = self.forward_one_block_siglip(self.clip_model.vision_model.head.attention, feat) feat = self.clip_model.vision_model.head.layernorm(feat) feat = feat + self.clip_model.vision_model.head.mlp(feat) feat = feat.detach().cpu() with torch.no_grad(): - N, L, H, W = self.clip_model.vision_model.embeddings.patch_embedding(x['pixel_values']).shape + N, L, H, W = self.clip_model.vision_model.embeddings.patch_embedding( + x["pixel_values"] + ).shape feat = feat.reshape(N, H, W, L).permute(0, 3, 1, 2) else: with torch.no_grad(): @@ -554,7 +633,14 @@ def extract_mask_clip_features(self, x, image_shape): N, L, H, W = x.shape x = x.reshape(x.shape[0], x.shape[1], -1) x = x.permute(0, 2, 1) - x = torch.cat([self.clip_model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) + x = torch.cat( + [ + self.clip_model.visual.class_embedding.to(x.dtype) + + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), + x, + ], + dim=1, + ) x = x + self.clip_model.visual.positional_embedding.to(x.dtype) x = self.clip_model.visual.ln_pre(x) x = x.permute(1, 0, 2) @@ -568,37 +654,56 @@ def extract_mask_clip_features(self, x, image_shape): x = self.clip_model.visual.ln_post(x) x = x @ self.clip_model.visual.proj feat = x.reshape(N, H, W, -1).permute(0, 3, 1, 2) - feat = F.interpolate(feat, image_shape, mode = 'bilinear', align_corners = True) - feat = F.normalize(feat, dim = 1) + feat = F.interpolate(feat, image_shape, mode="bilinear", align_corners=True) + feat = F.normalize(feat, dim=1) return feat.permute(0, 2, 3, 1) - + def run_mask_clip(self, rgb, mask, world_xyz): with torch.no_grad(): - results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False) + results = self.yolo_model.predict( + rgb.permute(1, 2, 0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False + ) xyxy_tensor = results[0].boxes.xyxy if len(xyxy_tensor) == 0: return - bounding_boxes = torch.stack(sorted(xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), 
reverse = True), dim = 0) + bounding_boxes = torch.stack( + sorted( + xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse=True + ), + dim=0, + ) bbox_mask = torch.zeros_like(mask) for box in bounding_boxes: tl_x, tl_y, br_x, br_y = box - bbox_mask[max(int(tl_y), 0): min(int(br_y), rgb.shape[1]), max(int(tl_x), 0): min(int(br_x), rgb.shape[2])] = 1 + bbox_mask[ + max(int(tl_y), 0) : min(int(br_y), rgb.shape[1]), + max(int(tl_x), 0) : min(int(br_x), rgb.shape[2]), + ] = 1 bbox_mask = bbox_mask.bool() # print(mask, bbox_mask) mask = torch.logical_or(mask, ~bbox_mask) - + if not self.siglip: - if self.device == 'cpu': - input = self.clip_preprocess(transforms.ToPILImage()(rgb)).unsqueeze(0).to(self.device) + if self.device == "cpu": + input = ( + self.clip_preprocess(transforms.ToPILImage()(rgb)) + .unsqueeze(0) + .to(self.device) + ) else: - input = self.clip_preprocess(transforms.ToPILImage()(rgb)).unsqueeze(0).to(self.device).half() + input = ( + self.clip_preprocess(transforms.ToPILImage()(rgb)) + .unsqueeze(0) + .to(self.device) + .half() + ) else: input = self.clip_preprocess(images=rgb, padding="max_length", return_tensors="pt") for i in input: input[i] = input[i].to(self.device) features = self.extract_mask_clip_features(input, rgb.shape[-2:])[0].cpu() - + valid_xyz = world_xyz[~mask] features = features[~mask] valid_rgb = rgb.permute(1, 2, 0)[~mask] @@ -607,28 +712,35 @@ def run_mask_clip(self, rgb, mask, world_xyz): def run_owl_sam_clip(self, rgb, mask, world_xyz): with torch.no_grad(): - results = self.yolo_model.predict(rgb.permute(1,2,0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False) + results = self.yolo_model.predict( + rgb.permute(1, 2, 0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False + ) xyxy_tensor = results[0].boxes.xyxy if len(xyxy_tensor) == 0: return - self.mask_predictor.set_image(rgb.permute(1,2,0).numpy()) + self.mask_predictor.set_image(rgb.permute(1, 2, 0).numpy()) # bounding_boxes = torch.stack(sorted(results[0]['boxes'], key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse = True), dim = 0) - bounding_boxes = torch.stack(sorted(xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse = True), dim = 0) + bounding_boxes = torch.stack( + sorted( + xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse=True + ), + dim=0, + ) # transformed_boxes = self.mask_predictor.transform.apply_boxes_torch(bounding_boxes.detach().to(self.device), rgb.shape[-2:]) # masks, _, _= self.mask_predictor.predict_torch( - masks, _, _= self.mask_predictor.predict( + masks, _, _ = self.mask_predictor.predict( point_coords=None, point_labels=None, # boxes=transformed_boxes, - box = bounding_boxes, - multimask_output=False + box=bounding_boxes, + multimask_output=False, ) if masks.ndim == 3: masks = torch.Tensor(masks).bool() else: masks = torch.Tensor(masks[:, 0, :, :]).bool() - + # Debug code, visualize all bounding boxes and segmentation masks image_vis = np.array(rgb.permute(1, 2, 0)) @@ -637,8 +749,10 @@ def run_owl_sam_clip(self, rgb, mask, world_xyz): for idx, box in enumerate(bounding_boxes): tl_x, tl_y, br_x, br_y = box tl_x, tl_y, br_x, br_y = tl_x.item(), tl_y.item(), br_x.item(), br_y.item() - cv2.rectangle(image_vis, (int(tl_x), int(tl_y)), (int(br_x), int(br_y)), (255, 0, 0), 2) - image_vis = cv2.cvtColor(image_vis, cv2.COLOR_RGB2BGR) + cv2.rectangle( + image_vis, (int(tl_x), int(tl_y)), (int(br_x), int(br_y)), (255, 0, 0), 2 + ) + image_vis = cv2.cvtColor(image_vis, cv2.COLOR_RGB2BGR) for vis_mask 
in masks: segmentation_color_map[vis_mask.detach().cpu().numpy()] = [0, 255, 0] image_vis = cv2.addWeighted(image_vis, 0.7, segmentation_color_map, 0.3, 0) @@ -646,21 +760,38 @@ def run_owl_sam_clip(self, rgb, mask, world_xyz): cv2.imwrite(self.log + "/seg" + str(self.obs_count) + ".jpg", image_vis) # else: # rr.log('Segmentation mask', rr.Image(image_vis[:, :, [2, 1, 0]])) - + crops = [] if not self.siglip: for (box, mask) in zip(bounding_boxes, masks): tl_x, tl_y, br_x, br_y = box - crops.append(self.clip_preprocess(transforms.ToPILImage()(rgb[:, max(int(tl_y), 0): min(int(br_y), rgb.shape[1]), max(int(tl_x), 0): min(int(br_x), rgb.shape[2])]))) - features = self.clip_model.encode_image(torch.stack(crops, dim = 0).to(self.device)) + crops.append( + self.clip_preprocess( + transforms.ToPILImage()( + rgb[ + :, + max(int(tl_y), 0) : min(int(br_y), rgb.shape[1]), + max(int(tl_x), 0) : min(int(br_x), rgb.shape[2]), + ] + ) + ) + ) + features = self.clip_model.encode_image(torch.stack(crops, dim=0).to(self.device)) else: for box in bounding_boxes: tl_x, tl_y, br_x, br_y = box - crops.append(rgb[:, max(int(tl_y), 0): min(int(br_y), rgb.shape[1]), max(int(tl_x), 0): min(int(br_x), rgb.shape[2])]) - inputs = self.clip_preprocess(images = crops, padding="max_length", return_tensors="pt").to(self.device) + crops.append( + rgb[ + :, + max(int(tl_y), 0) : min(int(br_y), rgb.shape[1]), + max(int(tl_x), 0) : min(int(br_x), rgb.shape[2]), + ] + ) + inputs = self.clip_preprocess( + images=crops, padding="max_length", return_tensors="pt" + ).to(self.device) features = self.clip_model.get_image_features(**inputs) - features = F.normalize(features, dim = -1).cpu() - + features = F.normalize(features, dim=-1).cpu() for idx, (sam_mask, feature) in enumerate(zip(masks.cpu(), features.cpu())): valid_mask = torch.logical_and(~mask, sam_mask) @@ -670,10 +801,10 @@ def run_owl_sam_clip(self, rgb, mask, world_xyz): feature = feature.repeat(valid_xyz.shape[0], 1) valid_rgb = rgb.permute(1, 2, 0)[valid_mask] self.add_to_voxel_pcd(valid_xyz, feature, valid_rgb) - - def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights = None, threshold = 0.95): + + def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights=None, threshold=0.95): # Adding all points to voxelizedPointCloud is useless and expensive, we should exclude threshold of all points - selected_indices = torch.randperm(len(valid_xyz))[:int((1 - threshold) * len(valid_xyz))] + selected_indices = torch.randperm(len(valid_xyz))[: int((1 - threshold) * len(valid_xyz))] if len(selected_indices) == 0: return if valid_xyz is not None: @@ -685,11 +816,13 @@ def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights = None, thresh if weights is not None: weights = weights[selected_indices] with self.voxel_map_lock: - self.voxel_map_localizer.add(points = valid_xyz, - features = feature, - rgb = valid_rgb, - weights = weights, - obs_count = self.obs_count) + self.voxel_map_localizer.add( + points=valid_xyz, + features=feature, + rgb=valid_rgb, + weights=weights, + obs_count=self.obs_count, + ) def process_rgbd_images(self, rgb, depth, intrinsics, pose): # import time @@ -698,29 +831,38 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): os.mkdir(self.log) self.obs_count += 1 - cv2.imwrite(self.log + '/rgb' + str(self.obs_count) + '.jpg', rgb[:, :, [2, 1, 0]]) - np.save(self.log + '/rgb' + str(self.obs_count) + '.npy', rgb) - np.save(self.log + '/depth' + str(self.obs_count) + '.npy', depth) - np.save(self.log + '/intrinsics' + 
str(self.obs_count) + '.npy', intrinsics) - np.save(self.log + '/pose' + str(self.obs_count) + '.npy', pose) + cv2.imwrite(self.log + "/rgb" + str(self.obs_count) + ".jpg", rgb[:, :, [2, 1, 0]]) + np.save(self.log + "/rgb" + str(self.obs_count) + ".npy", rgb) + np.save(self.log + "/depth" + str(self.obs_count) + ".npy", depth) + np.save(self.log + "/intrinsics" + str(self.obs_count) + ".npy", intrinsics) + np.save(self.log + "/pose" + str(self.obs_count) + ".npy", pose) rgb, depth = torch.from_numpy(rgb), torch.from_numpy(depth) rgb = rgb.permute(2, 0, 1).to(torch.uint8) - + h, w = self.image_shape h_image, w_image = depth.shape - rgb = F.interpolate(rgb.unsqueeze(0), size=self.image_shape, mode='bilinear', align_corners=False).squeeze(0) - depth = F.interpolate(depth.unsqueeze(0).unsqueeze(0), size=self.image_shape, mode='bilinear', align_corners=False).squeeze(0).squeeze(0) + rgb = F.interpolate( + rgb.unsqueeze(0), size=self.image_shape, mode="bilinear", align_corners=False + ).squeeze(0) + depth = ( + F.interpolate( + depth.unsqueeze(0).unsqueeze(0), + size=self.image_shape, + mode="bilinear", + align_corners=False, + ) + .squeeze(0) + .squeeze(0) + ) intrinsics[0, 0] *= w / w_image intrinsics[1, 1] *= h / h_image intrinsics[0, 2] *= w / w_image intrinsics[1, 2] *= h / h_image world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) - - median_depth = torch.from_numpy( - scipy.ndimage.median_filter(depth, size=5) - ) + + median_depth = torch.from_numpy(scipy.ndimage.median_filter(depth, size=5)) median_filter_error = (depth - median_depth).abs() # edges = get_edges(depth, threshold=0.5) valid_depth = torch.logical_and(depth < self.max_depth, depth > self.min_depth) @@ -729,25 +871,32 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): & (median_filter_error < 0.01).bool() # & ~edges ) - + with self.voxel_map_lock: - if self.vision_method == 'mask&*lip': + if self.vision_method == "mask&*lip": min_samples_clear = 3 else: min_samples_clear = 10 - self.voxel_map_localizer.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose), min_samples_clear = min_samples_clear) - self.voxel_map.voxel_pcd.clear_points(depth, torch.from_numpy(intrinsics), torch.from_numpy(pose)) - - if self.vision_method == 'mask&*lip': + self.voxel_map_localizer.voxel_pcd.clear_points( + depth, + torch.from_numpy(intrinsics), + torch.from_numpy(pose), + min_samples_clear=min_samples_clear, + ) + self.voxel_map.voxel_pcd.clear_points( + depth, torch.from_numpy(intrinsics), torch.from_numpy(pose) + ) + + if self.vision_method == "mask&*lip": self.run_owl_sam_clip(rgb, ~valid_depth, world_xyz) else: self.run_mask_clip(rgb, ~valid_depth, world_xyz) self.voxel_map.add( - camera_pose = torch.Tensor(pose), - rgb = torch.Tensor(rgb).permute(1, 2, 0), - depth = torch.Tensor(depth), - camera_K = torch.Tensor(intrinsics) + camera_pose=torch.Tensor(pose), + rgb=torch.Tensor(rgb).permute(1, 2, 0), + depth=torch.Tensor(depth), + camera_K=torch.Tensor(intrinsics), ) obs, exp = self.voxel_map.get_2d_map() if self.rerun: @@ -755,35 +904,43 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): # rr.set_time_sequence("frame", self.obs_count) # rr.log('robot_pov', rr.Image(rgb.permute(1, 2, 0)), static = self.static) if self.voxel_map.voxel_pcd._points is not None: - rr.log("Obstalce_map/pointcloud", rr.Points3D(self.voxel_map.voxel_pcd._points.detach().cpu(), \ - colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + rr.log( + 
"Obstalce_map/pointcloud", + rr.Points3D( + self.voxel_map.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) if self.voxel_map_localizer.voxel_pcd._points is not None: - rr.log("Semantic_memory/pointcloud", rr.Points3D(self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), \ - colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255., radii=0.03), static = self.static) + rr.log( + "Semantic_memory/pointcloud", + rr.Points3D( + self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) # rr.log("Obstalce_map/2D_obs_map", rr.Image(obs.int() * 127 + exp.int() * 127)) else: - cv2.imwrite(self.log + '/debug_' + str(self.obs_count) + '.jpg', np.asarray(obs.int() * 127 + exp.int() * 127)) + cv2.imwrite( + self.log + "/debug_" + str(self.obs_count) + ".jpg", + np.asarray(obs.int() * 127 + exp.int() * 127), + ) # end_time = time.time() # print('image processing takes ' + str(end_time - start_time) + ' seconds.') def read_from_pickle(self, pickle_file_name, num_frames: int = -1): - print('Reading from ', pickle_file_name) - rr.init('Debug', spawn = True) + print("Reading from ", pickle_file_name) + rr.init("Debug", spawn=True) if isinstance(pickle_file_name, str): pickle_file_name = Path(pickle_file_name) assert pickle_file_name.exists(), f"No file found at {pickle_file_name}" with pickle_file_name.open("rb") as f: data = pickle.load(f) - for i, ( - camera_pose, - xyz, - rgb, - feats, - depth, - base_pose, - K, - world_xyz, - ) in enumerate( + for i, (camera_pose, xyz, rgb, feats, depth, base_pose, K, world_xyz,) in enumerate( zip( data["camera_poses"], data["xyz"], @@ -842,14 +999,18 @@ def read_from_pickle(self, pickle_file_name, num_frames: int = -1): self.voxel_map_localizer.voxel_pcd._rgb = data["combined_rgb"] self.voxel_map_localizer.voxel_pcd._obs_counts = data["obs_id"] self.voxel_map_localizer.voxel_pcd._entity_ids = data["entity_id"] - self.voxel_map_localizer.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() - self.voxel_map.voxel_pcd.obs_count = max(self.voxel_map_localizer.voxel_pcd._obs_counts).item() + self.voxel_map_localizer.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() + self.voxel_map.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() def write_to_pickle(self): """Write out to a pickle file. 
This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" - if not os.path.exists('debug'): - os.mkdir('debug') - filename = 'debug/' + self.log + '.pkl' + if not os.path.exists("debug"): + os.mkdir("debug") + filename = "debug/" + self.log + ".pkl" data = {} data["camera_poses"] = [] data["camera_K"] = [] @@ -884,21 +1045,22 @@ def write_to_pickle(self): data["entity_id"] = self.voxel_map_localizer.voxel_pcd._entity_ids with open(filename, "wb") as f: pickle.dump(data, f) - print('write all data to', filename) + print("write all data to", filename) + # @hydra.main(version_base="1.2", config_path=".", config_name="config.yaml") # def main(cfg): # torch.manual_seed(1) # imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) - # imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) - # if not cfg.pickle_file_name is None: - # imageProcessor.read_from_pickle(cfg.pickle_file_name) - # print(imageProcessor.voxel_map_localizer.voxel_pcd._points) - # if cfg.open_communication: - # while True: - # imageProcessor.recv_text() - # imageProcessor.read_from_pickle('env.pkl', -1) - # imageProcessor.write_to_pickle() +# imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) +# if not cfg.pickle_file_name is None: +# imageProcessor.read_from_pickle(cfg.pickle_file_name) +# print(imageProcessor.voxel_map_localizer.voxel_pcd._points) +# if cfg.open_communication: +# while True: +# imageProcessor.recv_text() +# imageProcessor.read_from_pickle('env.pkl', -1) +# imageProcessor.write_to_pickle() if __name__ == "__main__": main(None) diff --git a/src/stretch/motion/kinematics.py b/src/stretch/motion/kinematics.py index bd645fcdc..b22716511 100644 --- a/src/stretch/motion/kinematics.py +++ b/src/stretch/motion/kinematics.py @@ -567,7 +567,7 @@ def manip_ik( update_pb: bool = True, num_attempts: int = 1, verbose: bool = False, - node_name = None + node_name=None, ): """IK in manipulation mode. Takes in a 4x4 pose_query matrix in se(3) and initial configuration of the robot. 
@@ -592,7 +592,7 @@ def manip_ik( raise NotImplementedError() q, success, debug_info = self.manip_ik_solver.compute_ik( - pos, quat, q0, num_attempts=num_attempts, verbose=verbose, node_name = node_name + pos, quat, q0, num_attempts=num_attempts, verbose=verbose, node_name=node_name ) if q is not None and success: diff --git a/src/stretch/motion/pinocchio_ik_solver.py b/src/stretch/motion/pinocchio_ik_solver.py index 656aa2229..d783f02cd 100644 --- a/src/stretch/motion/pinocchio_ik_solver.py +++ b/src/stretch/motion/pinocchio_ik_solver.py @@ -134,10 +134,16 @@ def _qmap_model2control(self, q_input: np.ndarray) -> np.ndarray: return q_out - def get_frame_pose(self, config: Union[np.ndarray, dict], node_a: str, node_b: str, ignore_missing_joints: bool = False): - ''' - Get a transformation matrix transforming from node_a frame to node_b frame - ''' + def get_frame_pose( + self, + config: Union[np.ndarray, dict], + node_a: str, + node_b: str, + ignore_missing_joints: bool = False, + ): + """ + Get a transformation matrix transforming from node_a frame to node_b frame + """ q_model = self._qmap_control2model(config, ignore_missing_joints=ignore_missing_joints) # print('q_model', q_model) pinocchio.forwardKinematics(self.model, self.data, q_model) @@ -194,7 +200,7 @@ def compute_ik( num_attempts: int = 1, verbose: bool = False, ignore_missing_joints: bool = False, - node_name = None + node_name=None, ) -> Tuple[np.ndarray, bool, dict]: """given end-effector position and quaternion, return joint values. @@ -221,7 +227,7 @@ def compute_ik( while True: pinocchio.forwardKinematics(self.model, self.data, q) if node_name is not None: - frame_idx = [f.name for f in self.model.frames].index(node_name) + frame_idx = [f.name for f in self.model.frames].index(node_name) else: frame_idx = self.ee_frame_idx pinocchio.updateFramePlacement(self.model, self.data, frame_idx) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 668ec5ca3..0939c1d6c 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -211,7 +211,7 @@ def __init__( vectors=[[1, 0, 0], [0, 1, 0], [0, 0, 1]], colors=[[255, 0, 0], [0, 255, 0], [0, 0, 255]], ), - static = True + static=True, ) self.bbox_colors_memory = {} @@ -241,10 +241,14 @@ def setup_blueprint(self, collapse_panels: bool): def log_head_camera(self, obs): """Log head camera pose and images""" rr.set_time_seconds("realtime", time.time()) - rr.log("world/head_camera/rgb", rr.Image(obs["rgb"]), static = True) - rr.log("world/head_camera/depth", rr.DepthImage(obs["depth"]), static = True) + rr.log("world/head_camera/rgb", rr.Image(obs["rgb"]), static=True) + rr.log("world/head_camera/depth", rr.DepthImage(obs["depth"]), static=True) rot, trans = decompose_homogeneous_matrix(obs["camera_pose"]) - rr.log("world/head_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static = True) + rr.log( + "world/head_camera", + rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), + static=True, + ) rr.log( "world/head_camera", rr.Pinhole( @@ -252,7 +256,7 @@ def log_head_camera(self, obs): image_from_camera=obs["camera_K"], image_plane_distance=0.15, ), - static = True + static=True, ) def log_robot_xyt(self, obs): @@ -268,10 +272,14 @@ def log_robot_xyt(self, obs): colors=[255, 0, 0, 255], ) - rr.log("world/robot/arrow", rb_arrow, static = True) - rr.log("world/robot/blob", rr.Points3D([0, 0, 0], colors=[255, 0, 0, 255], radii=0.13), static = True) + rr.log("world/robot/arrow", rb_arrow, 
static=True) + rr.log( + "world/robot/blob", + rr.Points3D([0, 0, 0], colors=[255, 0, 0, 255], radii=0.13), + static=True, + ) # rr.log("world/robot/arrow", rb_arrow) - + rr.log( "world/robot", rr.Transform3D( @@ -279,7 +287,7 @@ def log_robot_xyt(self, obs): rotation=rr.RotationAxisAngle(axis=[0, 0, 1], radians=theta), axis_length=0.7, ), - static = True + static=True, ) def log_ee_frame(self, obs): @@ -293,8 +301,10 @@ def log_ee_frame(self, obs): ee_arrow = rr.Arrows3D( origins=[0, 0, 0], vectors=[0.2, 0, 0], radii=0.02, labels="ee", colors=[0, 255, 0, 255] ) - rr.log("world/ee/arrow", ee_arrow, static = True) - rr.log("world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static = True) + rr.log("world/ee/arrow", ee_arrow, static=True) + rr.log( + "world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static=True + ) def log_ee_camera(self, servo): """Log end effector camera pose and images @@ -303,10 +313,14 @@ def log_ee_camera(self, servo): """ rr.set_time_seconds("realtime", time.time()) # EE Camera - rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb), static = True) - rr.log("world/ee_camera/depth", rr.DepthImage(servo.ee_depth), static = True) + rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb), static=True) + rr.log("world/ee_camera/depth", rr.DepthImage(servo.ee_depth), static=True) rot, trans = decompose_homogeneous_matrix(servo.ee_camera_pose) - rr.log("world/ee_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static = True) + rr.log( + "world/ee_camera", + rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), + static=True, + ) rr.log( "world/ee_camera", rr.Pinhole( @@ -314,7 +328,7 @@ def log_ee_camera(self, servo): image_from_camera=servo.ee_camera_K, image_plane_distance=0.15, ), - static = True + static=True, ) def log_robot_state(self, obs): @@ -322,7 +336,11 @@ def log_robot_state(self, obs): rr.set_time_seconds("realtime", time.time()) state = obs["joint"] for k in HelloStretchIdx.name_to_idx: - rr.log(f"robot_state/joint_pose/{k}", rr.Scalar(state[HelloStretchIdx.name_to_idx[k]]), static = True) + rr.log( + f"robot_state/joint_pose/{k}", + rr.Scalar(state[HelloStretchIdx.name_to_idx[k]]), + static=True, + ) def log_robot_transforms(self, obs): """ @@ -350,7 +368,7 @@ def update_voxel_map( rr.log( "world/point_cloud", rr.Points3D(positions=points, radii=np.ones(rgb.shape[0]) * 0.01, colors=np.int64(rgb)), - static = True + static=True, ) t1 = timeit.default_timer() @@ -378,7 +396,7 @@ def update_voxel_map( radii=np.ones(points.shape[0]) * obstacle_radius, colors=[255, 0, 0], ), - static = True + static=True, ) rr.log( "world/explored", @@ -387,7 +405,7 @@ def update_voxel_map( radii=np.ones(points.shape[0]) * explored_radius, colors=[255, 255, 255], ), - static = True + static=True, ) t6 = timeit.default_timer() @@ -426,7 +444,7 @@ def update_scene_graph( rr.log( f"world/{instance.id}_{name}", rr.Points3D(positions=point_cloud_rgb, colors=np.int64(pcd_rgb)), - static = True + static=True, ) half_sizes = [(b[0] - b[1]) / 2 for b in bbox_bounds] bounds.append(half_sizes) @@ -444,7 +462,7 @@ def update_scene_graph( radii=0.01, colors=colors, ), - static = True + static=True, ) t1 = timeit.default_timer() print("Time to log scene graph objects: ", t1 - t0) From 91ae6feda0d002ccd8081b56925e12010b6c6409 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 11:29:44 -0400 Subject: [PATCH 215/410] Fix llm server issues --- src/stretch/dynav/llm_server.py | 31 
+++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py index eaec0c5c4..eb395e8cf 100644 --- a/src/stretch/dynav/llm_server.py +++ b/src/stretch/dynav/llm_server.py @@ -14,46 +14,21 @@ # import wget import time -from collections import OrderedDict from io import BytesIO from pathlib import Path -import clip import cv2 -import hydra import numpy as np -import open3d as o3d import rerun as rr import scipy import torch -import torch.nn.functional as F -import torchvision.transforms.functional as V -import zmq from matplotlib import pyplot as plt from PIL import Image -from torchvision import transforms -from transformers import AutoModel, AutoProcessor from stretch.core import get_parameters -from stretch.dynav.communication_util import ( - load_socket, - recv_array, - recv_depth_img, - recv_everything, - recv_rgb_img, - send_array, - send_depth_img, - send_everything, - send_rgb_img, -) +from stretch.dynav.communication_util import load_socket, recv_everything from stretch.dynav.llm_localizer import LLM_Localizer -from stretch.dynav.mapping_utils import ( - AStar, - SparseVoxelMap, - SparseVoxelMapNavigationSpace, - VoxelizedPointcloud, -) -from stretch.dynav.scannet import CLASS_LABELS_200 +from stretch.dynav.mapping_utils import AStar, SparseVoxelMap, SparseVoxelMapNavigationSpace def get_inv_intrinsics(intrinsics): @@ -327,7 +302,7 @@ def process_text(self, text, start_pose): res = self.planner.plan(start_pose, point) if res.success: waypoints = [pt.state for pt in res.trajectory] - # If we are navigating to some object of interst, send (x, y, z) of + # If we are navigating to some object of interest, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation finished = len(waypoints) <= 10 and mode == "navigation" if not finished: From d61d11e7c3535823b604a71ef5ba7127df132239 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 11:31:04 -0400 Subject: [PATCH 216/410] update and fix readme spelling issues --- src/stretch/dynav/ok_robot_hw/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/stretch/dynav/ok_robot_hw/README.md b/src/stretch/dynav/ok_robot_hw/README.md index 5e79b1f41..d2c44a595 100644 --- a/src/stretch/dynav/ok_robot_hw/README.md +++ b/src/stretch/dynav/ok_robot_hw/README.md @@ -1,16 +1,16 @@ # Robot Side Code -Most of the heavy code will be runnning in the workstation and will communicate with the robot through sockets +Most of the heavy code will be running in the workstation and will communicate with the robot through sockets ## Preparation to run robot side codes Our hardware codes heavily rely on robot controller provided by [home-robot](https://github.com/facebookresearch/home-robot). -You need to install the home-robot on stretch robot following intructions provided by [home-robot-hw](https://github.com/facebookresearch/home-robot/blob/main/docs/install_robot.md) before running any robot side codes on robot. +You need to install the home-robot on stretch robot following instructions provided by [home-robot-hw](https://github.com/facebookresearch/home-robot/blob/main/docs/install_robot.md) before running any robot side codes on robot. To check whether home-robot is installed properly and got familiar with running home-robot based codes, we recommend you try to run [these test scripts](https://github.com/facebookresearch/home-robot/blob/main/tests/hw_manual_test.py). 
-Once you finised installing home-robot, follow [these steps](../docs/robot-installation.md) to enable OK-Robot use home-robot contollers. +Once you finished installing Home Robot, follow [these steps](../docs/robot-installation.md) to enable OK-Robot use home-robot controllers. The success of OK-Robot system also depends on robot calibration and accurate urdf file, so make sure you follow [these calibration steps](../docs/robot-calibration.md) to obtain an accurate urdf file for your robot. @@ -27,8 +27,8 @@ roslaunch home_robot_hw startup_stretch_hector_slam.launch python run.py -x1 [x1] -y1 [y1] -x2 [x2] -y2 [y2] -ip [your workstation ip] ``` -- **\[x1, y1\]** - Co-ordinated of tape on which the base of the robot is on -- **\[x2, y2\]** - Co-ordinates of the secondary tape. +- **\[x1, y1\]** - Coordinated of tape on which the base of the robot is on +- **\[x2, y2\]** - Coordinates of the secondary tape. - **ip** - Your workstation ip, the robot will try to communicate with this ip - **np** - Navigation port number, the robot will listen to this port number to get planned navigation paths (default: 5555) - **mp** - Manipulation port number, the robot will listen to this port number to get estimated manipulation pose (default: 5556) From e5716459f88e5aa6de5e9c9edfd8778f5793982f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 14:59:09 -0400 Subject: [PATCH 217/410] change path for debugging --- src/stretch/agent/manipulation/dinobot.py | 3 ++- src/stretch/visualization/urdf_visualizer.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/manipulation/dinobot.py b/src/stretch/agent/manipulation/dinobot.py index 5d68bb7f4..e056ab6ae 100644 --- a/src/stretch/agent/manipulation/dinobot.py +++ b/src/stretch/agent/manipulation/dinobot.py @@ -15,7 +15,8 @@ import numpy as np import torch -sys.path.append("/home/hello-robot/repos/dino-vit-features") +# sys.path.append("/home/hello-robot/repos/dino-vit-features") +sys.path.append("/home/cpaxton/src/dino-vit-features") import sys # Install this DINO repo to extract correspondences: https://github.com/ShirAmir/dino-vit-features diff --git a/src/stretch/visualization/urdf_visualizer.py b/src/stretch/visualization/urdf_visualizer.py index 3b42a9044..0963b8614 100644 --- a/src/stretch/visualization/urdf_visualizer.py +++ b/src/stretch/visualization/urdf_visualizer.py @@ -20,6 +20,7 @@ from stretch.motion import HelloStretchIdx pkg_path = str(importlib_resources.files("stretch_urdf")) + model_name = "SE3" # RE1V0, RE2V0, SE3 tool_name = "eoa_wrist_dw3_tool_sg3" # eoa_wrist_dw3_tool_sg3, tool_stretch_gripper, etc urdf_file_path = pkg_path + f"/{model_name}/stretch_description_{model_name}_{tool_name}.urdf" From ff48b9af55837cc7cbca3bf690bc930e85475943 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 15:02:52 -0400 Subject: [PATCH 218/410] updates to ovmm app --- src/stretch/app/ovmm.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/stretch/app/ovmm.py b/src/stretch/app/ovmm.py index 9dd1cb37e..0c9ebbdea 100644 --- a/src/stretch/app/ovmm.py +++ b/src/stretch/app/ovmm.py @@ -21,7 +21,7 @@ from stretch.agent.task.llm_plan import LLMPlanTask from stretch.agent.zmq_client import HomeRobotZmqClient from stretch.core import get_parameters -from stretch.llms.openai_client import OpenaiClient +from stretch.llms import get_llm_choices, get_llm_client from stretch.llms.prompts import ObjectManipNavPromptBuilder from stretch.perception import 
create_semantic_sensor @@ -56,6 +56,12 @@ ) # This threshold seems to work ok for Siglip - will not work for e.g. CLIP @click.option("--threshold", default=0.05, help="Threshold for similarity when using --all-matches") +@click.option( + "--llm", + default="openai", + help="Client to use for language model.", + type=click.Choice(get_llm_choices()), +) @click.option( "--stationary", is_flag=True, @@ -83,6 +89,7 @@ def main( all_matches: bool = False, threshold: float = 0.5, target_object: str = "toy", + llm: str = "openai", ): print("- Load parameters") @@ -111,7 +118,7 @@ def main( # agent.voxel_map.read_from_pickle(input_file) prompt = ObjectManipNavPromptBuilder() - client = OpenaiClient(prompt) + client = get_llm_client(llm, prompt=prompt) print("Starting robot exploration...") @@ -125,7 +132,9 @@ def main( visualize=False, ) - while True: + while robot.running: + + # Get a plan from the language model text = input("Enter a long horizon task: ") plan = client(text) print(f"Generated plan: \n{plan}") From df531a072cd073e08b2ad86cb23c152bd3b0839b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 15:06:56 -0400 Subject: [PATCH 219/410] add a docker --- docs/docker.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/docker.md b/docs/docker.md index b9e69c6a1..aed741d3c 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -2,6 +2,8 @@ This is a guide to setting up a Docker container for running the Stretch software. This is useful for running the software on a computer that doesn't have the correct dependencies installed, or for running the software in a controlled environment. +Hello Robot has a set of scripts for [building robot-side docker images](https://github.com/hello-robot/stretch_docker), which can be conconsidered separately. Here, we will go through what docker is, why we use it, and how to set it up on your client side to run AI code. + ## What is Docker and Why Would I Use It? Docker is a tool that allows you to run software in a container. A container is a lightweight, standalone, executable package of software that includes everything needed to run the software, including the code, runtime, system tools, libraries, and settings. Containers are isolated from each other and from the host system, so they are a great way to run software in a controlled environment. 
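Note on the `--llm` option introduced in the ovmm.py patch above: the app now selects its language-model backend through the `stretch.llms` factory helpers rather than constructing `OpenaiClient` directly. Below is a minimal sketch of that pattern, assuming `get_llm_client(name, prompt=...)` returns a callable client and `get_llm_choices()` returns the valid backend names, as the diff suggests; the helper function and example task string are hypothetical, not part of the patch.

```python
# Sketch only: mirrors the client-selection pattern in the ovmm.py diff above.
# Assumes get_llm_client(name, prompt=...) yields a callable that maps a
# natural-language task string to a plan string, as used in main().
from stretch.llms import get_llm_choices, get_llm_client
from stretch.llms.prompts import ObjectManipNavPromptBuilder


def build_planner_client(llm: str = "openai"):
    """Hypothetical helper: pick an LLM backend by name and wrap it with the
    object-manipulation/navigation prompt builder used by the ovmm app."""
    if llm not in get_llm_choices():
        raise ValueError(f"unknown LLM backend: {llm}")
    prompt = ObjectManipNavPromptBuilder()
    return get_llm_client(llm, prompt=prompt)


# Usage (hypothetical task string):
# client = build_planner_client("openai")
# plan = client("bring the toy to the kitchen table")
```

Keeping the client callable with the same signature regardless of backend is what lets the `--llm` click option above swap language models without touching the planning loop.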
From a50129aafdcf0c800c1274a0b8320553005d8934 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 15:32:23 -0400 Subject: [PATCH 220/410] rate no longer supported --- src/stretch/app/ovmm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/stretch/app/ovmm.py b/src/stretch/app/ovmm.py index 0c9ebbdea..f974dd0cf 100644 --- a/src/stretch/app/ovmm.py +++ b/src/stretch/app/ovmm.py @@ -123,7 +123,6 @@ def main( print("Starting robot exploration...") agent.run_exploration( - rate=10, manual_wait=False, explore_iter=20, task_goal=target_object, # arbitrary object to collect From 0c6c3e94afdabaaa885e2f6cceccd794fc57962f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 15:34:29 -0400 Subject: [PATCH 221/410] updating stuff here --- src/stretch/app/ovmm.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/stretch/app/ovmm.py b/src/stretch/app/ovmm.py index f974dd0cf..3f257db05 100644 --- a/src/stretch/app/ovmm.py +++ b/src/stretch/app/ovmm.py @@ -62,11 +62,7 @@ help="Client to use for language model.", type=click.Choice(get_llm_choices()), ) -@click.option( - "--stationary", - is_flag=True, - help="Don't move the robot to the instance, if using real robot instead of offline data", -) +@click.option("--explore-iter", default=10, type=int, help="Number of iterations to explore") @click.option("--target_object", type=str, default="toy", help="Type of object to pick up and move") def main( device_id: int = 0, @@ -85,7 +81,6 @@ def main( frame: int = -1, text: str = "", yes: bool = False, - stationary: bool = False, all_matches: bool = False, threshold: float = 0.5, target_object: str = "toy", @@ -124,7 +119,7 @@ def main( agent.run_exploration( manual_wait=False, - explore_iter=20, + explore_iter=explore_iter, task_goal=target_object, # arbitrary object to collect # as many instances as possible go_home_at_end=True, @@ -137,7 +132,11 @@ def main( text = input("Enter a long horizon task: ") plan = client(text) print(f"Generated plan: \n{plan}") - proceed = input("Proceed with plan? [y/n]: ") + + if yes: + proceed = True + else: + proceed = input("Proceed with plan? 
[y/n]: ") if plan.startswith("```python"): plan = plan.split("\n", 1)[1] From 4d44be3be352bd2a447e518d741b4c942ff96281 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 16:04:47 -0400 Subject: [PATCH 222/410] update rerun --- src/stretch/visualization/rerun.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index f6aabed47..1de83ab27 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -181,6 +181,7 @@ def __init__( open_browser: bool = True, server_memory_limit: str = "4GB", collapse_panels: bool = True, + show_cameras_in_3d_view: bool = False, ): """Rerun visualizer class Args: @@ -193,6 +194,7 @@ def __init__( rr.serve(open_browser=open_browser, server_memory_limit=server_memory_limit) self.display_robot_mesh = display_robot_mesh + self.show_cameras_in_3d_view = show_cameras_in_3d_view if self.display_robot_mesh: self.urdf_logger = StretchURDFLogger() @@ -240,8 +242,9 @@ def setup_blueprint(self, collapse_panels: bool): def log_head_camera(self, obs): """Log head camera pose and images""" rr.set_time_seconds("realtime", time.time()) - rr.log("world/head_camera/rgb", rr.Image(obs["rgb"])) - rr.log("world/head_camera/depth", rr.DepthImage(obs["depth"])) + if self.show_cameras_in_3d_view: + rr.log("world/head_camera/rgb", rr.Image(obs["rgb"])) + rr.log("world/head_camera/depth", rr.DepthImage(obs["depth"])) rot, trans = decompose_homogeneous_matrix(obs["camera_pose"]) rr.log("world/head_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3)) rr.log( @@ -296,8 +299,9 @@ def log_ee_camera(self, servo): """ rr.set_time_seconds("realtime", time.time()) # EE Camera - rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb)) - rr.log("world/ee_camera/depth", rr.DepthImage(servo.ee_depth)) + if self.show_cameras_in_3d_view: + rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb)) + rr.log("world/ee_camera/depth", rr.DepthImage(servo.ee_depth)) rot, trans = decompose_homogeneous_matrix(servo.ee_camera_pose) rr.log("world/ee_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3)) rr.log( From 0707f4c01440cd9a3e70f32b6078ad56915b4c83 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 23 Sep 2024 16:20:39 -0400 Subject: [PATCH 223/410] colorized point clouds in rerun --- src/stretch/visualization/rerun.py | 101 +++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 28 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 1de83ab27..6df2b1075 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -18,6 +18,7 @@ import rerun.blueprint as rrb import torch +from stretch.core.interfaces import Observations from stretch.mapping.scene_graph import SceneGraph from stretch.mapping.voxel.voxel_map import SparseVoxelMapNavigationSpace from stretch.motion import HelloStretchIdx @@ -175,6 +176,9 @@ def log_transforms(self, obs, debug: bool = False): class RerunVsualizer: + + camera_point_radius = 0.01 + def __init__( self, display_robot_mesh: bool = True, @@ -182,6 +186,7 @@ def __init__( server_memory_limit: str = "4GB", collapse_panels: bool = True, show_cameras_in_3d_view: bool = False, + show_camera_point_clouds: bool = True, ): """Rerun visualizer class Args: @@ -195,6 +200,7 @@ def __init__( self.display_robot_mesh = display_robot_mesh self.show_cameras_in_3d_view = show_cameras_in_3d_view + self.show_camera_point_clouds = 
show_camera_point_clouds if self.display_robot_mesh: self.urdf_logger = StretchURDFLogger() @@ -239,24 +245,44 @@ def setup_blueprint(self, collapse_panels: bool): ) rr.send_blueprint(my_blueprint) - def log_head_camera(self, obs): - """Log head camera pose and images""" + def log_head_camera(self, obs: Observations): + """Log head camera pose and images + + Args: + obs (Observations): Observation dataclass + """ rr.set_time_seconds("realtime", time.time()) + rr.log("world/head_camera/rgb", rr.Image(obs.rgb)) + + if self.show_camera_point_clouds: + head_xyz = obs.get_xyz_in_world_frame().reshape(-1, 3) + head_rgb = obs.rgb.reshape(-1, 3) + rr.log( + "world/head_camera/points", + rr.Points3D( + positions=head_xyz, + radii=np.ones(head_xyz.shape[:2]) * self.camera_point_radius, + colors=np.int64(head_rgb), + ), + ) + else: + rr.log("world/head_camera/depth", rr.depthimage(obs.head_depth)) + if self.show_cameras_in_3d_view: - rr.log("world/head_camera/rgb", rr.Image(obs["rgb"])) - rr.log("world/head_camera/depth", rr.DepthImage(obs["depth"])) - rot, trans = decompose_homogeneous_matrix(obs["camera_pose"]) - rr.log("world/head_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3)) - rr.log( - "world/head_camera", - rr.Pinhole( - resolution=[obs["rgb"].shape[1], obs["rgb"].shape[0]], - image_from_camera=obs["camera_K"], - image_plane_distance=0.15, - ), - ) + rot, trans = decompose_homogeneous_matrix(obs.camera_pose) + rr.log( + "world/head_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3) + ) + rr.log( + "world/head_camera", + rr.Pinhole( + resolution=[obs.rgb.shape[1], obs.rgb.shape[0]], + image_from_camera=obs.camera_K, + image_plane_distance=0.15, + ), + ) - def log_robot_xyt(self, obs): + def log_robot_xyt(self, obs: Observations): """Log robot world pose""" rr.set_time_seconds("realtime", time.time()) xy = obs["gps"] @@ -299,19 +325,35 @@ def log_ee_camera(self, servo): """ rr.set_time_seconds("realtime", time.time()) # EE Camera + rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb)) + + if self.show_camera_point_clouds: + ee_xyz = servo.get_ee_xyz_in_world_frame().reshape(-1, 3) + ee_rgb = servo.ee_rgb.reshape(-1, 3) + rr.log( + "world/ee_camera/points", + rr.Points3D( + positions=ee_xyz, + radii=np.ones(ee_xyz.shape[:2]) * self.camera_point_radius, + colors=np.int64(ee_rgb), + ), + ) + else: + rr.log("world/ee_camera/depth", rr.depthimage(servo.ee_depth)) + if self.show_cameras_in_3d_view: - rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb)) - rr.log("world/ee_camera/depth", rr.DepthImage(servo.ee_depth)) - rot, trans = decompose_homogeneous_matrix(servo.ee_camera_pose) - rr.log("world/ee_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3)) - rr.log( - "world/ee_camera", - rr.Pinhole( - resolution=[servo.ee_rgb.shape[1], servo.ee_rgb.shape[0]], - image_from_camera=servo.ee_camera_K, - image_plane_distance=0.15, - ), - ) + rot, trans = decompose_homogeneous_matrix(servo.ee_camera_pose) + rr.log( + "world/ee_camera", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3) + ) + rr.log( + "world/ee_camera", + rr.Pinhole( + resolution=[servo.ee_rgb.shape[1], servo.ee_rgb.shape[0]], + image_from_camera=servo.ee_camera_K, + image_plane_distance=0.15, + ), + ) def log_robot_state(self, obs): """Log robot joint states""" @@ -467,9 +509,12 @@ def step(self, obs, servo): try: t0 = timeit.default_timer() self.log_robot_xyt(obs) - self.log_head_camera(obs) self.log_ee_frame(obs) + + # Cameras use the lower-res servo object + 
self.log_head_camera(servo) self.log_ee_camera(servo) + self.log_robot_state(obs) if self.display_robot_mesh: From 22681492ed5136d962ba3737538166ab524114fb Mon Sep 17 00:00:00 2001 From: stretch-se3-3055 Date: Mon, 23 Sep 2024 15:38:22 -0700 Subject: [PATCH 224/410] update --- src/stretch/config/control/noplan_velocity_hw.yaml | 8 ++++---- src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py | 2 +- src/stretch_ros2_bridge/launch/cameras.launch.py | 6 +++--- .../stretch_ros2_bridge/remote/modules/nav.py | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/stretch/config/control/noplan_velocity_hw.yaml b/src/stretch/config/control/noplan_velocity_hw.yaml index fb560ce42..672574ce0 100644 --- a/src/stretch/config/control/noplan_velocity_hw.yaml +++ b/src/stretch/config/control/noplan_velocity_hw.yaml @@ -5,16 +5,16 @@ #w_max: 0.5 # (vel_m_max - vel_m_default) / wheel_separation_m #acc_lin: 0.2 # 0.5 * base.params["motion"]["max"]["accel_m"] #acc_ang: 0.4 # 0.5 * (accel_m_max - accel_m_default) / wheel_separation_m -v_max: 0.2 # base.params["motion"]["default"]["vel_m"] -w_max: 0.5 # (vel_m_max - vel_m_default) / wheel_separation_m +v_max: 10.0 # base.params["motion"]["default"]["vel_m"] +w_max: 5.0 # (vel_m_max - vel_m_default) / wheel_separation_m #w_max: 0.6 # (vel_m_max - vel_m_default) / wheel_separation_m acc_lin: 0.1 # 0.5 * base.params["motion"]["max"]["accel_m"] acc_ang: 0.2 # 0.5 * (accel_m_max - accel_m_default) / wheel_separation_m #acc_ang: 0.3 # 0.5 * (accel_m_max - accel_m_default) / wheel_separation_m # Tolerances for determining whether the goal position or orientation is reached -lin_error_tol: 0.12 -ang_error_tol: 0.12 +lin_error_tol: 0.15 +ang_error_tol: 0.15 # Error tolerance ratio - scale target error by this to a minimum of the tolerance below lin_error_ratio: 0.5 diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index de23b0320..f1c7507e2 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -169,7 +169,7 @@ def pickup(robot, rotation, translation, base_node, gripper_node, gripper_height # Moving gripper to a point that is 0.2m away from the pose center in the line of gripper robot.move_to_pose( - [pin_transformed_point1[0], pin_transformed_point1[1], pin_transformed_point1[2] - 0.22], + [pin_transformed_point1[0], pin_transformed_point1[1], pin_transformed_point1[2] - 0.2], [0, 0, 0], [1], move_mode = 1 diff --git a/src/stretch_ros2_bridge/launch/cameras.launch.py b/src/stretch_ros2_bridge/launch/cameras.launch.py index ced4403be..6e7ab248a 100644 --- a/src/stretch_ros2_bridge/launch/cameras.launch.py +++ b/src/stretch_ros2_bridge/launch/cameras.launch.py @@ -26,9 +26,9 @@ def generate_launch_description(): # "temporal_filter.enable": "True", # "disparity_filter.enable": "False", "device_type": "d435i", - "rgb_camera.color_profile": "424x240x30", - "depth_module.depth_profile": "424x240x30", - "depth_module.infra_profile": "424x240x30", + "rgb_camera.color_profile": "640x480x30", + "depth_module.depth_profile": "640x480x30", + "depth_module.infra_profile": "640x480x30", "enable_gyro": "true", "enable_accel": "true", "gyro_fps": "200", diff --git a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/modules/nav.py b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/modules/nav.py index 6bdc709d6..eb0cd57a3 100644 --- a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/modules/nav.py +++ 
b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/modules/nav.py @@ -155,8 +155,8 @@ def set_velocity(self, v, w): Directly sets the linear and angular velocity of robot base. """ msg = Twist() - msg.linear.x = v - msg.angular.z = w + msg.linear.x = float(v) + msg.angular.z = float(w) self._ros_client.goto_off_service.call(Trigger.Request()) self._ros_client.velocity_pub.publish(msg) From e0a3a287aab4f851db14bd0550c310a64625eba1 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 24 Sep 2024 10:51:38 -0400 Subject: [PATCH 225/410] Update robocasa --- src/stretch/simulation/mujoco_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index 696e16f37..2e58d2234 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -20,11 +20,12 @@ try: from stretch_mujoco.robocasa_gen import model_generation_wizard -except: +except ImportError as e: from stretch.utils.logger import error error("Could not import robocasa!") error("Install robosuite and robocasa in order to use model generation wizard.") + error(f"Error: {e}") import stretch.motion.constants as constants import stretch.utils.compression as compression From 1a161dd9f0674b0ad8d8283cd785aec1127dbfa8 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 24 Sep 2024 14:25:51 -0400 Subject: [PATCH 226/410] Add docker startup script --- scripts/run_stretch_ai_ros2_bridge_server.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 scripts/run_stretch_ai_ros2_bridge_server.sh diff --git a/scripts/run_stretch_ai_ros2_bridge_server.sh b/scripts/run_stretch_ai_ros2_bridge_server.sh new file mode 100644 index 000000000..32942fa3d --- /dev/null +++ b/scripts/run_stretch_ai_ros2_bridge_server.sh @@ -0,0 +1,12 @@ +#!/bin/bash +echo "Starting Stretch AI ROS2 Bridge Server on $HELLO_FLEET_ID" +echo "=========================================================" +sudo docker run -it \ + --net=host \ + --privileged \ + -v /dev:/dev \ + -v /home/hello-robot/stretch_user:/home/hello-robot/stretch_user \ + -v /home/hello-robot/ament_ws/install/stretch_description/share/stretch_description/urdf:/home/hello-robot/stretch_description/share/stretch_description/urdf \ + -e HELLO_FLEET_ID=$HELLO_FLEET_ID \ + hellorobotinc/stretch-ai-ros2-bridge \ + bash -c "source /home/hello-robot/.bashrc; export HELLO_FLEET_ID=$HELLO_FLEET_ID; ros2 launch stretch_ros2_bridge server.launch.py" From b0208a84195bfe0bfe0f98a88dc2bb3db4ac7922 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 24 Sep 2024 15:36:31 -0400 Subject: [PATCH 227/410] tweaks to exploration; better collision avoidance etc --- src/stretch/config/default_planner.yaml | 6 +++--- src/stretch/visualization/rerun.py | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 630f53894..1ea2f2662 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -37,7 +37,7 @@ filters: smooth_kernel_size: 2 # smooth_kernel_size: 0 use_median_filter: True - median_filter_size: 2 + median_filter_size: 4 median_filter_max_error: 0.01 use_derivative_filter: False derivative_filter_threshold: 0.1 @@ -101,8 +101,8 @@ scene_graph: # Navigation space - used for motion planning and computing goals. 
motion_planner: - step_size: 0.1 - rotation_step_size: 0.2 + step_size: 0.05 + rotation_step_size: 0.1 simplify_plans: True shortcut_plans: True shortcut_iter: 100 diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 6df2b1075..6804c31f2 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -371,10 +371,12 @@ def update_voxel_map( self, space: SparseVoxelMapNavigationSpace, debug: bool = False, - explored_radius=0.01, + explored_radius=0.025, obstacle_radius=0.05, + world_radius=0.03, ): """Log voxel map and send it to Rerun visualizer + Args: space (SparseVoxelMapNavigationSpace): Voxel map object """ @@ -387,7 +389,9 @@ def update_voxel_map( rr.log( "world/point_cloud", - rr.Points3D(positions=points, radii=np.ones(rgb.shape[0]) * 0.01, colors=np.int64(rgb)), + rr.Points3D( + positions=points, radii=np.ones(rgb.shape[0]) * world_radius, colors=np.int64(rgb) + ), ) t1 = timeit.default_timer() From d22f6b732f4f514f787af72163b3423ef8149dab Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 24 Sep 2024 15:41:33 -0400 Subject: [PATCH 228/410] can we defect negative obstacles --- src/stretch/config/default_planner.yaml | 2 +- src/stretch/mapping/voxel/voxel.py | 6 ++++-- src/stretch/utils/point_cloud.py | 12 ++++++++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 1ea2f2662..30fddfaea 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -11,7 +11,7 @@ tts_engine: "gTTS" voxel_size: 0.04 # Size of a voxel in meters obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) -neg_obs_height: -0.10 # Things less than this height ARE obstacles +neg_obs_height: -0.05 # Things less than this height ARE obstacles obs_min_density: 50 # This many points makes it an obstacle min_points_per_voxel: 50 # Drop things below this density per voxel diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 78febdbd7..e621207c0 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -110,7 +110,6 @@ class SparseVoxelMap(object): ) debug_valid_depth: bool = False debug_instance_memory_processing_time: bool = False - use_negative_obstacles: bool = False def __init__( self, @@ -144,6 +143,7 @@ def __init__( prune_detected_objects: bool = False, add_local_radius_every_step: bool = False, min_points_per_voxel: int = 10, + use_negative_obstacles: bool = False, ): """ Args: @@ -199,6 +199,7 @@ def __init__( self.median_filter_size = median_filter_size self.use_median_filter = use_median_filter self.median_filter_max_error = median_filter_max_error + self.use_negative_obstacles = use_negative_obstacles # Derivative filter params self.use_derivative_filter = use_derivative_filter @@ -1191,7 +1192,6 @@ def from_parameters( grid_resolution=parameters["voxel_size"], obs_min_height=parameters["obs_min_height"], obs_max_height=parameters["obs_max_height"], - neg_obs_height=parameters["neg_obs_height"], min_depth=parameters["min_depth"], max_depth=parameters["max_depth"], add_local_radius_every_step=parameters["add_local_every_step"], @@ -1209,6 +1209,8 @@ def from_parameters( median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), use_derivative_filter=parameters.get("filters/use_derivative_filter", False), 
derivative_filter_threshold=parameters.get("filters/derivative_filter_threshold", 0.5), + use_negative_obstacles=parameters.get("use_negative_obstacles", False), + neg_obs_height=parameters.get("neg_obs_height", -0.10), use_instance_memory=use_instance_memory, instance_memory_kwargs={ "min_pixels_for_instance_view": parameters.get("min_pixels_for_instance_view", 100), diff --git a/src/stretch/utils/point_cloud.py b/src/stretch/utils/point_cloud.py index aa70dac83..4f4dd83c9 100644 --- a/src/stretch/utils/point_cloud.py +++ b/src/stretch/utils/point_cloud.py @@ -21,11 +21,18 @@ from scipy.spatial import cKDTree -def numpy_to_pcd(xyz: np.ndarray, rgb: np.ndarray = None) -> o3d.geometry.PointCloud: +def numpy_to_pcd( + xyz: np.ndarray, rgb: np.ndarray = None, max_points: int = -1 +) -> o3d.geometry.PointCloud: """Create an open3d pointcloud from a single xyz/rgb pair""" xyz = xyz.reshape(-1, 3) if rgb is not None: rgb = rgb.reshape(-1, 3) + if max_points > 0: + idx = np.random.choice(xyz.shape[0], max_points, replace=False) + xyz = xyz[idx] + if rgb is not None: + rgb = rgb[idx] pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(xyz) if rgb is not None: @@ -48,13 +55,14 @@ def show_point_cloud( save: str = None, grasps: list = None, size: float = 0.1, + max_points: int = 10000, ): """Shows the point-cloud described by np.ndarrays xyz & rgb. Optional origin and rotation params are for showing origin coordinate. Optional grasps param for showing a list of 6D poses as coordinate frames. size controls scale of coordinate frame's size """ - pcd = numpy_to_pcd(xyz, rgb) + pcd = numpy_to_pcd(xyz, rgb, max_points=max_points) show_pcd(pcd, orig=orig, R=R, save=save, grasps=grasps, size=size) From 8874941186a802248aee562c1f73b3ad54a82c3c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 24 Sep 2024 15:48:09 -0400 Subject: [PATCH 229/410] add obstacles tracking negative heights --- src/stretch/config/default_planner.yaml | 1 + src/stretch/mapping/voxel/voxel.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 30fddfaea..ddc95405d 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -12,6 +12,7 @@ voxel_size: 0.04 # Size of a voxel in meters obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles +use_negative_obstacles: True # Use the negative height as an obstacle obs_min_density: 50 # This many points makes it an obstacle min_points_per_voxel: 50 # Drop things below this density per voxel diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index e621207c0..5eb015c06 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -848,12 +848,13 @@ def get_2d_map(self, debug: bool = False) -> Tuple[np.ndarray, np.ndarray]: # Mask out obstacles only above a certain height obs_mask = xyz[:, -1] < max_height + xyz = xyz[obs_mask, :] + counts = counts[obs_mask][:, None] + if self.use_negative_obstacles: neg_height = int(self.neg_obs_height / self.grid_resolution) negative_obstacles = xyz[:, -1] < neg_height - obs_mask = obs_mask | negative_obstacles - xyz = xyz[obs_mask, :] - counts = counts[obs_mask][:, None] + xyz[negative_obstacles, -1] = min_height + 1 # voxels[x_coords, y_coords, z_coords] = 1 voxels = 
scatter3d(xyz, counts, grid_size).squeeze() From 1ca16af959d2a30221eccacdeac6a59eefb579d0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 25 Sep 2024 08:45:58 -0400 Subject: [PATCH 230/410] fix various issues with robot code --- src/stretch/dynav/ok_robot_hw/robot.py | 27 ++++++++++---------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index ea7b5b762..dd0793e7d 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -7,11 +7,8 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. -import math import os -import random -import sys -import time +from typing import Any, Dict import numpy as np import pinocchio as pin @@ -23,7 +20,7 @@ from stretch.dynav.ok_robot_hw.utils import transform_joint_array from stretch.motion.kinematics import HelloStretchIdx -OVERRIDE_STATES = {} +OVERRIDE_STATES: Dict[str, Any] = {} class HelloRobot: @@ -136,18 +133,14 @@ def move_to_position( target_state[5] = wrist_roll # Actual Movement -<<<<<<< HEAD - self.robot.arm_to(target_state, blocking = blocking) - print('Expected', target_state) - print('Actual', self.robot.get_six_joints()) - print('Error', target_state - self.robot.get_six_joints()) -======= + print("Expected", target_state) + print("Actual", self.robot.get_six_joints()) + print("Error", target_state - self.robot.get_six_joints()) # print('Target Position', target_state) # print('pan tilt before', self.robot.get_pan_tilt()) self.robot.arm_to(target_state, blocking=blocking, head=np.array([self.pan, self.tilt])) # print('pan tilt after', self.robot.get_pan_tilt()) # print('Actual location', self.robot.get_six_joints()) ->>>>>>> bd400ba42984a5606fbad1044950d8f36ce190ea # Head state update and Movement # target_head_pan, target_head_tilt = self.robot.get_pan_tilt() @@ -166,7 +159,7 @@ def move_to_position( def pickup(self, width): """ Code for grasping the object - Gripper closes gradually until it encounters resistence + Gripper closes gradually until it encounters resistance """ next_gripper_pos = width while True: @@ -186,7 +179,7 @@ def pickup(self, width): def updateJoints(self): """ - update all the current poisitions of joints + update all the current positions of joints """ state = self.robot.get_six_joints() origin_dist = state[0] @@ -212,7 +205,7 @@ def updateJoints(self): # following function is used to move the robot to a desired joint configuration def move_to_joints(self, joints, gripper, mode=0): """ - Given the desrired joints movement this fucntion will the joints accordingly + Given the desired joints movement this function will the joints accordingly """ state = self.robot.get_six_joints() @@ -238,7 +231,7 @@ def move_to_joints(self, joints, gripper, mode=0): target1[0] = target_state[0] target1[1] = min(1.1, target_state[1] + 0.2) self.robot.arm_to(target1, blocking=True, head=np.array([self.pan, self.tilt])) - + self.robot.arm_to(target_state, blocking=True, head=np.array([self.pan, self.tilt])) self.robot.head_to(head_tilt=self.tilt, head_pan=self.pan, blocking=True) @@ -250,7 +243,7 @@ def get_joint_transform(self, node1, node2): This function takes two nodes from a robot URDF file as input and outputs the coordinate frame of node2 relative to the coordinate frame of node1. - Mainly used for transforming co-ordinates from camera frame to gripper frame. 
+ Mainly used for transforming coordinates from camera frame to gripper frame. """ # return frame_transform, frame2, frame1 From 5f4e34041e429e54a94f6856672085ac525a46b3 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 25 Sep 2024 09:22:36 -0400 Subject: [PATCH 231/410] update code to fix various linter errors --- .codespell-ignore-words.txt | 1 + src/stretch/dynav/llm_server.py | 2 +- src/stretch/dynav/robot_agent_manip_mdp.py | 36 +++++++++----------- src/stretch/dynav/voxel_map_localizer.py | 13 +++----- src/stretch/dynav/voxel_map_server.py | 39 +++++++--------------- 5 files changed, 33 insertions(+), 58 deletions(-) diff --git a/.codespell-ignore-words.txt b/.codespell-ignore-words.txt index b7860a05a..59169ad53 100644 --- a/.codespell-ignore-words.txt +++ b/.codespell-ignore-words.txt @@ -1 +1,2 @@ empy +crate diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py index aba8f0d5d..2276fd891 100644 --- a/src/stretch/dynav/llm_server.py +++ b/src/stretch/dynav/llm_server.py @@ -94,7 +94,7 @@ def get_xyz(depth, pose, intrinsics): return xyz -class ImageProcessor: +class LLMImageProcessor: def __init__( self, vision_method="pro_owl", diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index ec74828c4..7f0148c47 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -11,28 +11,22 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import copy import datetime -import os -import pickle -import threading import time -from multiprocessing import Process -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List import cv2 import numpy as np -import torch import zmq -from matplotlib import pyplot as plt from stretch.agent import RobotClient -from stretch.core.parameters import Parameters, get_parameters +from stretch.core.interfaces import Observations +from stretch.core.parameters import Parameters from stretch.dynav.ok_robot_hw.camera import RealSenseCamera from stretch.dynav.ok_robot_hw.global_parameters import * from stretch.dynav.ok_robot_hw.robot import HelloRobot as Manipulation_Wrapper -from stretch.dynav.ok_robot_hw.utils.communication_utils import recv_array, send_array + +# from stretch.dynav.ok_robot_hw.utils.communication_utils import recv_array, send_array from stretch.dynav.ok_robot_hw.utils.grasper_utils import ( capture_and_process_image, move_to_point, @@ -85,7 +79,7 @@ def __init__( self.pos_err_threshold = 0.35 self.rot_err_threshold = 0.4 self.obs_count = 0 - self.obs_history = [] + self.obs_history: List[Observations] = [] self.guarantee_instance_is_reachable = parameters.guarantee_instance_is_reachable self.image_sender = ImageSender( @@ -96,16 +90,16 @@ def __init__( self.image_processor = ImageProcessor( rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) - ) + ) # type: ignore elif method == "mllm": - from stretch.dynav.llm_server import ImageProcessor + from stretch.dynav.llm_server import LLMImageProcessor - self.image_processor = ImageProcessor( + self.image_processor = LLMImageProcessor( rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) - ) + ) # type: ignore - self.look_around_times = [] - self.execute_times = [] + self.look_around_times: List[float] = [] + self.execute_times: List[float] = [] timestamp = f"{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}" @@ 
-163,7 +157,7 @@ def execute_action( look_around_finish = time.time() look_around_take = look_around_finish - start_time print("Path planning takes ", look_around_take, " seconds.") - # self.look_around_times.append(look_around_take) + self.look_around_times.append(look_around_take) # print(self.look_around_times) # print(sum(self.look_around_times) / len(self.look_around_times)) @@ -276,7 +270,7 @@ def place(self, text, init_tilt=INIT_HEAD_TILT, base_node=TOP_CAMERA_NODE): self.manip_wrapper.move_to_position(gripper_pos=1, lift_pos=1.05, arm_pos=0) self.manip_wrapper.move_to_position(wrist_pitch=-1.57) - # Shift the base back to the original point as we are certain that orginal point is navigable in navigation obstacle map + # Shift the base back to the original point as we are certain that original point is navigable in navigation obstacle map self.manip_wrapper.move_to_position( base_trans=-self.manip_wrapper.robot.get_six_joints()[0] ) @@ -340,7 +334,7 @@ def manipulate(self, text, init_tilt=INIT_HEAD_TILT, base_node=TOP_CAMERA_NODE): gripper_width=gripper_width, ) - # Shift the base back to the original point as we are certain that orginal point is navigable in navigation obstacle map + # Shift the base back to the original point as we are certain that original point is navigable in navigation obstacle map self.manip_wrapper.move_to_position( base_trans=-self.manip_wrapper.robot.get_six_joints()[0] ) diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index 6bd42a1bb..fcc8590fe 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -7,22 +7,17 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. -import math -from typing import List, Optional, Tuple, Union +from typing import Optional import clip -import cv2 import numpy as np import torch import torch.nn.functional as F -import torchvision.transforms as T -from PIL import Image from sklearn.cluster import DBSCAN from torch import Tensor -from torchvision.transforms.functional import InterpolationMode # from ultralytics import YOLOWorld -from transformers import AutoModel, AutoProcessor, Owlv2ForObjectDetection, Owlv2Processor +from transformers import AutoModel, AutoProcessor, Owlv2ForObjectDetection from stretch.dynav.mapping_utils import VoxelizedPointcloud @@ -263,7 +258,7 @@ def localize_A(self, A, debug=True, return_debug=False): # res = self.compute_coord(A, obs) # if res is not None: # target_point = res - # debug_text += '#### - Obejct is detected in observations where instance' + str(idx + 1) + ' comes from. **😃** Directly navigate to it.\n' + # debug_text += '#### - Object is detected in observations where instance' + str(idx + 1) + ' comes from. **😃** Directly navigate to it.\n' # break # if self.siglip: # cosine_similarity_check = similarity > 0.14 @@ -301,7 +296,7 @@ def localize_A(self, A, debug=True, return_debug=False): if res is not None: target_point = res debug_text += ( - "#### - Obejct is detected in observations . **😃** Directly navigate to it.\n" + "#### - Object is detected in observations . **😃** Directly navigate to it.\n" ) else: # debug_text += '#### - Directly ignore this instance is the target. 
**😞** \n' diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index 5437a0d5f..f1d85da55 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -19,18 +19,14 @@ import clip import cv2 -import hydra import numpy as np -import open3d as o3d import rerun as rr import scipy import torch # from stretch.utils.morphology import get_edges import torch.nn.functional as F -import torchvision.transforms.functional as V import wget -import zmq from matplotlib import pyplot as plt from PIL import Image from sam2.build_sam import build_sam2 @@ -40,26 +36,11 @@ # from segment_anything import sam_model_registry, SamPredictor # from transformers import AutoProcessor, OwlViTForObjectDetection -from ultralytics import SAM, YOLO, YOLOWorld +from ultralytics import YOLOWorld from stretch.core import get_parameters -from stretch.dynav.communication_util import ( - load_socket, - recv_array, - recv_depth_img, - recv_everything, - recv_rgb_img, - send_array, - send_depth_img, - send_everything, - send_rgb_img, -) -from stretch.dynav.mapping_utils import ( - AStar, - SparseVoxelMap, - SparseVoxelMapNavigationSpace, - VoxelizedPointcloud, -) +from stretch.dynav.communication_util import load_socket, recv_everything +from stretch.dynav.mapping_utils import AStar, SparseVoxelMap, SparseVoxelMapNavigationSpace from stretch.dynav.scannet import CLASS_LABELS_200 from stretch.dynav.voxel_map_localizer import VoxelMapLocalizer @@ -259,6 +240,10 @@ def create_vision_model(self): ) def process_text(self, text, start_pose): + """ + Process the text query and return the trajectory for the robot to follow. + """ + if self.rerun: rr.log("/object", rr.Clear(recursive=True), static=self.static) rr.log("/robot_start_pose", rr.Clear(recursive=True), static=self.static) @@ -364,7 +349,7 @@ def process_text(self, text, start_pose): else: waypoints = None print("[FAILURE]", res.reason) - # If we are navigating to some object of interst, send (x, y, z) of + # If we are navigating to some object of interest, send (x, y, z) of # the object so that we can make sure the robot looks at the object after navigation traj = [] if waypoints is not None: @@ -502,7 +487,7 @@ def process_text(self, text, start_pose): # res = self.planner.plan(start_pose, point) # if res.success: # waypoints = [pt.state for pt in res.trajectory] - # # If we are navigating to some object of interst, send (x, y, z) of + # # If we are navigating to some object of interest, send (x, y, z) of # # the object so that we can make sure the robot looks at the object after navigation # print(waypoints) # # finished = (len(waypoints) <= 4 or torch.linalg.norm(torch.Tensor(point)[:2] - torch.Tensor(start_pose[:2])) > 0.8) and mode == 'navigation' @@ -1061,6 +1046,6 @@ def write_to_pickle(self): # imageProcessor.recv_text() # imageProcessor.read_from_pickle('env.pkl', -1) # imageProcessor.write_to_pickle() - -if __name__ == "__main__": - main(None) +# +# if __name__ == "__main__": +# main(None) From c7e7d713f445c987c7e8eff1648c87948c7dbf29 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 25 Sep 2024 15:31:37 -0400 Subject: [PATCH 232/410] add read dobbe format code --- .../utils/data_tools/read_dobbe_format.py | 375 ++++++++++++++++++ 1 file changed, 375 insertions(+) create mode 100644 src/stretch/utils/data_tools/read_dobbe_format.py diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py new file mode 100644 index 
000000000..e17ee122f --- /dev/null +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -0,0 +1,375 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Process raw dobb-e format for pushing to Hugging Face Hub. + +Written by Yi-Che Huang, source: + +https://github.com/hello-robot/lerobot/blob/stretch-act/lerobot/common/datasets/push_dataset_to_hub/dobbe_format.py +""" + +import json +import shutil +from pathlib import Path +from typing import Any, Dict, Optional + +import liblzfse +import numpy as np +import torch +import tqdm +from datasets import Dataset, Features, Image, Sequence, Value + +import stretch.utils.logger as logger + +try: + from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes + from lerobot.common.datasets.utils import hf_transform_to_torch + from lerobot.common.datasets.video_utils import VideoFrame + + lerobot_found = True +except ImportError: + logger.warning("lerobot not found. cannot export.") + lerobot_found = False + +from PIL import Image as PILImage +from scipy import ndimage + +# Set camera input sizes +IMAGE_SIZE = {"gripper": (240, 320), "head": (320, 240)} + +DEPTH_MEDIAN_FILTER_K = 11 + +ACTION_ORDER = [ + "base_x_joint", + "base_y_joint", + "base_theta_joint", + "joint_lift", + "joint_arm_l0", + "joint_wrist_roll", + "joint_wrist_pitch", + "joint_wrist_yaw", + "stretch_gripper", +] + +STATE_ORDER = [ + "base_x", + "base_y", + "base_theta", + "lift", + "arm", + "wrist_roll", + "wrist_pitch", + "wrist_yaw", + "gripper_finger_right", +] + +# ACTION_ORDER = [ +# "joint_mobile_base_translation", +# "joint_mobile_base_translate_by", +# "joint_mobile_base_rotate_by", +# "joint_lift", +# "joint_arm_l0", +# "joint_wrist_roll", +# "joint_wrist_pitch", +# "joint_wrist_yaw", +# "stretch_gripper", +# ] + +# STATE_ORDER = [ +# "base_x", +# "base_x_vel", +# "base_y", +# "base_y_vel", +# "base_theta", +# "base_theta_vel", +# "joint_lift", +# "joint_arm_l0", +# "joint_wrist_roll", +# "joint_wrist_pitch", +# "joint_wrist_yaw", +# "stretch_gripper", +# ] +def check_format(raw_dir): + + print("Image sizes set as: ", IMAGE_SIZE) + + episode_dirs = [path for path in Path(raw_dir).iterdir() if path.is_dir()] + assert len(episode_dirs) != 0 + + for episode_dir in episode_dirs: + + # States and actions json file + labels = episode_dir / "labels.json" + assert labels.exists() + + for camera in ["gripper", "head"]: + + # Check for image folders + compressed_imgs = episode_dir / f"compressed_{camera}_images" + if not compressed_imgs.exists(): + print( + f"Image folder {compressed_imgs} wasn't found. 
Only video mode will be supported" + ) + + # Video files + compressed_video = episode_dir / f"{camera}_compressed_video_h264.mp4" + assert compressed_video.exists() + + # Depth compressed binary files + depth_bin_path = episode_dir / f"compressed_np_{camera}_depth_float32.bin" + assert depth_bin_path.exists() + + +def unpack_depth(depth_bin, num_frames, size): + h, w = size + depths = liblzfse.decompress(depth_bin.read_bytes()) + depths = np.frombuffer(depths, dtype=np.float32).reshape((num_frames, h, w)) + return depths + + +def clip_and_normalize_depth(depths, median_filter_k=None): + # Clips depth to three different scales: 1mm, 10mm, 20mm + # depths: (num_frames, h, w) + if median_filter_k is not None: + depths = ndimage.median_filter(depths, axes=(1, 2), size=median_filter_k) + + depths_1_mm = np.uint8(np.clip(depths * 1000, 0.0, 255.0)) + depths_10_mm = np.uint8(np.clip(depths * 100, 0.0, 255.0)) + depths_20_mm = np.uint8(np.clip(depths * 50, 0.0, 255.0)) + + depths_stacked = np.stack([depths_1_mm, depths_10_mm, depths_20_mm], axis=-1) + + return depths_stacked + + +def load_from_raw( + raw_dir: str | Path, + out_dir: Optional[str | Path], + fps: Optional[int] = 15, + save_video: bool = False, + debug: bool = False, +): + episode_dirs = [path for path in Path(raw_dir).iterdir() if path.is_dir()] + + # Fix data type for out_dir + if out_dir is not None and isinstance(out_dir, str): + out_dir = Path(out_dir) + + # Fix data type for raw_dir + if isinstance(raw_dir, str): + raw_dir = Path(raw_dir) + + # Set default fps + if fps is None: + logger.warning("FPS not set. Defaulting to 15.") + fps = 15 + + ep_dicts = [] + ep_metadata = [] + episode_data_index: Dict[str, Any] = {"from": [], "to": []} + + id_from = 0 + for ep_idx, ep_path in tqdm.tqdm(enumerate(episode_dirs), total=len(episode_dirs)): + + # Dictionary for episode data + ep_dict = {} + num_frames = 0 + + # Parse observation state and action + labels = ep_path / "labels.json" + with open(labels, "r") as f: + labels_dict = json.load(f) + num_frames = len(labels_dict) + + progress_variable = np.linspace(0, 1, num_frames).tolist() + + actions = [ + ([data["actions"][x] for x in ACTION_ORDER] + [progress_variable[int(frame_idx)]]) + for frame_idx, data in labels_dict.items() + ] + + state = [ + [data["observations"][x] for x in STATE_ORDER] for _, data in labels_dict.items() + ] + + ep_dict["observation.state"] = torch.tensor(state) + ep_dict["action"] = torch.tensor(actions) + + # Parse observation images + for camera in ["gripper", "head"]: + img_key = f"observation.images.{camera}" + depth_key = f"observation.images.{camera}_depth" + + if save_video: + video_path = ep_path / f"{camera}_compressed_video_h264.mp4" + + fname = f"{camera}_episode_{ep_idx:06d}.mp4" + video_dir = out_dir / "videos" + video_dir.mkdir(parents=True, exist_ok=True) + shutil.copy(video_path, video_dir / fname) + + ep_dict[img_key] = [ + {"path": f"videos/{fname}", "timestamp": i / fps} for i in range(num_frames) + ] + else: + # Parse RGB images + compressed_imgs = ep_path / f"compressed_{camera}_images" + assert ( + compressed_imgs.exists() + ), f"Image folder {compressed_imgs} wasn't found. Only video mode is supported." 
+ + rgb_png = list(compressed_imgs.glob("*.png")) + rgb_png.sort() + + images = [] + for file in rgb_png: + with PILImage.open(file) as f: + images.append(f) + + ep_dict[img_key] = images + + # Depth compressed binary inputs + depth_bin_path = ep_path / f"compressed_np_{camera}_depth_float32.bin" + depths = unpack_depth(depth_bin_path, num_frames, IMAGE_SIZE[camera]) + + depths = clip_and_normalize_depth(depths, DEPTH_MEDIAN_FILTER_K) + + ep_dict[depth_key] = [PILImage.fromarray(x.astype(np.uint8), "RGB") for x in depths] + + # Append episode metadata + metadata = ep_path / "configs.json" + with open(metadata, "r") as f: + metadata_dict = json.load(f) + ep_metadata.append(metadata_dict) + + # last step of demonstration is considered done + done = torch.zeros(num_frames, dtype=torch.bool) + done[-1] = True + + ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames) + ep_dict["frame_index"] = torch.arange(0, num_frames, 1) + ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps + ep_dict["next.done"] = done + + assert isinstance(ep_idx, int) + ep_dicts.append(ep_dict) + + episode_data_index["from"].append(id_from) + episode_data_index["to"].append(id_from + num_frames) + + id_from += num_frames + + # process first episode only + if debug: + break + + data_dict = {} + data_dict = concatenate_episodes(ep_dicts) + + info: Dict[str, Any] = {} + info["episode_metadata"] = ep_metadata + info["action_order"] = ACTION_ORDER + info["state_order"] = STATE_ORDER + info["image_size"] = IMAGE_SIZE + info["depth_median_filter_k"] = DEPTH_MEDIAN_FILTER_K + info["num_episodes"] = len(episode_dirs) + + return data_dict, episode_data_index, info + + +def to_hf_dataset(data_dict, video=False) -> Dataset: + features = {} + + keys = [key for key in data_dict if "observation.images." in key] + for key in keys: + if video: + features[key] = VideoFrame() + else: + features[key] = Image() + + features["action"] = Sequence( + length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None) + ) + features["observation.state"] = Sequence( + length=data_dict["observation.state"].shape[1], + feature=Value(dtype="float32", id=None), + ) + features["episode_index"] = Value(dtype="int64", id=None) + features["frame_index"] = Value(dtype="int64", id=None) + features["timestamp"] = Value(dtype="float32", id=None) + features["next.done"] = Value(dtype="bool", id=None) + features["index"] = Value(dtype="int64", id=None) + + hf_dataset = Dataset.from_dict(data_dict, features=Features(features)) + hf_dataset.set_transform(hf_transform_to_torch) + return hf_dataset + + +def from_raw_to_lerobot_format( + raw_dir: Path | str, + out_dir: Optional[Path | str] = None, + fps: Optional[int] = None, + save_video: bool = False, + debug: bool = False, +): + """ + Convert raw dobb-e format to Hugging Face dataset format. + + Args: + raw_dir (Path | str): Path to raw dobb-e format. + out_dir (Optional[Path | str], optional): Output directory. Defaults to None. + fps (Optional[int], optional): Frames per second. Defaults to None. + save_video (bool, optional): Save video. Defaults to False. + debug (bool, optional): Debug mode. Defaults to False. 
+ """ + + if isinstance(raw_dir, str): + raw_dir = Path(raw_dir) + + if out_dir is not None and isinstance(out_dir, str): + out_dir = Path(out_dir) + + # sanity check + check_format(raw_dir) + + if fps is None: + fps = 15 + + data_dir, episode_data_index, info = load_from_raw(raw_dir, out_dir, fps, save_video, debug) + hf_dataset = to_hf_dataset(data_dir, save_video) + + info["fps"] = fps + info["video"] = save_video + + return hf_dataset, episode_data_index, info + + +if __name__ == "__main__": + # test_path = Path("data/pickup_pink_cup/default_user/default_env/2024-08-30--12-03-23/") + test_path = Path("data/pickup_pink_cup/default_user/default_env/") + + data_dir, episode_data_index, info = load_from_raw( + test_path, out_dir=None, fps=None, save_video=False, debug=False + ) + breakpoint() From 898648455bb1b91b48a43ee3f75d134882aba8f4 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 25 Sep 2024 15:34:01 -0400 Subject: [PATCH 233/410] parsing dataset --- src/stretch/utils/data_tools/read_dobbe_format.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py index e17ee122f..a00fb2289 100644 --- a/src/stretch/utils/data_tools/read_dobbe_format.py +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -169,6 +169,7 @@ def load_from_raw( fps: Optional[int] = 15, save_video: bool = False, debug: bool = False, + max_episodes: Optional[int] = None, ): episode_dirs = [path for path in Path(raw_dir).iterdir() if path.is_dir()] @@ -185,6 +186,12 @@ def load_from_raw( logger.warning("FPS not set. Defaulting to 15.") fps = 15 + if max_episodes is not None and (max_episodes < 0 or max_episodes > len(episode_dirs)): + logger.warning( + f"Invalid max_episodes: {max_episodes}. Had only {len(episode_dirs)} examples. Setting to None." 
+ ) + max_episodes = None + ep_dicts = [] ep_metadata = [] episode_data_index: Dict[str, Any] = {"from": [], "to": []} @@ -284,6 +291,9 @@ def load_from_raw( if debug: break + if ep_idx + 1 == max_episodes: + break + data_dict = {} data_dict = concatenate_episodes(ep_dicts) @@ -370,6 +380,6 @@ def from_raw_to_lerobot_format( test_path = Path("data/pickup_pink_cup/default_user/default_env/") data_dir, episode_data_index, info = load_from_raw( - test_path, out_dir=None, fps=None, save_video=False, debug=False + test_path, out_dir=None, fps=None, save_video=False, debug=False, max_episodes=1 ) breakpoint() From d8c84c5ff707a549ecb61a5cafba6af1e8bbd9c2 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 25 Sep 2024 15:39:35 -0400 Subject: [PATCH 234/410] update read --- .../utils/data_tools/read_dobbe_format.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py index a00fb2289..9d3dfc537 100644 --- a/src/stretch/utils/data_tools/read_dobbe_format.py +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -382,4 +382,28 @@ def from_raw_to_lerobot_format( data_dir, episode_data_index, info = load_from_raw( test_path, out_dir=None, fps=None, save_video=False, debug=False, max_episodes=1 ) - breakpoint() + + # Pull out the first episode from episode_data_index + from_idx = episode_data_index["from"][0] + to_idx = episode_data_index["to"][0] + + # Pull out the first image from data_dir + # This is a PIL image + pil_gripper_image = data_dir["observation.images.gripper"][0] + gripper_image = np.array(pil_gripper_image) + + # Pull out the head image from data_dir + pil_head_image = data_dir["observation.images.head"][0] + head_image = np.array(pil_head_image) + + import matplotlib.pyplot as plt + + plt.subplot(1, 2, 1) + plt.imshow(gripper_image) + plt.title("Gripper Image") + plt.axis("off") + plt.subplot(1, 2, 2) + plt.imshow(head_image) + plt.title("Head Image") + plt.axis("off") + plt.show() From 4f7fd74d5b0e3e7a50d5b410f074ed1550a3a77a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 25 Sep 2024 15:41:30 -0400 Subject: [PATCH 235/410] update information --- src/stretch/utils/data_tools/read_dobbe_format.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py index 9d3dfc537..10662b20a 100644 --- a/src/stretch/utils/data_tools/read_dobbe_format.py +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -389,12 +389,14 @@ def from_raw_to_lerobot_format( # Pull out the first image from data_dir # This is a PIL image - pil_gripper_image = data_dir["observation.images.gripper"][0] + pil_gripper_image = data_dir["observation.images.gripper"] gripper_image = np.array(pil_gripper_image) # Pull out the head image from data_dir pil_head_image = data_dir["observation.images.head"][0] - head_image = np.array(pil_head_image) + head_image = np.array(pil_head_image.data) + + breakpoint() import matplotlib.pyplot as plt From eed3e4c9e268db09f70ab496f657bd1ccad346d4 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 25 Sep 2024 15:41:54 -0400 Subject: [PATCH 236/410] update read format --- src/stretch/utils/data_tools/read_dobbe_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py index 10662b20a..a2e9c8cc8 100644 --- 
a/src/stretch/utils/data_tools/read_dobbe_format.py +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -394,7 +394,7 @@ def from_raw_to_lerobot_format( # Pull out the head image from data_dir pil_head_image = data_dir["observation.images.head"][0] - head_image = np.array(pil_head_image.data) + head_image = np.array(pil_head_image) breakpoint() From 9fe425469ee756916fd6bf8569cd91a26dac6cfe Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Wed, 25 Sep 2024 18:08:08 -0400 Subject: [PATCH 237/410] preparing for merge --- src/stretch/agent/zmq_client.py | 20 +-- src/stretch/dynav/llm_localizer.py | 5 +- src/stretch/dynav/llm_server.py | 10 +- src/stretch/dynav/mapping_utils/__init__.py | 10 -- src/stretch/dynav/mapping_utils/a_star.py | 28 ++- src/stretch/dynav/mapping_utils/voxel.py | 109 +++++------ src/stretch/dynav/mapping_utils/voxel_map.py | 41 ++--- .../dynav/mapping_utils/voxelized_pcd.py | 60 ++----- src/stretch/dynav/ok_robot_hw/README.md | 6 +- src/stretch/dynav/ok_robot_hw/camera.py | 8 +- .../dynav/ok_robot_hw/image_publisher.py | 3 - src/stretch/dynav/ok_robot_hw/robot.py | 13 +- .../dynav/ok_robot_hw/utils/grasper_utils.py | 7 +- src/stretch/dynav/robot_agent_manip_mdp.py | 108 +---------- src/stretch/dynav/run_manip.py | 2 +- src/stretch/dynav/run_ok_nav.py | 14 +- src/stretch/dynav/scannet.py | 2 +- src/stretch/dynav/voxel_map_localizer.py | 2 +- src/stretch/dynav/voxel_map_server.py | 170 ++---------------- src/stretch/motion/base/ik_solver_base.py | 7 +- src/stretch/motion/pinocchio_ik_solver.py | 2 +- src/stretch/requirements.txt | 2 +- src/stretch/visualization/rerun.py | 3 - 23 files changed, 163 insertions(+), 469 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index c03a99d03..7a6d4a5bd 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -13,12 +13,11 @@ import time import timeit from threading import Lock -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import click import numpy as np import zmq -from scipy.spatial.transform import Rotation as R from termcolor import colored import stretch.motion.constants as constants @@ -324,7 +323,6 @@ def solve_ik( self, pos: List[float], quat: Optional[List[float]] = None, - relative: bool = False, initial_cfg: np.ndarray = None, debug: bool = False, node_name=None, @@ -339,22 +337,12 @@ def solve_ik( pos_ee_curr, quat_ee_curr = self.get_ee_pose() if quat is None: - quat = [0, 0, 0, 1] if relative else quat_ee_curr + quat = quat_ee_curr # Compute IK goal: pose relative to base pose_desired = posquat2sophus(np.array(pos), np.array(quat)) - if relative: - pose_base2ee_curr = posquat2sophus(pos_ee_curr, quat_ee_curr) - - pos_desired = pos_ee_curr + pose_input.translation() - so3_desired = pose_input.so3() * pose_base2ee_curr.so3() - quat_desired = R.from_matrix(so3_desired.matrix()).as_quat() - - pose_base2ee_desired = posquat2sophus(pos_desired, quat_desired) - - else: - pose_base2ee_desired = pose_desired + pose_base2ee_desired = pose_desired pos_ik_goal, quat_ik_goal = sophus2posquat(pose_base2ee_desired) @@ -566,7 +554,7 @@ def arm_to( def navigate_to( self, - xyt: ContinuousNavigationAction, + xyt: Union[np.ndarray, ContinuousNavigationAction], relative=False, blocking=False, timeout: float = 10.0, diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py index d99168f5d..e92d891c8 100644 --- a/src/stretch/dynav/llm_localizer.py +++ 
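Since solve_ik no longer takes a relative flag, callers now pass an absolute end-effector pose in the base frame, and quat=None means "keep the current end-effector orientation". A minimal sketch, assuming an already-connected RobotClient (constructor arguments omitted) and an illustrative target position:

import numpy as np

from stretch.agent import RobotClient

robot = RobotClient()  # assumption: connection/constructor details omitted for brevity
target_pos = np.array([0.5, -0.2, 0.8])  # illustrative EE position in the base frame
full_body_cfg = robot.solve_ik(target_pos, quat=None)  # None keeps current EE orientation
if full_body_cfg is None:
    print("Warning: no IK solution for the desired EE pose")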
b/src/stretch/dynav/llm_localizer.py @@ -23,8 +23,7 @@ from torch import Tensor from transformers import AutoProcessor, Owlv2ForObjectDetection -from stretch.dynav.mapping_utils import VoxelizedPointcloud -from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates +from stretch.dynav.mapping_utils.voxelized_pcd import VoxelizedPointcloud genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) generation_config = genai.GenerationConfig(temperature=0) @@ -223,7 +222,7 @@ def owl_locater(self, A, encoded_image, timestamps_lst): res = self.compute_coord(A, image_info, threshold=0.2) if res is not None: debug_text = ( - "#### - Obejct is detected in observations where instance" + "#### - Object is detected in observations where instance" + str(i + 1) + " comes from. **😃** Directly navigate to it.\n" ) diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py index 2276fd891..28834518d 100644 --- a/src/stretch/dynav/llm_server.py +++ b/src/stretch/dynav/llm_server.py @@ -11,9 +11,6 @@ import os import pickle import threading - -# import wget -# import wget import time from io import BytesIO from pathlib import Path @@ -29,7 +26,9 @@ from stretch.core import get_parameters from stretch.dynav.communication_util import load_socket, recv_everything from stretch.dynav.llm_localizer import LLM_Localizer -from stretch.dynav.mapping_utils import AStar, SparseVoxelMap, SparseVoxelMapNavigationSpace +from stretch.dynav.mapping_utils.a_star import AStar +from stretch.dynav.mapping_utils.voxel import SparseVoxelMap +from stretch.dynav.mapping_utils.voxel_map import SparseVoxelMapNavigationSpace def get_inv_intrinsics(intrinsics): @@ -94,7 +93,7 @@ def get_xyz(depth, pose, intrinsics): return xyz -class LLMImageProcessor: +class ImageProcessor: def __init__( self, vision_method="pro_owl", @@ -389,7 +388,6 @@ def sample_frontier(self, start_pose=[0, 0, 0], text=None): def _recv_image(self): while True: - # data = recv_array(self.img_socket) rgb, depth, intrinsics, pose = recv_everything(self.img_socket) start_time = time.time() self.process_rgbd_images(rgb, depth, intrinsics, pose) diff --git a/src/stretch/dynav/mapping_utils/__init__.py b/src/stretch/dynav/mapping_utils/__init__.py index 6003d865a..2532abc5e 100644 --- a/src/stretch/dynav/mapping_utils/__init__.py +++ b/src/stretch/dynav/mapping_utils/__init__.py @@ -6,13 +6,3 @@ # # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. - -from .a_star import AStar -from .voxel import SparseVoxelMap -from .voxel_map import SparseVoxelMapNavigationSpace - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -from .voxelized_pcd import VoxelizedPointcloud, scatter3d diff --git a/src/stretch/dynav/mapping_utils/a_star.py b/src/stretch/dynav/mapping_utils/a_star.py index 727c3a303..95e4b1b44 100644 --- a/src/stretch/dynav/mapping_utils/a_star.py +++ b/src/stretch/dynav/mapping_utils/a_star.py @@ -7,16 +7,19 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
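With the re-exports removed from mapping_utils/__init__.py, the mapping classes are imported from their defining modules, matching the import lines used throughout these diffs:

from stretch.dynav.mapping_utils.a_star import AStar
from stretch.dynav.mapping_utils.voxel import SparseVoxelMap
from stretch.dynav.mapping_utils.voxel_map import SparseVoxelMapNavigationSpace
from stretch.dynav.mapping_utils.voxelized_pcd import VoxelizedPointcloud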
+ import heapq import math import time -from random import random -from typing import Callable, List, Optional, Set, Tuple +from typing import List, Set, Tuple import numpy as np -import torch -from stretch.dynav.mapping_utils import SparseVoxelMapNavigationSpace +from stretch.dynav.mapping_utils.voxel_map import SparseVoxelMapNavigationSpace from stretch.motion import PlanResult @@ -71,24 +74,19 @@ def reset(self): # self._navigable = ~obs self.start_time = time.time() - def verify_path(self, path) -> bool: - self.reset() - for i in range(max(10, len(path))): - if self.point_is_occupied(*self.to_pt(path[i][:2])): - return False - return True - def point_is_occupied(self, x: int, y: int) -> bool: return not bool(self._navigable[x][y]) def to_pt(self, xy: Tuple[float, float]) -> Tuple[int, int]: - xy = torch.tensor([xy[0], xy[1]]) - pt = self.space.voxel_map.xy_to_grid_coords(xy) + # # type: ignore to bypass mypy checking + xy = np.array([xy[0], xy[1]]) # type: ignore + pt = self.space.voxel_map.xy_to_grid_coords(xy) # type: ignore return tuple(pt.tolist()) def to_xy(self, pt: Tuple[int, int]) -> Tuple[float, float]: - pt = torch.tensor([pt[0], pt[1]]) - xy = self.space.voxel_map.grid_coords_to_xy(pt) + # # type: ignore to bypass mypy checking + pt = np.array([pt[0], pt[1]]) # type: ignore + xy = self.space.voxel_map.grid_coords_to_xy(pt) # type: ignore return tuple(xy.tolist()) def compute_dis(self, a: Tuple[int, int], b: Tuple[int, int]): diff --git a/src/stretch/dynav/mapping_utils/voxel.py b/src/stretch/dynav/mapping_utils/voxel.py index 625e3cc98..8f39f87a6 100644 --- a/src/stretch/dynav/mapping_utils/voxel.py +++ b/src/stretch/dynav/mapping_utils/voxel.py @@ -11,33 +11,25 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import copy + import logging import math -import os import pickle -import time from collections import namedtuple from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union -import cv2 import numpy as np import open3d as open3d import scipy import skimage import torch -import torch.nn.functional as F -import torchvision.transforms.functional as V -import trimesh -from scipy.ndimage import gaussian_filter, maximum_filter +from scipy.ndimage import maximum_filter from torch import Tensor -from torchvision import transforms from stretch.core.interfaces import Observations -from stretch.dynav.mapping_utils import VoxelizedPointcloud, scatter3d +from stretch.dynav.mapping_utils.voxelized_pcd import VoxelizedPointcloud, scatter3d from stretch.motion import Footprint, PlanResult, RobotModel -from stretch.utils.data_tools.dict import update from stretch.utils.morphology import binary_dilation, binary_erosion, get_edges from stretch.utils.point_cloud import create_visualization_geometries, numpy_to_pcd from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates @@ -136,9 +128,6 @@ def __init__( resolution(float): in meters, size of a voxel feature_dim(int): size of feature embeddings to capture per-voxel point """ - print( - "------------------------YOU ARE NOW RUNNING PEIQI VOXEL MAP CODES V3-----------------" - ) # TODO: We an use fastai.store_attr() to get rid of this boilerplate code self.resolution = resolution self.feature_dim = feature_dim @@ -219,7 +208,7 @@ def __init__( def reset(self) -> None: """Clear out the entire voxel map.""" - self.observations = [] + self.observations: List[Frame] = [] # Create voxelized pointcloud self.voxel_pcd = VoxelizedPointcloud( voxel_size=self.voxel_resolution, @@ -231,7 +220,6 @@ def reset(self) -> None: self._seq = 0 self._2d_last_updated = -1 - # Create map here - just reset *some* variables self.reset_cache() def reset_cache(self): @@ -244,32 +232,33 @@ def reset_cache(self): # Store 2d map information # This is computed from our various point clouds self._map2d = None - - def add_obs( - self, - obs: Observations, - camera_K: Optional[torch.Tensor] = None, - *args, - **kwargs, - ): - """Unpack an observation and convert it properly, then add to memory. Pass all other inputs into the add() function as provided.""" - rgb = self.fix_data_type(obs.rgb) - depth = self.fix_data_type(obs.depth) - xyz = self.fix_data_type(obs.xyz) - camera_pose = self.fix_data_type(obs.camera_pose) - base_pose = torch.from_numpy(np.array([obs.gps[0], obs.gps[1], obs.compass[0]])).float() - K = self.fix_data_type(obs.camera_K) if camera_K is None else camera_K - - self.add( - camera_pose=camera_pose, - xyz=xyz, - rgb=rgb, - depth=depth, - base_pose=base_pose, - camera_K=K, - *args, - **kwargs, - ) + self._history_soft = None + + # def add_obs( + # self, + # obs: Observations, + # camera_K: Optional[torch.Tensor] = None, + # *args, + # **kwargs, + # ): + # """Unpack an observation and convert it properly, then add to memory. 
Pass all other inputs into the add() function as provided.""" + # rgb = self.fix_data_type(obs.rgb) + # depth = self.fix_data_type(obs.depth) + # xyz = self.fix_data_type(obs.xyz) + # camera_pose = self.fix_data_type(obs.camera_pose) + # base_pose = torch.from_numpy(np.array([obs.gps[0], obs.gps[1], obs.compass[0]])).float() + # K = self.fix_data_type(obs.camera_K) if camera_K is None else camera_K + + # self.add( + # camera_pose=camera_pose, + # xyz=xyz, + # rgb=rgb, + # depth=depth, + # base_pose=base_pose, + # camera_K=K, + # *args, + # **kwargs, + # ) def add( self, @@ -358,7 +347,6 @@ def add( full_world_xyz = xyz else: raise NotImplementedError(f"Unknown xyz_frame {xyz_frame}") - # trimesh.transform_points(xyz, camera_pose) else: full_world_xyz = unproject_masked_depth_to_xyz_coordinates( # Batchable! depth=depth.unsqueeze(0).unsqueeze(1), @@ -439,7 +427,7 @@ def _update_visited(self, base_pose: Tensor): def write_to_pickle(self, filename: str): """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" - data = {} + data: dict[str, Any] = {} data["camera_poses"] = [] data["camera_K"] = [] data["base_poses"] = [] @@ -474,7 +462,7 @@ def write_to_pickle(self, filename: str): def write_to_pickle_add_data(self, filename: str, newdata: dict): """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" - data = {} + data: dict[str, Any] = {} data["camera_poses"] = [] data["base_poses"] = [] data["xyz"] = [] @@ -519,7 +507,7 @@ def fix_data_type(self, tensor) -> torch.Tensor: else: raise NotImplementedError("unsupported data type for tensor:", tensor) - def read_from_pickle(self, filename: str, num_frames: int = -1): + def read_from_pickle(self, filename: Union[Path, str], num_frames: int = -1): """Read from a pickle file as above. Will clear all currently stored data first.""" self.reset_cache() if isinstance(filename, str): @@ -614,7 +602,7 @@ def get_2d_alignment_heuristics(self, voxel_map_localizer, text, debug: bool = F def get_2d_map( self, debug: bool = False, return_history_id: bool = False - ) -> Tuple[np.ndarray, ...]: + ) -> Tuple[Tensor, ...]: """Get 2d map with explored area and frontiers.""" # Is this already cached? 
If so we don't need to go to all this work @@ -747,21 +735,20 @@ def get_2d_map( else: return obstacles, explored, history_soft - def xy_to_grid_coords(self, xy: torch.Tensor) -> Optional[np.ndarray]: + def xy_to_grid_coords(self, xy: np.ndarray) -> Optional[np.ndarray]: """convert xy point to grid coords""" - if not isinstance(xy, torch.Tensor): - xy = torch.Tensor(xy) - assert xy.shape[-1] == 2, "coords must be Nx2 or 2d array" - # Handle convertion + # Handle conversion + # # type: ignore comments used to bypass mypy check if isinstance(xy, np.ndarray): - xy = torch.from_numpy(xy).float() + xy = torch.from_numpy(xy).float() # type: ignore + assert xy.shape[-1] == 2, "coords must be Nx2 or 2d array" grid_xy = (xy / self.grid_resolution) + self.grid_origin[:2] if torch.any(grid_xy >= self._grid_size_t) or torch.any(grid_xy < torch.zeros(2)): return None else: return grid_xy.int() - def plan_to_grid_coords(self, plan_result: PlanResult) -> Optional[List[torch.Tensor]]: + def plan_to_grid_coords(self, plan_result: PlanResult) -> Optional[List]: """Convert a plan properly into grid coordinates""" if not plan_result.success: return None @@ -771,14 +758,16 @@ def plan_to_grid_coords(self, plan_result: PlanResult) -> Optional[List[torch.Te traj.append(self.xy_to_grid_coords(node.state[:2])) return traj - def grid_coords_to_xy(self, grid_coords: torch.Tensor) -> np.ndarray: + def grid_coords_to_xy(self, grid_coords: np.ndarray) -> np.ndarray: """convert grid coordinate point to metric world xy point""" assert grid_coords.shape[-1] == 2, "grid coords must be an Nx2 or 2d array" + if isinstance(grid_coords, np.ndarray): + grid_coords = torch.from_numpy(grid_coords) # type: ignore return (grid_coords - self.grid_origin[:2]) * self.grid_resolution def grid_coords_to_xyt(self, grid_coords: np.ndarray) -> np.ndarray: """convert grid coordinate point to metric world xyt point""" - res = torch.ones(3) + res = np.ones(3) res[:2] = self.grid_coords_to_xy(grid_coords) return res @@ -787,7 +776,7 @@ def show(self, backend: str = "open3d", **backend_kwargs) -> Tuple[np.ndarray, n if backend == "open3d": return self._show_open3d(**backend_kwargs) else: - raise NotImplementedError(f"Uknown backend {backend}, must be 'open3d' or 'pytorch3d") + raise NotImplementedError(f"Unknown backend {backend}, must be 'open3d' or 'pytorch3d") def get_xyz_rgb(self) -> Tuple[torch.Tensor, torch.Tensor]: """Return xyz and rgb of the current map""" @@ -799,7 +788,7 @@ def sample_from_mask(self, mask: torch.Tensor) -> Optional[np.ndarray]: valid_indices = torch.nonzero(mask, as_tuple=False) if valid_indices.size(0) > 0: random_index = torch.randint(valid_indices.size(0), (1,)) - return self.grid_coords_to_xy(valid_indices[random_index]) + return self.grid_coords_to_xy(valid_indices[random_index].numpy()) else: return None @@ -832,7 +821,7 @@ def xyt_is_safe(self, xyt: np.ndarray, robot: Optional[RobotModel] = None) -> bo def _get_boxes_from_points( self, - traversible: torch.Tensor, + traversible: Union[Tensor, np.ndarray], color: List[float], is_map: bool = True, height: float = 0.0, diff --git a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py index 9b0bb8316..46a609568 100644 --- a/src/stretch/dynav/mapping_utils/voxel_map.py +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -12,7 +12,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
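The grid/world conversion helpers now accept plain numpy input. A minimal round-trip sketch, assuming voxel_map is a SparseVoxelMap that has already ingested observations (construction omitted):

import numpy as np

from stretch.dynav.mapping_utils.voxel import SparseVoxelMap

# voxel_map: SparseVoxelMap, assumed to be built elsewhere
xy = np.array([1.25, -0.40])            # metric (x, y) in the map frame
grid = voxel_map.xy_to_grid_coords(xy)  # integer grid cell, or None if off the map
if grid is not None:
    xy_back = voxel_map.grid_coords_to_xy(grid.numpy())  # back to metric coordinates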
import math -from typing import Optional, Tuple +from typing import List, Optional, Tuple, Union import matplotlib.pyplot as plt import numpy as np @@ -21,7 +21,7 @@ import skimage.morphology import torch -from stretch.dynav.mapping_utils import SparseVoxelMap +from stretch.dynav.mapping_utils.voxel import SparseVoxelMap from stretch.motion import XYT, Footprint from stretch.utils.geometry import angle_difference, interpolate_angles from stretch.utils.morphology import ( @@ -50,9 +50,6 @@ def __init__( dilate_obstacle_size: int = 2, extend_mode: str = "separate", ): - print( - "------------------------YOU ARE NOW RUNNING PEIQI VOXEL NAVIGATION SPACE CODES-----------------" - ) self.step_size = step_size self.rotation_step_size = rotation_step_size self.voxel_map = voxel_map @@ -66,7 +63,8 @@ def __init__( else: self.dof = 2 - self._kernels = {} + # # type: ignore comments used to bypass mypy check + self._kernels = {} # type: ignore if dilate_frontier_size > 0: self.dilate_explored_kernel = torch.nn.Parameter( @@ -112,7 +110,7 @@ def create_collision_masks(self, orientation_resolution: int, show_all: bool = F self._orientation_resolution = 64 self._oriented_masks = [] - # NOTE: this is just debug code - lets you see waht the masks look like + # NOTE: this is just debug code - lets you see what the masks look like assert not show_all or orientation_resolution == 64 for i in range(orientation_resolution): @@ -134,10 +132,10 @@ def distance(self, q0: np.ndarray, q1: np.ndarray) -> float: assert len(q1) == 3 or len(q1) == 2, "2 or 3 dimensions for goal" if len(q1) == 3: # Measure to the final position exactly - return np.linalg.norm(q0 - q1) + return np.linalg.norm(q0 - q1).item() else: # Measure only to the final goal x/y position - return np.linalg.norm(q0[:2] - q1[:2]) + return np.linalg.norm(q0[:2] - q1[:2]).item() def extend(self, q0: np.ndarray, q1: np.ndarray) -> np.ndarray: """extend towards another configuration in this space. Will be either separate or joint depending on if the robot can "strafe": @@ -153,7 +151,7 @@ def extend(self, q0: np.ndarray, q1: np.ndarray) -> np.ndarray: else: raise NotImplementedError(f"not supported: {self.extend_mode=}") - def _extend_separate(self, q0: np.ndarray, q1: np.ndarray, xy_tol: float = 1e-8) -> np.ndarray: + def _extend_separate(self, q0: np.ndarray, q1: np.ndarray, xy_tol: float = 1e-8): """extend towards another configuration in this space. TODO: we can set the classes here, right now assuming still np.ndarray""" assert len(q0) == 3, f"initial configuration must be 3d, was {q0}" @@ -235,15 +233,16 @@ def get_oriented_mask(self, theta: float) -> torch.Tensor: def is_valid( self, - state: torch.Tensor, + state: Union[np.ndarray, torch.Tensor, List], is_safe_threshold=1.0, debug: bool = False, verbose: bool = False, ) -> bool: """Check to see if state is valid; i.e. if there's any collisions if mask is at right place""" assert len(state) == 3 - if isinstance(state, np.ndarray): - state = torch.from_numpy(state).float() + if isinstance(state, torch.Tensor): + state = state.float().numpy() + state = np.array(state) ok = bool(self.voxel_map.xyt_is_safe(state[:2])) # if verbose: # print('is navigable:', ok) @@ -321,7 +320,7 @@ def compute_theta(self, cur_x, cur_y, end_x, end_y): def sample_target_point( self, start: torch.Tensor, point: torch.Tensor, planner, exploration: bool = False - ) -> Optional[np.array]: + ) -> Optional[np.ndarray]: """Sample a position near the mask and return. 
Args: @@ -337,8 +336,9 @@ def sample_target_point( print("No target point find, maybe no point is reachable") return None reachable_xs, reachable_ys = zip(*reachable_points) - reachable_xs = torch.tensor(reachable_xs) - reachable_ys = torch.tensor(reachable_ys) + # # type: ignore comments used to bypass mypy check + reachable_xs = torch.tensor(reachable_xs) # type: ignore + reachable_ys = torch.tensor(reachable_ys) # type: ignore reachable = torch.empty(obstacles.shape, dtype=torch.bool).fill_(False) reachable[reachable_xs, reachable_ys] = True @@ -413,7 +413,7 @@ def sample_near_mask( verbose: bool = False, debug: bool = False, look_at_any_point: bool = False, - ) -> Optional[np.ndarray]: + ): """Sample a position near the mask and return. Args: @@ -463,7 +463,7 @@ def sample_near_mask( outside_point = find_closest_point_on_mask(mask, point_grid_coords.float()) # convert back - point = self.voxel_map.grid_coords_to_xy(point_grid_coords) + point = self.voxel_map.grid_coords_to_xy(point_grid_coords.numpy()) if point is None: print("[VOXEL MAP: sampling] ERR:", point, point_grid_coords) continue @@ -486,7 +486,8 @@ def sample_near_mask( theta += 2 * np.pi xyt = torch.zeros(3) - xyt[:2] = point + # # type: ignore to bypass mypy check + xyt[:2] = point # type: ignore xyt[2] = theta # Check to see if this point is valid @@ -703,7 +704,7 @@ def sample_closest_frontier( verbose: bool = False, step_dist: float = 0.1, min_dist: float = 0.1, - ) -> Optional[torch.Tensor]: + ): """Sample a valid location on the current frontier using FMM planner to compute geodesic distance. Returns points in order until it finds one that's valid. Args: diff --git a/src/stretch/dynav/mapping_utils/voxelized_pcd.py b/src/stretch/dynav/mapping_utils/voxelized_pcd.py index e3cb35a67..e2898434b 100644 --- a/src/stretch/dynav/mapping_utils/voxelized_pcd.py +++ b/src/stretch/dynav/mapping_utils/voxelized_pcd.py @@ -7,41 +7,13 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. -""" - This file implements VoxelizedPointcloud module in home-robot project (https://github.com/facebookresearch/home-robot). - Adapted to be used in ok-robot's navigation voxel map: - https://github.com/facebookresearch/home-robot/blob/main/src/home_robot/home_robot/utils/voxel.py - - License: - - MIT License - - Copyright (c) Meta Platforms, Inc. and affiliates. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - -""" +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from typing import List, Optional, Tuple, Union -import cv2 -import numpy as np import torch from torch import Tensor @@ -266,13 +238,13 @@ def add( weights = torch.ones_like(points[..., 0]) if obs_count is None: - obs_count = torch.ones_like(weights) * self.obs_count + obs_counts = torch.ones_like(weights) * self.obs_count else: - obs_count = torch.ones_like(weights) * obs_count + obs_counts = torch.ones_like(weights) * obs_count if entity_id is None: - entity_id = torch.ones_like(weights) * self.obs_count + entity_ids = torch.ones_like(weights) * self.obs_count else: - obs_count = torch.ones_like(weights) * entity_id + entity_ids = torch.ones_like(weights) * entity_id self.obs_count += 1 # Update voxel grid bounds @@ -309,8 +281,8 @@ def add( weights, rgb, ) - all_obs_counts = obs_count - all_entity_ids = entity_id + all_obs_counts = obs_counts + all_entity_ids = entity_ids else: assert (self._features is None) == (features is None) all_points = torch.cat([self._points, points], dim=0) @@ -319,8 +291,8 @@ def add( torch.cat([self._features, features], dim=0) if (features is not None) else None ) all_rgb = torch.cat([self._rgb, rgb], dim=0) if (rgb is not None) else None - all_obs_counts = torch.cat([self._obs_counts, obs_count], dim=0) - all_entity_ids = torch.cat([self._entity_ids, entity_id], dim=0) + all_obs_counts = torch.cat([self._obs_counts, obs_counts], dim=0) + all_entity_ids = torch.cat([self._entity_ids, entity_ids], dim=0) # Future optimization: # If there are no new voxels, then we could save a bit of compute time # by only recomputing the voxel/cluster for the new points @@ -349,7 +321,7 @@ def add( self._obs_counts, self._entity_ids = self._obs_counts.int(), self._entity_ids.int() return - def get_idxs(self, points: Tensor) -> Tensor: + def get_idxs(self, points: Tensor) -> Tuple[Tensor, ...]: """Returns voxel index (long tensor) for each point in points Args: @@ -396,7 +368,7 @@ def get_consecutive_cluster_idx(self, points: Tensor) -> Tensor: ) = self.get_idxs(points) return cluster_consecutive_idx - def get_pointcloud(self) -> Tuple[Tensor]: + def get_pointcloud(self) -> Tuple[Tensor, ...]: """Returns pointcloud (1 point per occupied voxel) Returns: @@ -463,7 +435,7 @@ def voxelize( batch: Optional[Tensor] = None, start: Optional[Union[float, Tensor]] = None, end: Optional[Union[float, Tensor]] = None, -) -> Tuple[Tensor]: +) -> Tuple[Tensor, ...]: """Returns voxel indices and packed (consecutive) indices for points Args: @@ -520,7 +492,7 @@ def reduce_pointcloud( obs_counts: Optional[Tensor] = None, entity_ids: Optional[Tensor] = None, feature_reduce: str = "mean", -) -> Tuple[Tensor]: +) -> Tuple[Tensor, ...]: """Pools values within each voxel Args: diff --git a/src/stretch/dynav/ok_robot_hw/README.md b/src/stretch/dynav/ok_robot_hw/README.md index 30234778e..ab93a5c63 100644 --- a/src/stretch/dynav/ok_robot_hw/README.md +++ b/src/stretch/dynav/ok_robot_hw/README.md @@ -1,6 +1,6 @@ # Robot Side Code -Most of the heavy code will be running in the workstation and will communicate with the robot through sockets. 
+Most of the heavy code will be running in the workstation and will communicate with the robot through sockets ## Preparation to run robot side codes @@ -10,7 +10,7 @@ You need to install the home-robot on stretch robot following instructions provi To check whether home-robot is installed properly and got familiar with running home-robot based codes, we recommend you try to run [these test scripts](https://github.com/facebookresearch/home-robot/blob/main/tests/hw_manual_test.py). -Once you finished installing Home Robot, follow [these steps](../docs/robot-installation.md) to enable OK-Robot use home-robot controllers. +Once you finished installing home-robot, follow [these steps](../docs/robot-installation.md) to enable OK-Robot use home-robot controllers. The success of OK-Robot system also depends on robot calibration and accurate urdf file, so make sure you follow [these calibration steps](../docs/robot-calibration.md) to obtain an accurate urdf file for your robot. @@ -27,8 +27,6 @@ roslaunch home_robot_hw startup_stretch_hector_slam.launch python run.py -x1 [x1] -y1 [y1] -x2 [x2] -y2 [y2] -ip [your workstation ip] ``` -\<\<\<\<\<\<\< HEAD - - **\[x1, y1\]** - Coordinated of tape on which the base of the robot is on - **\[x2, y2\]** - Coordinates of the secondary tape. - **ip** - Your workstation ip, the robot will try to communicate with this ip diff --git a/src/stretch/dynav/ok_robot_hw/camera.py b/src/stretch/dynav/ok_robot_hw/camera.py index 947cf9a57..d6fa8df07 100644 --- a/src/stretch/dynav/ok_robot_hw/camera.py +++ b/src/stretch/dynav/ok_robot_hw/camera.py @@ -17,7 +17,6 @@ import cv2 import matplotlib.pyplot as plt import numpy as np -import open3d as o3d class RealSenseCamera: @@ -33,7 +32,7 @@ def __init__(self, robot): self.cx = intrinsics[1, 2] print(self.fx, self.fy, self.cx, self.cy) - # selected ix and iy co-ordinates + # selected ix and iy coordinates self.ix, self.iy = None, None def capture_image(self, visualize=False): @@ -67,8 +66,3 @@ def capture_image(self, visualize=False): def pixel2d_to_point3d(self, ix, iy): return self.points[iy, ix][[1, 0, 2]] - - -if __name__ == "__main__": - camera = RealSenseCamera() - camera.capture_image() diff --git a/src/stretch/dynav/ok_robot_hw/image_publisher.py b/src/stretch/dynav/ok_robot_hw/image_publisher.py index 6afc54a47..c373e4d45 100644 --- a/src/stretch/dynav/ok_robot_hw/image_publisher.py +++ b/src/stretch/dynav/ok_robot_hw/image_publisher.py @@ -8,13 +8,10 @@ # license information maybe found below, if so. import numpy as np -import zmq from PIL import Image as PILImage from stretch.dynav.ok_robot_hw.utils.communication_utils import ( recv_array, - recv_depth_img, - recv_rgb_img, send_array, send_depth_img, send_rgb_img, diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py index dd0793e7d..3ee7dcebd 100644 --- a/src/stretch/dynav/ok_robot_hw/robot.py +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -8,7 +8,6 @@ # license information maybe found below, if so. 
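For orientation, the camera wrapper above is typically driven roughly as follows; the client construction and the pixel indices are placeholders, and the returned point uses the axis reordering done inside pixel2d_to_point3d:

from stretch.agent import RobotClient
from stretch.dynav.ok_robot_hw.camera import RealSenseCamera

robot = RobotClient()                    # assumption: constructor details omitted
camera = RealSenseCamera(robot)          # pulls camera intrinsics from the client
camera.capture_image(visualize=False)    # grabs RGB/depth and caches back-projected points
point3d = camera.pixel2d_to_point3d(ix=320, iy=240)  # 3D point for that pixel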
import os -from typing import Any, Dict import numpy as np import pinocchio as pin @@ -20,7 +19,7 @@ from stretch.dynav.ok_robot_hw.utils import transform_joint_array from stretch.motion.kinematics import HelloStretchIdx -OVERRIDE_STATES: Dict[str, Any] = {} +OVERRIDE_STATES: dict[str, float] = {} class HelloRobot: @@ -235,6 +234,14 @@ def move_to_joints(self, joints, gripper, mode=0): self.robot.arm_to(target_state, blocking=True, head=np.array([self.pan, self.tilt])) self.robot.head_to(head_tilt=self.tilt, head_pan=self.pan, blocking=True) + self.robot.arm_to(target_state, blocking=True, head=np.array([self.pan, self.tilt])) + self.robot.head_to(head_tilt=self.tilt, head_pan=self.pan, blocking=True) + + # print('pan tilt after', self.robot.get_pan_tilt()) + # print(f"current state {self.robot.get_six_joints()}") + # print(f"target state {target_state}") + # time.sleep(1) + # NOTE: below code is to fix the pitch drift issue in current hello-robot. Remove it if there is no pitch drift issue OVERRIDE_STATES["wrist_pitch"] = joints["joint_wrist_pitch"] @@ -279,7 +286,7 @@ def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode # print(f"final pos and quat {final_pos}\n {final_quat}") full_body_cfg = self.robot.solve_ik( - final_pos, final_quat, False, None, False, node_name=self.end_link + final_pos, final_quat, None, False, node_name=self.end_link ) if full_body_cfg is None: print("Warning: Cannot find an IK solution for desired EE pose!") diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py index 3f1ceae4c..2b0c7e279 100644 --- a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -7,7 +7,6 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. -import math import time import numpy as np @@ -47,7 +46,7 @@ def capture_and_process_image(camera, mode, obj, socket, hello_robot): time.sleep(1) elif retry_flag != 0 and side_retries == 3: - print("Tried in all angles but couldn't succed") + print("Tried in all angles but couldn't succeed") time.sleep(1) return None, None, None, None @@ -175,7 +174,7 @@ def pickup( # Rotation for aligning Robot gripper frame to Model gripper frame rotation2_top_mat = np.array([[0, 0, 1], [1, 0, 0], [0, -1, 0]]) - # final Rotation of gripper to hold the objet + # final Rotation of gripper to hold the objcet pin_final_rotation = np.dot(pin_transformed_frame.rotation, rotation2_top_mat) print(f"pin final rotation {pin_final_rotation}") @@ -242,7 +241,7 @@ def pickup( # Lifts the arm robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1] + 0.2, 1.1)) - # Tucks the gripper so that while moving to place it wont collide with any obstacles + # Tucks the gripper so that while moving to place it won't collide with any obstacles robot.move_to_position(arm_pos=0.01) robot.move_to_position(wrist_pitch=0.0) robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1], 0.9), wrist_yaw=2.5) diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 7f0148c47..96ed5cfc3 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -7,34 +7,25 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. -# Copyright (c) Meta Platforms, Inc. 
and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. import datetime import time -from typing import Any, Dict, List +from typing import Any, Dict -import cv2 import numpy as np import zmq from stretch.agent import RobotClient -from stretch.core.interfaces import Observations from stretch.core.parameters import Parameters +from stretch.dynav.communication_util import recv_array, send_array, send_everything from stretch.dynav.ok_robot_hw.camera import RealSenseCamera from stretch.dynav.ok_robot_hw.global_parameters import * from stretch.dynav.ok_robot_hw.robot import HelloRobot as Manipulation_Wrapper - -# from stretch.dynav.ok_robot_hw.utils.communication_utils import recv_array, send_array from stretch.dynav.ok_robot_hw.utils.grasper_utils import ( capture_and_process_image, move_to_point, pickup, ) -# from stretch.dynav.llm_server import ImageProcessor - class RobotAgentMDP: """Basic demo code. Collects everything that we need to make this work.""" @@ -79,40 +70,34 @@ def __init__( self.pos_err_threshold = 0.35 self.rot_err_threshold = 0.4 self.obs_count = 0 - self.obs_history: List[Observations] = [] self.guarantee_instance_is_reachable = parameters.guarantee_instance_is_reachable self.image_sender = ImageSender( ip=ip, image_port=image_port, text_port=text_port, manip_port=manip_port ) if method == "dynamem": - from stretch.dynav.voxel_map_server import ImageProcessor + from stretch.dynav.voxel_map_server import ImageProcessor as VoxelMapImageProcessor - self.image_processor = ImageProcessor( + self.image_processor = VoxelMapImageProcessor( rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) ) # type: ignore elif method == "mllm": - from stretch.dynav.llm_server import LLMImageProcessor + from stretch.dynav.llm_server import ImageProcessor as mLLMImageProcessor - self.image_processor = LLMImageProcessor( + self.image_processor = mLLMImageProcessor( rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) ) # type: ignore - self.look_around_times: List[float] = [] - self.execute_times: List[float] = [] + self.look_around_times: list[float] = [] + self.execute_times: list[float] = [] timestamp = f"{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}" - # self.head_lock = threading.Lock() - def look_around(self): print("*" * 10, "Look around to check", "*" * 10) for pan in [0.4, -0.4, -1.2]: for tilt in [-0.6]: - start_time = time.time() self.robot.head_to(pan, tilt, blocking=True) - end_time = time.time() - print("moving head takes ", end_time - start_time, "seconds.") self.update() def rotate_in_place(self): @@ -128,7 +113,6 @@ def update(self): """Step the data collector. Get a single observation of the world. Remove bad points, such as those from too far or too near the camera. 
Update the 3d world representation.""" obs = self.robot.get_observation() # self.image_sender.send_images(obs) - self.obs_history.append(obs) self.obs_count += 1 rgb, depth, K, camera_pose = obs.rgb, obs.depth, obs.camera_K, obs.camera_pose # start_time = time.time() @@ -157,7 +141,7 @@ def execute_action( look_around_finish = time.time() look_around_take = look_around_finish - start_time print("Path planning takes ", look_around_take, " seconds.") - self.look_around_times.append(look_around_take) + # self.look_around_times.append(look_around_take) # print(self.look_around_times) # print(sum(self.look_around_times) / len(self.look_around_times)) @@ -346,80 +330,6 @@ def save(self): self.image_processor.write_to_pickle() -def send_array(socket, A, flags=0, copy=True, track=False): - """send a numpy array with metadata""" - A = np.array(A) - md = dict( - dtype=str(A.dtype), - shape=A.shape, - ) - socket.send_json(md, flags | zmq.SNDMORE) - return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) - - -def recv_array(socket, flags=0, copy=True, track=False): - """recv a numpy array""" - md = socket.recv_json(flags=flags) - msg = socket.recv(flags=flags, copy=copy, track=track) - A = np.frombuffer(msg, dtype=md["dtype"]) - return A.reshape(md["shape"]) - - -def send_rgb_img(socket, img): - img = img.astype(np.uint8) - encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] - _, img_encoded = cv2.imencode(".jpg", img, encode_param) - socket.send(img_encoded.tobytes()) - - -def recv_rgb_img(socket): - img = socket.recv() - img = np.frombuffer(img, dtype=np.uint8) - img = cv2.imdecode(img, cv2.IMREAD_COLOR) - return img - - -def send_depth_img(socket, depth_img): - depth_img = (depth_img * 1000).astype(np.uint16) - encode_param = [ - int(cv2.IMWRITE_PNG_COMPRESSION), - 3, - ] # Compression level from 0 (no compression) to 9 (max compression) - _, depth_img_encoded = cv2.imencode(".png", depth_img, encode_param) - socket.send(depth_img_encoded.tobytes()) - - -def recv_depth_img(socket): - depth_img = socket.recv() - depth_img = np.frombuffer(depth_img, dtype=np.uint8) - depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) - depth_img = depth_img / 1000.0 - return depth_img - - -def send_everything(socket, rgb, depth, intrinsics, pose): - send_rgb_img(socket, rgb) - socket.recv_string() - send_depth_img(socket, depth) - socket.recv_string() - send_array(socket, intrinsics) - socket.recv_string() - send_array(socket, pose) - socket.recv_string() - - -def recv_everything(socket): - rgb = recv_rgb_img(socket) - socket.send_string("") - depth = recv_depth_img(socket) - socket.send_string("") - intrinsics = recv_array(socket) - socket.send_string("") - pose = recv_array(socket) - socket.send_string("") - return rgb, depth, intrinsics, pose - - class ImageSender: def __init__( self, diff --git a/src/stretch/dynav/run_manip.py b/src/stretch/dynav/run_manip.py index 35ae069de..5bd3e6202 100644 --- a/src/stretch/dynav/run_manip.py +++ b/src/stretch/dynav/run_manip.py @@ -18,7 +18,7 @@ from stretch.agent import RobotClient # Mapping and perception -from stretch.core.parameters import Parameters, get_parameters +from stretch.core.parameters import get_parameters from stretch.dynav import RobotAgentMDP diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 756c3f9ba..28260ce9b 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -7,28 +7,16 @@ # Some code may be adapted from other open-source works with their respective licenses. 
Original # license information maybe found below, if so. -import threading -import time - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. import click import cv2 -import matplotlib.pyplot as plt import numpy as np -from PIL import Image from stretch.agent import RobotClient # Mapping and perception -from stretch.core.parameters import Parameters, get_parameters +from stretch.core.parameters import get_parameters from stretch.dynav import RobotAgentMDP -# Chat and UI tools -from stretch.utils.point_cloud import numpy_to_pcd, show_point_cloud - def compute_tilt(camera_xyz, target_xyz): """ diff --git a/src/stretch/dynav/scannet.py b/src/stretch/dynav/scannet.py index 6d16e1a9f..c8009658d 100644 --- a/src/stretch/dynav/scannet.py +++ b/src/stretch/dynav/scannet.py @@ -95,7 +95,7 @@ "tray", "bowl", "speaker", - "crate", + "crates", "projector", "book", "school bag", diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index fcc8590fe..69bef8bf2 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -19,7 +19,7 @@ # from ultralytics import YOLOWorld from transformers import AutoModel, AutoProcessor, Owlv2ForObjectDetection -from stretch.dynav.mapping_utils import VoxelizedPointcloud +from stretch.dynav.mapping_utils.voxelized_pcd import VoxelizedPointcloud def get_inv_intrinsics(intrinsics): diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index f1d85da55..433e4b899 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -40,7 +40,9 @@ from stretch.core import get_parameters from stretch.dynav.communication_util import load_socket, recv_everything -from stretch.dynav.mapping_utils import AStar, SparseVoxelMap, SparseVoxelMapNavigationSpace +from stretch.dynav.mapping_utils.a_star import AStar +from stretch.dynav.mapping_utils.voxel import SparseVoxelMap +from stretch.dynav.mapping_utils.voxel_map import SparseVoxelMapNavigationSpace from stretch.dynav.scannet import CLASS_LABELS_200 from stretch.dynav.voxel_map_localizer import VoxelMapLocalizer @@ -121,7 +123,7 @@ def __init__( rerun=True, static=True, log=None, - image_shape=(450, 350), + image_shape=(400, 300), ): self.static = static self.siglip = siglip @@ -268,9 +270,6 @@ def process_text(self, text, start_pose): if self.voxel_map_localizer.verify_point(text, traj_target_point): localized_point = traj_target_point debug_text += "## Last visual grounding results looks fine so directly use it.\n" - # if self.planner.verify_path(self.traj[:-2]): - # waypoints = self.traj[:-2] - # debug_text += '## Last path planning results looks fine so directly use it.\n' if waypoints is None: # Do visual grounding @@ -407,118 +406,6 @@ def process_text(self, text, start_pose): # self.write_to_pickle() return traj - # def process_text(self, text, start_pose): - # if self.rerun: - # # if not self.static: - # # rr.set_time_sequence("frame", self.obs_count) - # rr.log('/object', rr.Clear(recursive = True), static = self.static) - # rr.log('/robot_start_pose', rr.Clear(recursive = True), static = self.static) - # rr.log('/direction', rr.Clear(recursive = True), static = self.static) - # rr.log('robot_monologue', rr.Clear(recursive = True), static = self.static) - # rr.log('/Past_observation_most_similar_to_text', rr.Clear(recursive = True), static = 
self.static) - # if not self.static: - # rr.connect('100.108.67.79:9876') - - # debug_text = '' - # mode = 'navigation' - # obs = None - # # Do visual grounding - # if text is not None and text != '': - # with self.voxel_map_lock: - # localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A(text, debug = True, return_debug = True) - # if localized_point is not None: - # rr.log("/object", rr.Points3D([localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), radii=0.1), static = self.static) - # # Do Frontier based exploration - # if text is None or text == '' or localized_point is None: - # debug_text += '## Navigation fails, so robot starts exploring environments.\n' - # localized_point = self.sample_frontier(start_pose, text) - # mode = 'exploration' - # rr.log("/object", rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), static = self.static) - # print('\n', localized_point, '\n') - - # if localized_point is None: - # return [] - - # if len(localized_point) == 2: - # localized_point = np.array([localized_point[0], localized_point[1], 0]) - - # point = self.sample_navigation(start_pose, localized_point, mode = mode) - # # if mode == 'navigation' and torch.linalg.norm(torch.Tensor(point)[:2] - torch.linalg.norm(localized_point[:2])) > 2.0: - # # localized_point = self.sample_frontier(start_pose, None) - # # mode = 'exploration' - # # point = self.sample_navigation(start_pose, localized_point) - # # debug_text += '## All reachable points of robot are too far from the target object, explore to find new paths. \n' - - # if self.rerun: - # buf = BytesIO() - # plt.savefig(buf, format='png') - # buf.seek(0) - # img = Image.open(buf) - # img = np.array(img) - # buf.close() - # rr.log('2d_map', rr.Image(img), static = self.static) - # else: - # if text != '': - # plt.savefig(self.log + '/debug_' + text + str(self.obs_count) + '.png') - # else: - # plt.savefig(self.log + '/debug_exploration' + str(self.obs_count) + '.png') - # plt.clf() - - # if self.rerun: - # if text is not None and text != '': - # debug_text = '### The goal is to navigate to ' + text + '.\n' + debug_text - # else: - # debug_text = '### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n' - # debug_text = '# Robot\'s monologue: \n' + debug_text - # rr.log("robot_monologue", rr.TextDocument(debug_text, media_type = rr.MediaType.MARKDOWN), static = self.static) - - # if obs is not None and mode == 'navigation': - # rgb = self.voxel_map.observations[obs - 1].rgb - # if not self.rerun: - # cv2.imwrite(self.log + '/debug_' + text + '.png', rgb[:, :, [2, 1, 0]]) - # else: - # rr.log('/Past_observation_most_similar_to_text', rr.Image(rgb), static = self.static) - # traj = [] - # waypoints = None - # if point is None: - # print('Unable to find any target point, some exception might happen') - # else: - # print('Target point is', point) - # res = self.planner.plan(start_pose, point) - # if res.success: - # waypoints = [pt.state for pt in res.trajectory] - # # If we are navigating to some object of interest, send (x, y, z) of - # # the object so that we can make sure the robot looks at the object after navigation - # print(waypoints) - # # finished = (len(waypoints) <= 4 or torch.linalg.norm(torch.Tensor(point)[:2] - torch.Tensor(start_pose[:2])) > 0.8) and mode == 'navigation' - # # finished = mode == 'navigation' - # finished = len(waypoints) <= 4 and mode == 'navigation' - # if not finished: - # 
waypoints = waypoints[:4] - # traj = self.planner.clean_path_for_xy(waypoints) - # # traj = traj[1:] - # if finished: - # traj.append([np.nan, np.nan, np.nan]) - # if isinstance(localized_point, torch.Tensor): - # localized_point = localized_point.tolist() - # traj.append(localized_point) - # print('Planned trajectory:', traj) - # else: - # print('[FAILURE]', res.reason) - - # if traj is not None: - # origins = [] - # vectors = [] - # for idx in range(len(traj)): - # if idx != len(traj) - 1: - # origins.append([traj[idx][0], traj[idx][1], 1.5]) - # vectors.append([traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0]) - # rr.log("/direction", rr.Arrows3D(origins = origins, vectors = vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05), static = self.static) - # rr.log("/robot_start_pose", rr.Points3D([start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1), static = self.static) - - # self.write_to_pickle() - # return traj - def sample_navigation(self, start, point, mode="navigation"): # plt.clf() obstacles, _ = self.voxel_map.get_2d_map() @@ -728,7 +615,7 @@ def run_owl_sam_clip(self, rgb, mask, world_xyz): # Debug code, visualize all bounding boxes and segmentation masks - image_vis = np.array(rgb.permute(1, 2, 0)) + image_vis = np.ascontiguousarray(np.array(rgb.permute(1, 2, 0)).astype(np.uint8)) segmentation_color_map = np.zeros(image_vis.shape, dtype=np.uint8) # cv2.imwrite('clean_' + str(self.obs_count) + '.jpg', cv2.cvtColor(image_vis, cv2.COLOR_RGB2BGR)) for idx, box in enumerate(bounding_boxes): @@ -825,27 +712,24 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): rgb, depth = torch.from_numpy(rgb), torch.from_numpy(depth) rgb = rgb.permute(2, 0, 1).to(torch.uint8) - h, w = self.image_shape - h_image, w_image = depth.shape - rgb = F.interpolate( - rgb.unsqueeze(0), size=self.image_shape, mode="bilinear", align_corners=False - ).squeeze(0) - depth = ( - F.interpolate( + if self.image_shape is not None: + h, w = self.image_shape + h_image, w_image = depth.shape + rgb = F.interpolate( + rgb.unsqueeze(0), size=self.image_shape, mode="bilinear", align_corners=False + ).squeeze() + depth = F.interpolate( depth.unsqueeze(0).unsqueeze(0), size=self.image_shape, mode="bilinear", align_corners=False, - ) - .squeeze(0) - .squeeze(0) - ) - intrinsics[0, 0] *= w / w_image - intrinsics[1, 1] *= h / h_image - intrinsics[0, 2] *= w / w_image - intrinsics[1, 2] *= h / h_image + ).squeeze() + intrinsics[0, 0] *= w / w_image + intrinsics[1, 1] *= h / h_image + intrinsics[0, 2] *= w / w_image + intrinsics[1, 2] *= h / h_image - world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) + world_xyz = get_xyz(depth, pose, intrinsics).squeeze() median_depth = torch.from_numpy(scipy.ndimage.median_filter(depth, size=5)) median_filter_error = (depth - median_depth).abs() @@ -1031,21 +915,3 @@ def write_to_pickle(self): with open(filename, "wb") as f: pickle.dump(data, f) print("write all data to", filename) - - -# @hydra.main(version_base="1.2", config_path=".", config_name="config.yaml") -# def main(cfg): -# torch.manual_seed(1) -# imageProcessor = ImageProcessor(rerun = False, static = False, min_depth = 0., max_depth = 2.5) -# imageProcessor = ImageProcessor(rerun = cfg.rerun, static = cfg.static, min_depth = cfg.min_depth, max_depth = cfg.max_depth) -# if not cfg.pickle_file_name is None: -# imageProcessor.read_from_pickle(cfg.pickle_file_name) -# print(imageProcessor.voxel_map_localizer.voxel_pcd._points) -# if cfg.open_communication: -# while True: -# 
imageProcessor.recv_text() -# imageProcessor.read_from_pickle('env.pkl', -1) -# imageProcessor.write_to_pickle() -# -# if __name__ == "__main__": -# main(None) diff --git a/src/stretch/motion/base/ik_solver_base.py b/src/stretch/motion/base/ik_solver_base.py index b25285429..6ff3faccc 100644 --- a/src/stretch/motion/base/ik_solver_base.py +++ b/src/stretch/motion/base/ik_solver_base.py @@ -13,7 +13,7 @@ # LICENSE file in the root directory of this source tree. -from typing import Tuple +from typing import Optional, Tuple import numpy as np @@ -31,7 +31,9 @@ def get_num_controllable_joints(self) -> int: """returns number of controllable joints under this solver's purview""" raise NotImplementedError() - def compute_fk(self, q) -> Tuple[np.ndarray, np.ndarray]: + def compute_fk( + self, q, link_name=None, ignore_missing_joints=False + ) -> Tuple[np.ndarray, np.ndarray]: """given joint values return end-effector position and quaternion associated with it""" raise NotImplementedError() @@ -44,6 +46,7 @@ def compute_ik( num_attempts: int = 1, verbose: bool = False, ignore_missing_joints: bool = False, + node_name: Optional[str] = None, ) -> Tuple[np.ndarray, bool, dict]: """ diff --git a/src/stretch/motion/pinocchio_ik_solver.py b/src/stretch/motion/pinocchio_ik_solver.py index d783f02cd..23e467393 100644 --- a/src/stretch/motion/pinocchio_ik_solver.py +++ b/src/stretch/motion/pinocchio_ik_solver.py @@ -99,7 +99,7 @@ def get_all_joint_names(self) -> List[str]: return [self.model.names[i + 1] for i in range(self.model.nq)] def _qmap_control2model( - self, q_input: np.ndarray, ignore_missing_joints: bool = False + self, q_input: Union[np.ndarray, dict], ignore_missing_joints: bool = False ) -> np.ndarray: """returns a full joint configuration from a partial joint configuration""" q_out = self.q_neutral.copy() diff --git a/src/stretch/requirements.txt b/src/stretch/requirements.txt index a2cb95b17..339a82c8a 100644 --- a/src/stretch/requirements.txt +++ b/src/stretch/requirements.txt @@ -4,7 +4,7 @@ ultralytics wget # SAM2 # conda install pyg -c pyg - +google-generativeai # openai # zmq diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 0939c1d6c..8d28bb5d7 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -271,15 +271,12 @@ def log_robot_xyt(self, obs): labels="robot", colors=[255, 0, 0, 255], ) - rr.log("world/robot/arrow", rb_arrow, static=True) rr.log( "world/robot/blob", rr.Points3D([0, 0, 0], colors=[255, 0, 0, 255], radii=0.13), static=True, ) - # rr.log("world/robot/arrow", rb_arrow) - rr.log( "world/robot", rr.Transform3D( From 17acecd96132a4dc8548c80725004be617596dbe Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 26 Sep 2024 10:53:42 -0400 Subject: [PATCH 238/410] Add Peiqi's Dynav code --- .codespell-ignore-words.txt | 1 + .gitignore | 4 +- src/stretch/agent/zmq_client.py | 141 ++- .../config/control/noplan_velocity_hw.yaml | 8 +- src/stretch/config/default_planner.yaml | 22 +- src/stretch/config/dynav_config.yaml | 75 ++ src/stretch/dynav/.gitignore | 36 + src/stretch/dynav/__init__.py | 10 + src/stretch/dynav/communication_util.py | 94 ++ src/stretch/dynav/config.yaml | 12 + src/stretch/dynav/llm_localizer.py | 421 ++++++++ src/stretch/dynav/llm_server.py | 581 +++++++++++ src/stretch/dynav/mapping_utils/__init__.py | 8 + src/stretch/dynav/mapping_utils/a_star.py | 344 +++++++ src/stretch/dynav/mapping_utils/voxel.py | 968 ++++++++++++++++++ src/stretch/dynav/mapping_utils/voxel_map.py | 820 
+++++++++++++++ .../dynav/mapping_utils/voxelized_pcd.py | 634 ++++++++++++ src/stretch/dynav/ok_robot_hw/README.md | 43 + src/stretch/dynav/ok_robot_hw/camera.py | 68 ++ .../dynav/ok_robot_hw/global_parameters.py | 23 + .../dynav/ok_robot_hw/image_publisher.py | 83 ++ src/stretch/dynav/ok_robot_hw/robot.py | 305 ++++++ .../dynav/ok_robot_hw/utils/__init__.py | 12 + .../ok_robot_hw/utils/communication_utils.py | 65 ++ .../dynav/ok_robot_hw/utils/grasper_utils.py | 248 +++++ src/stretch/dynav/ok_robot_hw/utils/utils.py | 32 + src/stretch/dynav/robot_agent_manip_mdp.py | 369 +++++++ src/stretch/dynav/run_manip.py | 87 ++ src/stretch/dynav/run_ok_nav.py | 174 ++++ src/stretch/dynav/scannet.py | 217 ++++ src/stretch/dynav/voxel_map_localizer.py | 418 ++++++++ src/stretch/dynav/voxel_map_server.py | 917 +++++++++++++++++ src/stretch/motion/base/ik_solver_base.py | 4 +- src/stretch/motion/constants.py | 3 +- src/stretch/motion/kinematics.py | 7 +- src/stretch/motion/pinocchio_ik_solver.py | 39 +- src/stretch/requirements.txt | 66 ++ src/stretch/visualization/rerun.py | 39 +- .../stretch_ros2_bridge/remote/modules/nav.py | 4 +- 39 files changed, 7361 insertions(+), 41 deletions(-) create mode 100644 src/stretch/config/dynav_config.yaml create mode 100644 src/stretch/dynav/.gitignore create mode 100644 src/stretch/dynav/__init__.py create mode 100644 src/stretch/dynav/communication_util.py create mode 100644 src/stretch/dynav/config.yaml create mode 100644 src/stretch/dynav/llm_localizer.py create mode 100644 src/stretch/dynav/llm_server.py create mode 100644 src/stretch/dynav/mapping_utils/__init__.py create mode 100644 src/stretch/dynav/mapping_utils/a_star.py create mode 100644 src/stretch/dynav/mapping_utils/voxel.py create mode 100644 src/stretch/dynav/mapping_utils/voxel_map.py create mode 100644 src/stretch/dynav/mapping_utils/voxelized_pcd.py create mode 100644 src/stretch/dynav/ok_robot_hw/README.md create mode 100644 src/stretch/dynav/ok_robot_hw/camera.py create mode 100644 src/stretch/dynav/ok_robot_hw/global_parameters.py create mode 100644 src/stretch/dynav/ok_robot_hw/image_publisher.py create mode 100644 src/stretch/dynav/ok_robot_hw/robot.py create mode 100644 src/stretch/dynav/ok_robot_hw/utils/__init__.py create mode 100644 src/stretch/dynav/ok_robot_hw/utils/communication_utils.py create mode 100644 src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py create mode 100644 src/stretch/dynav/ok_robot_hw/utils/utils.py create mode 100644 src/stretch/dynav/robot_agent_manip_mdp.py create mode 100644 src/stretch/dynav/run_manip.py create mode 100644 src/stretch/dynav/run_ok_nav.py create mode 100644 src/stretch/dynav/scannet.py create mode 100644 src/stretch/dynav/voxel_map_localizer.py create mode 100644 src/stretch/dynav/voxel_map_server.py create mode 100644 src/stretch/requirements.txt diff --git a/.codespell-ignore-words.txt b/.codespell-ignore-words.txt index b7860a05a..59169ad53 100644 --- a/.codespell-ignore-words.txt +++ b/.codespell-ignore-words.txt @@ -1 +1,2 @@ empy +crate diff --git a/.gitignore b/.gitignore index 04f59a9ef..67db76c46 100644 --- a/.gitignore +++ b/.gitignore @@ -135,4 +135,6 @@ data/ log/ # install folder -install/ \ No newline at end of file +install/ + +debug*/ \ No newline at end of file diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 22a8c0031..05967d45c 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -13,7 +13,7 @@ import time import timeit from threading import Lock -from 
typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import click import numpy as np @@ -29,7 +29,7 @@ from stretch.core.robot import AbstractRobotClient from stretch.motion import PlanResult, RobotModel from stretch.motion.kinematics import HelloStretchIdx, HelloStretchKinematics -from stretch.utils.geometry import angle_difference +from stretch.utils.geometry import angle_difference, posquat2sophus, sophus2posquat from stretch.utils.image import Camera from stretch.utils.memory import lookup_address from stretch.utils.point_cloud import show_point_cloud @@ -245,6 +245,11 @@ def get_joint_positions(self, timeout: float = 5.0) -> np.ndarray: joint_positions = self._state["joint_positions"] return joint_positions + def get_six_joints(self, timeout: float = 5.0) -> np.ndarray: + """Get the six major joint positions""" + joint_positions = self.get_joint_positions(timeout=timeout) + return np.array(self._extract_joint_pos(joint_positions)) + def get_joint_velocities(self, timeout: float = 5.0) -> np.ndarray: """Get the current joint velocities""" t0 = timeit.default_timer() @@ -292,6 +297,86 @@ def get_base_pose(self, timeout: float = 5.0) -> np.ndarray: xyt = self._state["base_pose"] return xyt + def get_pan_tilt(self): + joint_positions, _, _ = self.get_joint_state() + return joint_positions[HelloStretchIdx.HEAD_PAN], joint_positions[HelloStretchIdx.HEAD_TILT] + + def get_gripper_position(self): + joint_state = self.get_joint_positions() + return joint_state[HelloStretchIdx.GRIPPER] + + def get_ee_pose(self, matrix=False, link_name=None, q=None): + if q is None: + q = self.get_joint_positions() + pos, quat = self._robot_model.manip_fk(q, node=link_name) + + if matrix: + pose = posquat2sophus(pos, quat) + return pose.matrix() + else: + return pos, quat + + def get_frame_pose(self, q, node_a: str, node_b: str): + return self._robot_model.manip_ik_solver.get_frame_pose(q, node_a, node_b) + + def solve_ik( + self, + pos: List[float], + quat: Optional[List[float]] = None, + initial_cfg: np.ndarray = None, + debug: bool = False, + custom_ee_frame: Optional[str] = None, + ) -> Optional[np.ndarray]: + """Solve inverse kinematics appropriately (or at least try to) and get the joint position + that we will be moving to. 
+ + Note: When relative==True, the delta orientation is still defined in the world frame + + Returns None if no solution is found, else returns an executable solution + """ + + pos_ee_curr, quat_ee_curr = self.get_ee_pose() + if quat is None: + quat = quat_ee_curr + + # Compute IK goal: pose relative to base + pose_desired = posquat2sophus(np.array(pos), np.array(quat)) + + pose_base2ee_desired = pose_desired + + pos_ik_goal, quat_ik_goal = sophus2posquat(pose_base2ee_desired) + + # Execute joint command + if debug: + print("=== EE goto command ===") + print(f"Initial EE pose: pos={pos_ee_curr}; quat={quat_ee_curr}") + print(f"Input EE pose: pos={np.array(pos)}; quat={np.array(quat)}") + print(f"Desired EE pose: pos={pos_ik_goal}; quat={quat_ik_goal}") + + # Perform IK + full_body_cfg, ik_success, ik_debug_info = self._robot_model.manip_ik( + (pos_ik_goal, quat_ik_goal), + q0=initial_cfg, + custom_ee_frame=custom_ee_frame, + ) + + # Expected to return None if we did not get a solution + if not ik_success or full_body_cfg is None: + return None + # Return a valid solution to the IK problem here + return full_body_cfg + + def _extract_joint_pos(self, q): + """Helper to convert from the general-purpose config including full robot state, into the command space used in just the manip controller. Extracts just lift/arm/wrist information.""" + return [ + q[HelloStretchIdx.BASE_X], + q[HelloStretchIdx.LIFT], + q[HelloStretchIdx.ARM], + q[HelloStretchIdx.WRIST_YAW], + q[HelloStretchIdx.WRIST_PITCH], + q[HelloStretchIdx.WRIST_ROLL], + ] + def robot_to(self, joint_angles: np.ndarray, blocking: bool = False, timeout: float = 10.0): """Move the robot to a particular joint configuration.""" next_action = {"joint": joint_angles, "manip_blocking": blocking} @@ -309,7 +394,7 @@ def head_to( logger.warning( f"Head tilt is restricted to be between {self._head_tilt_min} and {self._head_tilt_max} for safety: was{head_tilt}" ) - head_pan = np.clip(head_pan, -np.pi, 0) + head_pan = np.clip(head_pan, self._head_pan_min, self._head_pan_max) head_tilt = np.clip(head_tilt, -np.pi / 2, 0) next_action = {"head_to": [float(head_pan), float(head_tilt)], "manip_blocking": blocking} sent = self.send_action(next_action, timeout=timeout) @@ -321,6 +406,20 @@ def head_to( whole_body_q[HelloStretchIdx.HEAD_TILT] = float(head_tilt) self._wait_for_head(whole_body_q, block_id=step) + time.sleep(0.3) + + def look_front(self, blocking: bool = True, timeout: float = 10.0): + """Let robot look to its front.""" + self.head_to( + constants.look_front[0], constants.look_front[1], blocking=blocking, timeout=timeout + ) + + def look_at_ee(self, blocking: bool = True, timeout: float = 10.0): + """Let robot look to its arm.""" + self.head_to( + constants.look_at_ee[0], constants.look_at_ee[1], blocking=blocking, timeout=timeout + ) + def arm_to( self, joint_angles: Optional[np.ndarray] = None, @@ -329,7 +428,7 @@ def arm_to( blocking: bool = False, timeout: float = 10.0, verbose: bool = False, - min_time: float = 2.0, + min_time: float = 2.5, **config, ) -> bool: """Move the arm to a particular joint configuration. 
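As an illustrative aside (not part of this patch): the helpers added in this hunk (get_ee_pose, solve_ik, look_at_ee, _extract_joint_pos) together with arm_to form a small end-effector API on the ZMQ client. The sketch below is a minimal example of how they could be combined; `robot` stands for an already constructed client instance, the 5 cm offset is an arbitrary example value, and the motion is expressed in whatever frame get_ee_pose() reports (the IK goal in the code above is computed relative to the base).

import numpy as np

def nudge_ee_forward(robot, dx: float = 0.05) -> bool:
    """Illustrative only: shift the end-effector by dx metres along x."""
    robot.look_at_ee()                     # new helper: point the head at the arm
    pos, quat = robot.get_ee_pose()        # new helper: FK from the current joint state
    goal = np.asarray(pos) + np.array([dx, 0.0, 0.0])
    full_cfg = robot.solve_ik(goal, quat)  # new helper: full-body config, or None on failure
    if full_cfg is None:
        return False
    # _extract_joint_pos() reduces the full-body config to the six manipulation joints
    robot.arm_to(robot._extract_joint_pos(full_cfg), blocking=True)
    return True

If IK fails, the function simply reports failure, mirroring how solve_ik above returns None when no valid solution is found.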
@@ -386,6 +485,8 @@ def arm_to( # If head is not specified, we need to set it to the right head position # In this case, we assume if moving arm you should look at ee _next_action["head_to"] = constants.look_at_ee + # cur_pan, cur_tilt = self.get_pan_tilt() + # _next_action["head_to"] = np.array([cur_pan, cur_tilt]) _next_action["manip_blocking"] = blocking self.send_action(_next_action) @@ -455,7 +556,7 @@ def arm_to( def navigate_to( self, - xyt: ContinuousNavigationAction, + xyt: Union[np.ndarray, ContinuousNavigationAction], relative=False, blocking=False, timeout: float = 10.0, @@ -470,6 +571,11 @@ def navigate_to( self._rerun.update_nav_goal(xyt) self.send_action(next_action, timeout=timeout, verbose=verbose) + def set_velocity(self, v: float, w: float): + """Move to xyt in global coordinates or relative coordinates.""" + next_action = {"v": v, "w": w} + self.send_action(next_action) + def reset(self): """Reset everything in the robot's internal state""" self._control_mode = None @@ -755,10 +861,9 @@ def _wait_for_action( t1 = timeit.default_timer() if t1 - t0 > timeout: - print( - f"Waiting for step={block_id} {self._last_step} prev={self._last_step} at {pos} moved {moved_dist:0.04f} angle {angle_dist:0.04f} not_moving {not_moving_count} at_goal {self._state['at_goal']}" - ) - raise RuntimeError(f"Timeout waiting for block with step id = {block_id}") + print(f"Timeout waiting for block with step id = {block_id}") + break + # raise RuntimeError(f"Timeout waiting for block with step id = {block_id}") def in_manipulation_mode(self) -> bool: """is the robot ready to grasp""" @@ -843,6 +948,21 @@ def get_observation(self): observation.seq_id = self._seq_id return observation + def get_images(self, compute_xyz=False): + obs = self.get_observation() + if compute_xyz: + return obs.rgb, obs.depth, obs.xyz + else: + return obs.rgb, obs.depth + + def get_camera_K(self): + obs = self.get_observation() + return obs.camera_K + + def get_head_pose(self): + obs = self.get_observation() + return obs.camera_pose + def execute_trajectory( self, trajectory: List[np.ndarray], @@ -853,6 +973,7 @@ def execute_trajectory( per_waypoint_timeout: float = 10.0, final_timeout: float = 10.0, relative: bool = False, + blocking: bool = False, ): """Execute a multi-step trajectory; this is always blocking since it waits to reach each one in turn.""" @@ -966,7 +1087,7 @@ def send_action( # print("SENDING THIS ACTION:", next_action) self.send_socket.send_pyobj(next_action) time.sleep(0.01) - # TODO: why do we send the action twice? 
+ # print("SENDING THIS ACTION:", next_action) self.send_socket.send_pyobj(next_action) diff --git a/src/stretch/config/control/noplan_velocity_hw.yaml b/src/stretch/config/control/noplan_velocity_hw.yaml index fb560ce42..672574ce0 100644 --- a/src/stretch/config/control/noplan_velocity_hw.yaml +++ b/src/stretch/config/control/noplan_velocity_hw.yaml @@ -5,16 +5,16 @@ #w_max: 0.5 # (vel_m_max - vel_m_default) / wheel_separation_m #acc_lin: 0.2 # 0.5 * base.params["motion"]["max"]["accel_m"] #acc_ang: 0.4 # 0.5 * (accel_m_max - accel_m_default) / wheel_separation_m -v_max: 0.2 # base.params["motion"]["default"]["vel_m"] -w_max: 0.5 # (vel_m_max - vel_m_default) / wheel_separation_m +v_max: 10.0 # base.params["motion"]["default"]["vel_m"] +w_max: 5.0 # (vel_m_max - vel_m_default) / wheel_separation_m #w_max: 0.6 # (vel_m_max - vel_m_default) / wheel_separation_m acc_lin: 0.1 # 0.5 * base.params["motion"]["max"]["accel_m"] acc_ang: 0.2 # 0.5 * (accel_m_max - accel_m_default) / wheel_separation_m #acc_ang: 0.3 # 0.5 * (accel_m_max - accel_m_default) / wheel_separation_m # Tolerances for determining whether the goal position or orientation is reached -lin_error_tol: 0.12 -ang_error_tol: 0.12 +lin_error_tol: 0.15 +ang_error_tol: 0.15 # Error tolerance ratio - scale target error by this to a minimum of the tolerance below lin_error_ratio: 0.5 diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index ddc95405d..c2e695aeb 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -51,14 +51,20 @@ motion: angle_threshold: 0.01 # How much the robot has to rotate to be considered "rotating" min_steps_not_moving: 2 # How many steps the robot has to not move before we consider it "stopped" joint_tolerance: - arm: 0.05 - base_x: 0.05 - lift: 0.05 - wrist_roll: 0.25 - wrist_pitch: 0.25 - wrist_yaw: 0.05 - head_pan: 0.01 - head_tilt: 0.01 + arm: 0.02 + base_x: 0.02 + lift: 0.02 + wrist_roll: 0.1 + wrist_pitch: 0.1 + wrist_yaw: 0.1 + # arm: 0.05 + # base_x: 0.05 + # lift: 0.05 + # wrist_roll: 0.25 + # wrist_pitch: 0.25 + # wrist_yaw: 0.05 + head_pan: 0.1 + head_tilt: 0.1 joint_thresholds: head_not_moving_tolerance: 1.0e-4 gripper_open_threshold: 0.3 diff --git a/src/stretch/config/dynav_config.yaml b/src/stretch/config/dynav_config.yaml new file mode 100644 index 000000000..95c1d8e73 --- /dev/null +++ b/src/stretch/config/dynav_config.yaml @@ -0,0 +1,75 @@ +# Encoder setup +# Encoder is used to compute per-object embeddings. +encoder: "clip" +encoder_args: "ViT-B/32" +open_vocab_cat_map_file: projects/real_world_ovmm/configs/example_cat_map.json + +# Sparse Voxel Map parameters +voxel_size: 0.1 +obs_min_height: 0.2 # Ignore things less than this high when planning motions +obs_max_height: 1.5 # Ignore things over this height (eg ceilings) +obs_min_density: 5 # This many points makes it an obstacle +exp_min_density: 1 + +# Padding +pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles +min_pad_obstacles: 1 # Do not pad LESS than this amount, for safety. 
+ +local_radius: 0.5 # Area around the robot to mark as explored (kind of a hack) +remove_visited_from_obstacles: False +min_depth: 0.25 +max_depth: 2.5 + +# Point cloud cleanup +filters: + smooth_kernel_size: 2 + use_median_filter: True + median_filter_size: 2 + median_filter_max_error: 0.01 + use_derivative_filter: False + derivative_filter_threshold: 0.1 + # use_voxel_filter: True + +# Exploration +# in_place_rotation_steps: 8 +in_place_rotation_steps: 0 + +# TAMP parameters +guarantee_instance_is_reachable: True +plan_with_reachable_instances: True +plan_with_scene_graph: True +max_near_distance: 0.3 + +# Navigation space - used for motion planning and computing goals. +step_size: 0.1 +rotation_step_size: 0.2 +dilate_frontier_size: 2 # Used to shrink the frontier back from the edges of the world +dilate_obstacle_size: 0 # Used when selecting navigation goals +default_expand_frontier_size: 5 # margin along the frontier where final robot position can be +motion_planner: + simplify_plans: False + shortcut_plans: True + shortcut_iter: 100 + +# Trajectory following - how closely we follow intermediate waypoints +# These should be less strict than whatever parameters the low-level controller is using; this will +# make sure that the motions end up looking smooth. +trajectory_pos_err_threshold: 0.15 +trajectory_rot_err_threshold: 0.3 +trajectory_per_step_timeout: 3.0 + +# User interface +# Choose one of: (object_to_find, location_to_place), command, or chat +# Don't use all of them! +name: "stretch_demo" # for logging - currently not used +chat: False +start_ui_server: False +vlm_context_length: 20 # How long messages sent to the vlm server can be if we are using it +limited_obs_publish_sleep: 0.5 + +# High level stuff: commands to execute +command: "pick up a bottle and put it on the chair" +name: "stretch" +exploration_steps: 10 +object_to_find: "bottle" +location_to_place: "chair" diff --git a/src/stretch/dynav/.gitignore b/src/stretch/dynav/.gitignore new file mode 100644 index 000000000..79d0c80bb --- /dev/null +++ b/src/stretch/dynav/.gitignore @@ -0,0 +1,36 @@ +*.tar +*.pyc +/ok-robot-navigation/outputs/* +/ok-robot-navigation/sample_debug/* +/ok-robot-navigation/test/* +/outputs/* +*.pt +*.ply +**/*.egg-info/ +*.o +*.egg +.DS_Store +.TimeRecord +license +ok-robot-manipulation/src/checkpoints/* +ok-robot-manipulation/src/example_data/* +/src/sample_save_directory/* +*.pth +ok-robot-manipulation/pointnet2/* +ok-robot-hw/hab_stretch/* +ok-robot-hw/*.npy +ok-robot-hw/*.png + +# Ignore python build files. +build +dist +example_data + +# vim files +*.swp +*.swo +*.swn + +# Data files +*.r3d +*.npy \ No newline at end of file diff --git a/src/stretch/dynav/__init__.py b/src/stretch/dynav/__init__.py new file mode 100644 index 000000000..770e2711e --- /dev/null +++ b/src/stretch/dynav/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +from .robot_agent_manip_mdp import RobotAgentMDP diff --git a/src/stretch/dynav/communication_util.py b/src/stretch/dynav/communication_util.py new file mode 100644 index 000000000..aa5d5b192 --- /dev/null +++ b/src/stretch/dynav/communication_util.py @@ -0,0 +1,94 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. 
+# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import cv2 +import numpy as np +import zmq + + +def load_socket(port_number): + context = zmq.Context() + socket = context.socket(zmq.REP) + socket.bind("tcp://*:" + str(port_number)) + + return socket + + +def send_array(socket, A, flags=0, copy=True, track=False): + """send a numpy array with metadata""" + A = np.array(A) + md = dict( + dtype=str(A.dtype), + shape=A.shape, + ) + socket.send_json(md, flags | zmq.SNDMORE) + return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + + +def recv_array(socket, flags=0, copy=True, track=False): + """recv a numpy array""" + md = socket.recv_json(flags=flags) + msg = socket.recv(flags=flags, copy=copy, track=track) + A = np.frombuffer(msg, dtype=md["dtype"]) + return A.reshape(md["shape"]) + + +def send_rgb_img(socket, img): + img = img.astype(np.uint8) + encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] + _, img_encoded = cv2.imencode(".jpg", img, encode_param) + socket.send(img_encoded.tobytes()) + + +def recv_rgb_img(socket): + img = socket.recv() + img = np.frombuffer(img, dtype=np.uint8) + img = cv2.imdecode(img, cv2.IMREAD_COLOR) + return img + + +def send_depth_img(socket, depth_img): + depth_img = (depth_img * 1000).astype(np.uint16) + encode_param = [ + int(cv2.IMWRITE_PNG_COMPRESSION), + 3, + ] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode(".png", depth_img, encode_param) + socket.send(depth_img_encoded.tobytes()) + + +def recv_depth_img(socket): + depth_img = socket.recv() + depth_img = np.frombuffer(depth_img, dtype=np.uint8) + depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) + depth_img = depth_img / 1000.0 + return depth_img + + +def send_everything(socket, rgb, depth, intrinsics, pose): + send_rgb_img(socket, rgb) + socket.recv_string() + send_depth_img(socket, depth) + socket.recv_string() + send_array(socket, intrinsics) + socket.recv_string() + send_array(socket, pose) + socket.recv_string() + + +def recv_everything(socket): + rgb = recv_rgb_img(socket) + socket.send_string("") + depth = recv_depth_img(socket) + socket.send_string("") + intrinsics = recv_array(socket) + socket.send_string("") + pose = recv_array(socket) + socket.send_string("") + return rgb, depth, intrinsics, pose diff --git a/src/stretch/dynav/config.yaml b/src/stretch/dynav/config.yaml new file mode 100644 index 000000000..33915d9e8 --- /dev/null +++ b/src/stretch/dynav/config.yaml @@ -0,0 +1,12 @@ +# pickle_file_name: debug/debug_2024-07-23_11-28-58.pkl +pickle_file_name: null + +rerun: true +open_communication: true +static: false + +min_depth: 0.25 +max_depth: 2.5 + +# Add this key +ui_server_address: "http://localhost:8080" diff --git a/src/stretch/dynav/llm_localizer.py b/src/stretch/dynav/llm_localizer.py new file mode 100644 index 000000000..e92d891c8 --- /dev/null +++ b/src/stretch/dynav/llm_localizer.py @@ -0,0 +1,421 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
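As an illustrative aside (not part of this patch): the send_*/recv_* helpers in communication_util.py above implement a strict request/reply handshake in which every payload (RGB, depth, intrinsics, pose) is acknowledged with an empty string before the next one is sent. The sketch below shows what a robot-side sender pairing with recv_everything() could look like; the address and the dummy arrays are placeholders, and the port simply has to match whatever the server passed to load_socket().

import numpy as np
import zmq

from stretch.dynav.communication_util import send_everything

context = zmq.Context()
sock = context.socket(zmq.REQ)        # pairs with the REP socket created by load_socket()
sock.connect("tcp://127.0.0.1:5555")  # placeholder address and port

rgb = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy colour frame
depth = np.ones((480, 640), dtype=np.float32)  # dummy depth image, in metres
intrinsics = np.eye(3)                         # dummy camera intrinsics
pose = np.eye(4)                               # dummy camera pose in world frame
send_everything(sock, rgb, depth, intrinsics, pose)  # blocks until all four acks arrive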
+ +import base64 +import os +import time +from collections import OrderedDict +from concurrent.futures import ThreadPoolExecutor, as_completed +from io import BytesIO +from typing import Optional + +import google.generativeai as genai +import numpy as np +import torch +from openai import OpenAI +from PIL import Image +from torch import Tensor +from transformers import AutoProcessor, Owlv2ForObjectDetection + +from stretch.dynav.mapping_utils.voxelized_pcd import VoxelizedPointcloud + +genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) +generation_config = genai.GenerationConfig(temperature=0) +safety_settings = [ + { + "category": "HARM_CATEGORY_DANGEROUS", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "threshold": "BLOCK_NONE", + }, +] + + +def get_inv_intrinsics(intrinsics): + # return intrinsics.double().inverse().to(intrinsics) + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) + inv_intrinsics = torch.zeros_like(intrinsics) + inv_intrinsics[..., 0, 0] = 1.0 / fx + inv_intrinsics[..., 1, 1] = 1.0 / fy + inv_intrinsics[..., 0, 2] = -ppx / fx + inv_intrinsics[..., 1, 2] = -ppy / fy + inv_intrinsics[..., 2, 2] = 1.0 + return inv_intrinsics + + +def get_xyz(depth, pose, intrinsics): + """Returns the XYZ coordinates for a set of points. + + Args: + depth: The depth array, with shape (B, 1, H, W) + pose: The pose array, with shape (B, 4, 4) + intrinsics: The intrinsics array, with shape (B, 3, 3) + + Returns: + The XYZ coordinates of the projected points, with shape (B, H, W, 3) + """ + if not isinstance(depth, torch.Tensor): + depth = torch.from_numpy(depth) + if not isinstance(pose, torch.Tensor): + pose = torch.from_numpy(pose) + if not isinstance(intrinsics, torch.Tensor): + intrinsics = torch.from_numpy(intrinsics) + while depth.ndim < 4: + depth = depth.unsqueeze(0) + while pose.ndim < 3: + pose = pose.unsqueeze(0) + while intrinsics.ndim < 3: + intrinsics = intrinsics.unsqueeze(0) + (bsz, _, height, width), device, dtype = depth.shape, depth.device, intrinsics.dtype + + # Gets the pixel grid. + xs, ys = torch.meshgrid( + torch.arange(0, width, device=device, dtype=dtype), + torch.arange(0, height, device=device, dtype=dtype), + indexing="xy", + ) + xy = torch.stack([xs, ys], dim=-1).flatten(0, 1).unsqueeze(0).repeat_interleave(bsz, 0) + xyz = torch.cat((xy, torch.ones_like(xy[..., :1])), dim=-1) + + # Applies intrinsics and extrinsics. 
+ # xyz = xyz @ intrinsics.inverse().transpose(-1, -2) + xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) + xyz = xyz * depth.flatten(1).unsqueeze(-1) + xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] + + xyz = xyz.unflatten(1, (height, width)) + + return xyz + + +class LLM_Localizer: + def __init__( + self, voxel_map_wrapper=None, exist_model="gemini-1.5-pro", loc_model="owlv2", device="cuda" + ): + self.voxel_map_wrapper = voxel_map_wrapper + self.device = device + self.voxel_pcd = VoxelizedPointcloud(voxel_size=0.2).to(self.device) + self.existence_checking_model = exist_model + + self.api_key_1 = os.getenv("GOOGLE_API_KEY") + self.api_key_2 = os.getenv("GOOGLE_API_KEY_2") + self.api_key_3 = os.getenv("GOOGLE_API_KEY_3") + self.api_key = self.api_key_1 + + self.context_length = 60 + self.count_threshold = 3 + if "gpt" in self.existence_checking_model: + self.max_img_per_request = 30 + else: + self.max_img_per_request = 200 + + if exist_model == "gpt-4o": + print("WE ARE USING OPENAI GPT4o") + self.gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + elif exist_model == "gemini-1.5-pro": + print("WE ARE USING GEMINI 1.5 PRO") + + elif exist_model == "gemini-1.5-flash": + print("WE ARE USING GEMINI 1.5 FLASH") + else: + print("YOU ARE USING NOTHING!") + self.location_checking_model = loc_model + if loc_model == "owlv2": + self.exist_processor = AutoProcessor.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ) + self.exist_model = Owlv2ForObjectDetection.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ).to(self.device) + print("WE ARE USING OWLV2 FOR LOCALIZATION!") + else: + print("YOU ARE USING VOXEL MAP FOR LOCALIZATION!") + + def add( + self, + points: Tensor, + features: Optional[Tensor], + rgb: Optional[Tensor], + weights: Optional[Tensor] = None, + obs_count: Optional[Tensor] = None, + ): + points = points.to(self.device) + if features is not None: + features = features.to(self.device) + if rgb is not None: + rgb = rgb.to(self.device) + if weights is not None: + weights = weights.to(self.device) + self.voxel_pcd.add( + points=points, features=features, rgb=rgb, weights=weights, obs_count=obs_count + ) + + def compute_coord(self, text, image_info, threshold=0.2): + rgb = image_info["image"] + inputs = self.exist_processor(text=text, images=rgb, return_tensors="pt") + for input in inputs: + inputs[input] = inputs[input].to("cuda") + + with torch.no_grad(): + outputs = self.exist_model(**inputs) + + target_sizes = torch.Tensor([rgb.size[::-1]]).to(self.device) + results = self.exist_processor.image_processor.post_process_object_detection( + outputs, threshold=threshold, target_sizes=target_sizes + )[0] + depth = image_info["depth"] + xyzs = image_info["xyz"] + temp_lst = [] + for idx, (score, bbox) in enumerate( + sorted(zip(results["scores"], results["boxes"]), key=lambda x: x[0], reverse=True) + ): + + tl_x, tl_y, br_x, br_y = bbox + w, h = depth.shape + tl_x, tl_y, br_x, br_y = ( + int(max(0, tl_x.item())), + int(max(0, tl_y.item())), + int(min(h, br_x.item())), + int(min(w, br_y.item())), + ) + if np.median(depth[tl_y:br_y, tl_x:br_x].reshape(-1)) < 3: + coordinate = torch.from_numpy( + np.median(xyzs[tl_y:br_y, tl_x:br_x].reshape(-1, 3), axis=0) + ) + # temp_lst.append(coordinate) + return coordinate + return None + # return temp_lst + + def owl_locater(self, A, encoded_image, timestamps_lst): + # query_coord_lst = [] + # # dbscan = DBSCAN(eps=1.5, min_samples=1) + # centroid = None + for i in 
timestamps_lst: + if i in encoded_image: + image_info = encoded_image[i][-1] + res = self.compute_coord(A, image_info, threshold=0.2) + if res is not None: + debug_text = ( + "#### - Object is detected in observations where instance" + + str(i + 1) + + " comes from. **😃** Directly navigate to it.\n" + ) + return res, debug_text, i, None + debug_text = "#### - All instances are not the target! Maybe target object has not been observed yet. **😭**\n" + + # if query_coord_lst != []: + # query_coord_lst = np.array(query_coord_lst) + # dbscan.fit(query_coord_lst) + # labels = dbscan.labels_ + # unique_labels = set(labels) - {-1} + # largest_cluster_label = None + # largest_cluster_size = 0 + # for label in unique_labels: + # cluster_size = np.sum(labels == label) + # if cluster_size > largest_cluster_size: + # largest_cluster_size = cluster_size + # largest_cluster_label = label + # largest_cluster_points = query_coord_lst[labels == largest_cluster_label] + # centroid = largest_cluster_points.mean(axis=0) + # return centroid, debug_text, None, None + return None, debug_text, None, None + + def process_chunk(self, chunk, sys_prompt, user_prompt): + for i in range(50): + try: + if "gpt" in self.existence_checking_model: + start_time = time.time() + response = ( + self.gpt_client.chat.completions.create( + model=self.existence_checking_model, + messages=[ + {"role": "system", "content": sys_prompt}, + {"role": "user", "content": user_prompt}, + {"role": "user", "content": chunk}, + ], + temperature=0.0, + ) + .choices[0] + .message.content + ) + + end_time = time.time() + print("GPT request cost:", end_time - start_time) + else: + model = genai.GenerativeModel( + model_name=f"models/{self.existence_checking_model}-exp-0827", + system_instruction=sys_prompt, + ) + # "models/{self.existence_checking_model}-exp-0827" + response = model.generate_content( + chunk + [user_prompt], + generation_config=generation_config, + safety_settings=safety_settings, + ).text + # print("Assistant: ", response) + return response + except Exception as e: + if self.api_key == self.api_key_1: + self.api_key = self.api_key_2 + elif self.api_key == self.api_key_2: + self.api_key = self.api_key_3 + elif self.api_key == self.api_key_3: + self.api_key = self.api_key_1 + genai.configure(api_key=self.api_key) + print(f"Error: {e}") + # time.sleep(10) + print("Execution Failed") + return "Execution Failed" + + def update_dict(self, A, timestamps_dict, content): + timestamps_lst = content.split("\n") + for item in timestamps_lst: + if len(item) < 3 or ":" not in item: + continue + key, value_str = item.split(":") + if A not in key: + continue + if "None" in value_str: + value = None + else: + value = list(map(int, value_str.replace(" ", "").split(","))) + if key in timestamps_dict: + if timestamps_dict[key] is None: + timestamps_dict[key] = value + elif value is not None: + timestamps_dict[key].extend(value) + else: + timestamps_dict[key] = value + + def llm_locator(self, A, encoded_image): + timestamps_dict = {} + + sys_prompt = f""" + For object query I give, you need to find timestamps of images that the object is shown, without any unnecessary explanation or space. If the object never exist, please output the object name and the word "None" for timestamps. 
+ + Example: + Input: + The object you need to find is blue bottle, orange chair, white box + + Output: + blue bottle: 3,6,9 + orange chair: None + white box: 2,4,10""" + + user_prompt = f"""The object you need to find is {A}""" + if "gpt" in self.existence_checking_model: + content = [item for sublist in list(encoded_image.values()) for item in sublist[:2]][ + -self.context_length * 2 : + ] + else: + content = [item for sublist in list(encoded_image.values()) for item in sublist[0]][ + -self.context_length * 2 : + ] + + content_chunks = [ + content[i : i + 2 * self.max_img_per_request] + for i in range(0, len(content), 2 * self.max_img_per_request) + ] + + with ThreadPoolExecutor(max_workers=5) as executor: + future_to_chunk = { + executor.submit(self.process_chunk, chunk, sys_prompt, user_prompt): chunk + for chunk in content_chunks + } + + for future in as_completed(future_to_chunk): + chunk = future_to_chunk[future] + try: + result = future.result() + if result: + self.update_dict(A, timestamps_dict, result) + except Exception as e: + print(f"Exception occurred: {e}") + if A not in timestamps_dict: + return None, "debug_text", None, None + + timestamps_lst = timestamps_dict[A] + if timestamps_lst is None: + return None, "debug_text", None, None + timestamps_lst = sorted(timestamps_lst, reverse=True) + # return None + return self.owl_locater(A, encoded_image, timestamps_lst) + + def localize_A(self, A, debug=True, return_debug=False): + encoded_image = OrderedDict() + counts = torch.bincount(self.voxel_pcd._obs_counts) + filtered_obs = (counts > self.count_threshold).nonzero(as_tuple=True)[0].tolist() + filtered_obs = sorted(filtered_obs) + + for obs_id in filtered_obs: + obs_id -= 1 + rgb = np.copy(self.voxel_map_wrapper.observations[obs_id].rgb.numpy()) + depth = self.voxel_map_wrapper.observations[obs_id].depth + camera_pose = self.voxel_map_wrapper.observations[obs_id].camera_pose + camera_K = self.voxel_map_wrapper.observations[obs_id].camera_K + xyz = get_xyz(depth, camera_pose, camera_K)[0] + depth = depth.numpy() + + rgb[depth > 2.5] = [0, 0, 0] + + image = Image.fromarray(rgb.astype(np.uint8), mode="RGB") + if "gemini" in self.existence_checking_model: + encoded_image[obs_id] = [ + [f"Following is the image took on timestamp {obs_id}: ", image], + {"image": image, "xyz": xyz, "depth": depth}, + ] + elif "gpt" in self.existence_checking_model: + buffered = BytesIO() + image.save(buffered, format="PNG") + img_bytes = buffered.getvalue() + base64_encoded = base64.b64encode(img_bytes).decode("utf-8") + encoded_image[obs_id] = [ + { + "type": "text", + "text": f"Following is the image took on timestamp {obs_id}: ", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{base64_encoded}"}, + }, + {"image": image, "xyz": xyz, "depth": depth}, + ] + # print(obs_id) + + start_time = time.time() + target_point, debug_text, obs, point = self.llm_locator(A, encoded_image) + end_time = time.time() + print("It takes", end_time - start_time) + + if not debug: + return target_point + elif not return_debug: + return target_point, debug_text + else: + return target_point, debug_text, obs, point diff --git a/src/stretch/dynav/llm_server.py b/src/stretch/dynav/llm_server.py new file mode 100644 index 000000000..28834518d --- /dev/null +++ b/src/stretch/dynav/llm_server.py @@ -0,0 +1,581 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. 
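As an illustrative aside on the llm_localizer.py code above (not part of this patch): the sys_prompt asks the model for lines of the form "object: t1,t2,...", and update_dict() turns that reply into a per-object list of timestamps. A minimal check of that parsing is sketched below; it assumes the module's heavy dependencies (google-generativeai, openai, transformers, torch) are importable.

from stretch.dynav.llm_localizer import LLM_Localizer

# update_dict() never touches instance state, so it can be called unbound here.
timestamps: dict = {}
reply = "blue bottle: 3,6,9\norange chair: None\nwhite box: 2,4,10"
LLM_Localizer.update_dict(None, "white box", timestamps, reply)
assert timestamps == {"white box": [2, 4, 10]}
# Lines for other objects are skipped because the query string is not in their
# key; a reply of "white box: None" would instead record None, which llm_locator
# treats as "never observed" and returns no target point.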
+# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import datetime +import os +import pickle +import threading +import time +from io import BytesIO +from pathlib import Path + +import cv2 +import numpy as np +import rerun as rr +import scipy +import torch +from matplotlib import pyplot as plt +from PIL import Image + +from stretch.core import get_parameters +from stretch.dynav.communication_util import load_socket, recv_everything +from stretch.dynav.llm_localizer import LLM_Localizer +from stretch.dynav.mapping_utils.a_star import AStar +from stretch.dynav.mapping_utils.voxel import SparseVoxelMap +from stretch.dynav.mapping_utils.voxel_map import SparseVoxelMapNavigationSpace + + +def get_inv_intrinsics(intrinsics): + # return intrinsics.double().inverse().to(intrinsics) + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) + inv_intrinsics = torch.zeros_like(intrinsics) + inv_intrinsics[..., 0, 0] = 1.0 / fx + inv_intrinsics[..., 1, 1] = 1.0 / fy + inv_intrinsics[..., 0, 2] = -ppx / fx + inv_intrinsics[..., 1, 2] = -ppy / fy + inv_intrinsics[..., 2, 2] = 1.0 + return inv_intrinsics + + +def get_xyz(depth, pose, intrinsics): + """Returns the XYZ coordinates for a set of points. + + Args: + depth: The depth array, with shape (B, 1, H, W) + pose: The pose array, with shape (B, 4, 4) + intrinsics: The intrinsics array, with shape (B, 3, 3) + + Returns: + The XYZ coordinates of the projected points, with shape (B, H, W, 3) + """ + if not isinstance(depth, torch.Tensor): + depth = torch.from_numpy(depth) + if not isinstance(pose, torch.Tensor): + pose = torch.from_numpy(pose) + if not isinstance(intrinsics, torch.Tensor): + intrinsics = torch.from_numpy(intrinsics) + while depth.ndim < 4: + depth = depth.unsqueeze(0) + while pose.ndim < 3: + pose = pose.unsqueeze(0) + while intrinsics.ndim < 3: + intrinsics = intrinsics.unsqueeze(0) + (bsz, _, height, width), device, dtype = depth.shape, depth.device, intrinsics.dtype + + # Gets the pixel grid. + xs, ys = torch.meshgrid( + torch.arange(0, width, device=device, dtype=dtype), + torch.arange(0, height, device=device, dtype=dtype), + indexing="xy", + ) + xy = torch.stack([xs, ys], dim=-1).flatten(0, 1).unsqueeze(0).repeat_interleave(bsz, 0) + xyz = torch.cat((xy, torch.ones_like(xy[..., :1])), dim=-1) + + # Applies intrinsics and extrinsics. 
+ # xyz = xyz @ intrinsics.inverse().transpose(-1, -2) + xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) + xyz = xyz * depth.flatten(1).unsqueeze(-1) + xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] + + xyz = xyz.unflatten(1, (height, width)) + + return xyz + + +class ImageProcessor: + def __init__( + self, + vision_method="pro_owl", + siglip=True, + device="cuda", + min_depth=0.25, + max_depth=2.5, + img_port=5555, + text_port=5556, + open_communication=True, + rerun=True, + static=True, + log=None, + ): + self.static = static + self.siglip = siglip + current_datetime = datetime.datetime.now() + if log is None: + self.log = "debug_" + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + else: + self.log = log + self.rerun = rerun + if self.rerun: + rr.init(self.log, spawn=True) + # if self.static: + # rr.init(self.log, spawn = False) + # rr.connect('100.108.67.79:9876') + # else: + # rr.init(self.log, spawn = True) + self.min_depth = min_depth + self.max_depth = max_depth + self.obs_count = 0 + # There are three vision methods: + # 1. 'mask*lip' Following the idea of https://arxiv.org/abs/2112.01071, remove the last layer of any VLM and obtain the dense features + # 2. 'mask&*lip' Following the idea of https://mahis.life/clip-fields/, extract segmentation mask and assign a vision-language feature to it + self.vision_method = vision_method + # If cuda is not available, then device will be forced to be cpu + if not torch.cuda.is_available(): + device = "cpu" + self.device = device + + self.create_obstacle_map() + self.create_vision_model() + + self.voxel_map_lock = ( + threading.Lock() + ) # Create a lock for synchronizing access to `self.voxel_map_localizer` + + if open_communication: + self.img_socket = load_socket(img_port) + self.text_socket = load_socket(text_port) + + self.img_thread = threading.Thread(target=self._recv_image) + self.img_thread.daemon = True + self.img_thread.start() + + def create_obstacle_map(self): + print("- Load parameters") + parameters = get_parameters("dynav_config.yaml") + self.default_expand_frontier_size = parameters["default_expand_frontier_size"] + self.voxel_map = SparseVoxelMap( + resolution=parameters["voxel_size"], + local_radius=parameters["local_radius"], + obs_min_height=parameters["obs_min_height"], + obs_max_height=parameters["obs_max_height"], + obs_min_density=parameters["obs_min_density"], + exp_min_density=parameters["exp_min_density"], + min_depth=self.min_depth, + max_depth=self.max_depth, + pad_obstacles=parameters["pad_obstacles"], + add_local_radius_points=parameters.get("add_local_radius_points", default=True), + remove_visited_from_obstacles=parameters.get( + "remove_visited_from_obstacles", default=False + ), + smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), + use_median_filter=parameters.get("filters/use_median_filter", False), + median_filter_size=parameters.get("filters/median_filter_size", 5), + median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), + use_derivative_filter=parameters.get("filters/use_derivative_filter", False), + derivative_filter_threshold=parameters.get("filters/derivative_filter_threshold", 0.5), + ) + self.space = SparseVoxelMapNavigationSpace( + self.voxel_map, + # step_size=parameters["step_size"], + rotation_step_size=parameters["rotation_step_size"], + dilate_frontier_size=parameters[ + "dilate_frontier_size" + ], # 0.6 meters back from every edge = 12 * 0.02 = 0.24 + 
dilate_obstacle_size=parameters["dilate_obstacle_size"], + ) + + # Create a simple motion planner + self.planner = AStar(self.space) + + def create_vision_model(self): + if self.vision_method == "gpt_owl": + self.voxel_map_localizer = LLM_Localizer( + self.voxel_map, exist_model="gpt-4o", loc_model="owlv2", device=self.device + ) + elif self.vision_method == "flash_owl": + self.voxel_map_localizer = LLM_Localizer( + self.voxel_map, + exist_model="gemini-1.5-flash", + loc_model="owlv2", + device=self.device, + ) + elif self.vision_method == "pro_owl": + self.voxel_map_localizer = LLM_Localizer( + self.voxel_map, exist_model="gemini-1.5-pro", loc_model="owlv2", device=self.device + ) + + def process_text(self, text, start_pose): + if self.rerun: + rr.log("/object", rr.Clear(recursive=True), static=self.static) + rr.log("/robot_start_pose", rr.Clear(recursive=True), static=self.static) + rr.log("/direction", rr.Clear(recursive=True), static=self.static) + rr.log("robot_monologue", rr.Clear(recursive=True), static=self.static) + rr.log( + "/Past_observation_most_similar_to_text", + rr.Clear(recursive=True), + static=self.static, + ) + if not self.static: + rr.connect("100.108.67.79:9876") + debug_text = "" + mode = "navigation" + obs = None + # Do visual grounding + if text is not None and text != "": + with self.voxel_map_lock: + localized_point, debug_text, obs, pointcloud = self.voxel_map_localizer.localize_A( + text, debug=True, return_debug=True + ) + if localized_point is not None: + rr.log( + "/object", + rr.Points3D( + [localized_point[0], localized_point[1], 1.5], + colors=torch.Tensor([1, 0, 0]), + radii=0.1, + ), + static=self.static, + ) + # Do Frontier based exploration + if text is None or text == "" or localized_point is None: + debug_text += "## Navigation fails, so robot starts exploring environments.\n" + localized_point = self.sample_frontier(start_pose, text) + mode = "exploration" + rr.log( + "/object", + rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), + static=self.static, + ) + print("\n", localized_point, "\n") + + if localized_point is None: + return [] + + if len(localized_point) == 2: + localized_point = np.array([localized_point[0], localized_point[1], 0]) + + point = self.sample_navigation(start_pose, localized_point) + + if self.rerun: + buf = BytesIO() + plt.savefig(buf, format="png") + buf.seek(0) + img = Image.open(buf) + img = np.array(img) + buf.close() + rr.log("2d_map", rr.Image(img), static=self.static) + else: + if text != "": + plt.savefig(self.log + "/debug_" + text + str(self.obs_count) + ".png") + else: + plt.savefig(self.log + "/debug_exploration" + str(self.obs_count) + ".png") + plt.clf() + + if self.rerun: + if text is not None and text != "": + debug_text = "### The goal is to navigate to " + text + ".\n" + debug_text + else: + debug_text = "### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n" + debug_text = "# Robot's monologue: \n" + debug_text + rr.log( + "robot_monologue", + rr.TextDocument(debug_text, media_type=rr.MediaType.MARKDOWN), + static=self.static, + ) + + if obs is not None and mode == "navigation": + rgb = self.voxel_map.observations[obs].rgb + if not self.rerun: + if isinstance(rgb, torch.Tensor): + rgb = np.array(rgb) + cv2.imwrite(self.log + "/debug_" + text + ".png", rgb[:, :, [2, 1, 0]]) + else: + rr.log("/Past_observation_most_similar_to_text", rr.Image(rgb), static=self.static) + traj = [] + waypoints = None + + if point is 
None: + print("Unable to find any target point, some exception might happen") + else: + print("Target point is", point) + res = self.planner.plan(start_pose, point) + if res.success: + waypoints = [pt.state for pt in res.trajectory] + # If we are navigating to some object of interest, send (x, y, z) of + # the object so that we can make sure the robot looks at the object after navigation + finished = len(waypoints) <= 10 and mode == "navigation" + if not finished: + waypoints = waypoints[:8] + traj = self.planner.clean_path_for_xy(waypoints) + # traj = traj[1:] + if finished: + traj.append([np.nan, np.nan, np.nan]) + if isinstance(localized_point, torch.Tensor): + localized_point = localized_point.tolist() + traj.append(localized_point) + print("Planned trajectory:", traj) + else: + print("[FAILURE]", res.reason) + + if traj is not None: + origins = [] + vectors = [] + for idx in range(len(traj)): + if idx != len(traj) - 1: + origins.append([traj[idx][0], traj[idx][1], 1.5]) + vectors.append( + [traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0] + ) + rr.log( + "/direction", + rr.Arrows3D( + origins=origins, vectors=vectors, colors=torch.Tensor([0, 1, 0]), radii=0.05 + ), + static=self.static, + ) + rr.log( + "/robot_start_pose", + rr.Points3D( + [start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1 + ), + static=self.static, + ) + + # self.write_to_pickle() + return traj + + def sample_navigation(self, start, point): + plt.clf() + obstacles, _ = self.voxel_map.get_2d_map() + plt.imshow(obstacles) + if point is None: + start_pt = self.planner.to_pt(start) + plt.scatter(start_pt[1], start_pt[0], s=10) + return None + goal = self.space.sample_target_point(start, point, self.planner) + print("point:", point, "goal:", goal) + obstacles, explored = self.voxel_map.get_2d_map() + start_pt = self.planner.to_pt(start) + plt.scatter(start_pt[1], start_pt[0], s=15, c="b") + point_pt = self.planner.to_pt(point) + plt.scatter(point_pt[1], point_pt[0], s=15, c="g") + if goal is not None: + goal_pt = self.planner.to_pt(goal) + plt.scatter(goal_pt[1], goal_pt[0], s=10, c="r") + return goal + + def sample_frontier(self, start_pose=[0, 0, 0], text=None): + with self.voxel_map_lock: + if text is not None and text != "": + ( + index, + time_heuristics, + alignments_heuristics, + total_heuristics, + ) = self.space.sample_exploration(start_pose, self.planner, None, None, debug=False) + else: + index, time_heuristics, _, total_heuristics = self.space.sample_exploration( + start_pose, self.planner, None, None, debug=False + ) + alignments_heuristics = time_heuristics + + obstacles, explored = self.voxel_map.get_2d_map() + plt.clf() + plt.imshow(obstacles * 0.5 + explored * 0.5) + plt.scatter(index[1], index[0], s=20, c="r") + return self.voxel_map.grid_coords_to_xyt(torch.tensor([index[0], index[1]])) + + def _recv_image(self): + while True: + rgb, depth, intrinsics, pose = recv_everything(self.img_socket) + start_time = time.time() + self.process_rgbd_images(rgb, depth, intrinsics, pose) + process_time = time.time() - start_time + print("Image processing takes", process_time, "seconds") + + def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights=None, threshold=0.95): + # Adding all points to voxelizedPointCloud is useless and expensive, we should exclude threshold of all points + selected_indices = torch.randperm(len(valid_xyz))[: int((1 - threshold) * len(valid_xyz))] + if len(selected_indices) == 0: + return + if valid_xyz is not None: + valid_xyz = 
valid_xyz[selected_indices] + if feature is not None: + feature = feature[selected_indices] + if valid_rgb is not None: + valid_rgb = valid_rgb[selected_indices] + if weights is not None: + weights = weights[selected_indices] + with self.voxel_map_lock: + self.voxel_map_localizer.add( + points=valid_xyz, + features=feature, + rgb=valid_rgb, + weights=weights, + obs_count=self.obs_count, + ) + + def process_rgbd_images(self, rgb, depth, intrinsics, pose): + if not os.path.exists(self.log): + os.mkdir(self.log) + self.obs_count += 1 + world_xyz = get_xyz(depth, pose, intrinsics).squeeze(0) + + cv2.imwrite("debug/rgb" + str(self.obs_count) + ".jpg", rgb[:, :, [2, 1, 0]]) + + rgb, depth = torch.from_numpy(rgb), torch.from_numpy(depth) + rgb = rgb.permute(2, 0, 1).to(torch.uint8) + + median_depth = torch.from_numpy(scipy.ndimage.median_filter(depth, size=5)) + median_filter_error = (depth - median_depth).abs() + valid_depth = torch.logical_and(depth < self.max_depth, depth > self.min_depth) + valid_depth = valid_depth & (median_filter_error < 0.01).bool() + + with self.voxel_map_lock: + self.voxel_map_localizer.voxel_pcd.clear_points( + depth, torch.from_numpy(intrinsics), torch.from_numpy(pose), min_samples_clear=20 + ) + self.voxel_map.voxel_pcd.clear_points( + depth, torch.from_numpy(intrinsics), torch.from_numpy(pose) + ) + + if "_owl" in self.vision_method: + self.run_llm_owl(rgb, ~valid_depth, world_xyz) + + self.voxel_map.add( + camera_pose=torch.Tensor(pose), + rgb=torch.Tensor(rgb).permute(1, 2, 0), + depth=torch.Tensor(depth), + camera_K=torch.Tensor(intrinsics), + ) + obs, exp = self.voxel_map.get_2d_map() + if self.rerun: + if self.voxel_map.voxel_pcd._points is not None: + rr.log( + "Obstalce_map/pointcloud", + rr.Points3D( + self.voxel_map.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) + if self.voxel_map_localizer.voxel_pcd._points is not None: + rr.log( + "Semantic_memory/pointcloud", + rr.Points3D( + self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) + else: + cv2.imwrite( + self.log + "/debug_" + str(self.obs_count) + ".jpg", + np.asarray(obs.int() * 127 + exp.int() * 127), + ) + + def run_llm_owl(self, rgb, mask, world_xyz): + valid_xyz = world_xyz[~mask] + valid_rgb = rgb.permute(1, 2, 0)[~mask] + if len(valid_xyz) != 0: + self.add_to_voxel_pcd(valid_xyz, None, valid_rgb) + + def read_from_pickle(self, pickle_file_name, num_frames: int = -1): + print("Reading from ", pickle_file_name) + rr.init("Debug", spawn=True) + if isinstance(pickle_file_name, str): + pickle_file_name = Path(pickle_file_name) + assert pickle_file_name.exists(), f"No file found at {pickle_file_name}" + with pickle_file_name.open("rb") as f: + data = pickle.load(f) + for i, (camera_pose, xyz, rgb, feats, depth, base_pose, K, world_xyz,) in enumerate( + zip( + data["camera_poses"], + data["xyz"], + data["rgb"], + data["feats"], + data["depth"], + data["base_poses"], + data["camera_K"], + data["world_xyz"], + ) + ): + # Handle the case where we dont actually want to load everything + if num_frames > 0 and i >= num_frames: + break + + camera_pose = self.voxel_map.fix_data_type(camera_pose) + xyz = self.voxel_map.fix_data_type(xyz) + rgb = self.voxel_map.fix_data_type(rgb) + depth = self.voxel_map.fix_data_type(depth) + intrinsics = self.voxel_map.fix_data_type(K) + if feats is not None: + 
feats = self.voxel_map.fix_data_type(feats) + base_pose = self.voxel_map.fix_data_type(base_pose) + self.voxel_map.voxel_pcd.clear_points(depth, intrinsics, camera_pose) + self.voxel_map.add( + camera_pose=camera_pose, + xyz=xyz, + rgb=rgb, + feats=feats, + depth=depth, + base_pose=base_pose, + camera_K=K, + ) + self.obs_count += 1 + self.voxel_map_localizer.voxel_pcd._points = data["combined_xyz"] + self.voxel_map_localizer.voxel_pcd._features = data["combined_feats"] + self.voxel_map_localizer.voxel_pcd._weights = data["combined_weights"] + self.voxel_map_localizer.voxel_pcd._rgb = data["combined_rgb"] + self.voxel_map_localizer.voxel_pcd._obs_counts = data["obs_id"] + self.voxel_map_localizer.voxel_pcd._entity_ids = data["entity_id"] + self.voxel_map_localizer.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() + self.voxel_map.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() + + def write_to_pickle(self): + """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" + if not os.path.exists("debug"): + os.mkdir("debug") + filename = "debug/" + self.log + ".pkl" + data = {} + data["camera_poses"] = [] + data["camera_K"] = [] + data["base_poses"] = [] + data["xyz"] = [] + data["world_xyz"] = [] + data["rgb"] = [] + data["depth"] = [] + data["feats"] = [] + for frame in self.voxel_map.observations: + # add it to pickle + # TODO: switch to using just Obs struct? + data["camera_poses"].append(frame.camera_pose) + data["base_poses"].append(frame.base_pose) + data["camera_K"].append(frame.camera_K) + data["xyz"].append(frame.xyz) + data["world_xyz"].append(frame.full_world_xyz) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + data["feats"].append(frame.feats) + for k, v in frame.info.items(): + if k not in data: + data[k] = [] + data[k].append(v) + ( + data["combined_xyz"], + data["combined_feats"], + data["combined_weights"], + data["combined_rgb"], + ) = self.voxel_map_localizer.voxel_pcd.get_pointcloud() + data["obs_id"] = self.voxel_map_localizer.voxel_pcd._obs_counts + data["entity_id"] = self.voxel_map_localizer.voxel_pcd._entity_ids + with open(filename, "wb") as f: + pickle.dump(data, f) + print("write all data to", filename) diff --git a/src/stretch/dynav/mapping_utils/__init__.py b/src/stretch/dynav/mapping_utils/__init__.py new file mode 100644 index 000000000..2532abc5e --- /dev/null +++ b/src/stretch/dynav/mapping_utils/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. diff --git a/src/stretch/dynav/mapping_utils/a_star.py b/src/stretch/dynav/mapping_utils/a_star.py new file mode 100644 index 000000000..95e4b1b44 --- /dev/null +++ b/src/stretch/dynav/mapping_utils/a_star.py @@ -0,0 +1,344 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import heapq +import math +import time +from typing import List, Set, Tuple + +import numpy as np + +from stretch.dynav.mapping_utils.voxel_map import SparseVoxelMapNavigationSpace +from stretch.motion import PlanResult + + +class Node: + """Stores an individual spot in the tree""" + + def __init__(self, state): + self.state = state + + +def neighbors(pt: Tuple[int, int]) -> List[Tuple[int, int]]: + return [ + (pt[0] + dx, pt[1] + dy) for dx in range(-1, 2) for dy in range(-1, 2) if (dx, dy) != (0, 0) + ] + + +class AStar: + """Define RRT planning problem and parameters""" + + def __init__( + self, + space: SparseVoxelMapNavigationSpace, + ): + """Create RRT planner with configuration""" + self.space = space + self.reset() + + def compute_theta(self, cur_x, cur_y, end_x, end_y): + theta = 0 + if end_x == cur_x and end_y >= cur_y: + theta = np.pi / 2 + elif end_x == cur_x and end_y < cur_y: + theta = -np.pi / 2 + else: + theta = np.arctan((end_y - cur_y) / (end_x - cur_x)) + if end_x < cur_x: + theta = theta + np.pi + # move theta into [-pi, pi] range, for this function specifically, + # (theta -= 2 * pi) or (theta += 2 * pi) is enough + if theta > np.pi: + theta = theta - 2 * np.pi + if theta < np.pi: + theta = theta + 2 * np.pi + return theta + + def reset(self): + # print('loading the up to date navigable map') + # print('Wait') + obs, exp = self.space.voxel_map.get_2d_map() + # print('up to date navigable map loaded') + self._navigable = ~obs & exp + # self._navigable = ~obs + self.start_time = time.time() + + def point_is_occupied(self, x: int, y: int) -> bool: + return not bool(self._navigable[x][y]) + + def to_pt(self, xy: Tuple[float, float]) -> Tuple[int, int]: + # # type: ignore to bypass mypy checking + xy = np.array([xy[0], xy[1]]) # type: ignore + pt = self.space.voxel_map.xy_to_grid_coords(xy) # type: ignore + return tuple(pt.tolist()) + + def to_xy(self, pt: Tuple[int, int]) -> Tuple[float, float]: + # # type: ignore to bypass mypy checking + pt = np.array([pt[0], pt[1]]) # type: ignore + xy = self.space.voxel_map.grid_coords_to_xy(pt) # type: ignore + return tuple(xy.tolist()) + + def compute_dis(self, a: Tuple[int, int], b: Tuple[int, int]): + return ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5 + + def compute_obstacle_punishment(self, a: Tuple[int, int], weight: int, avoid: int) -> float: + obstacle_punishment = 0 + # it is a trick, if we compute euclidean dis between a and every obstacle, + # this single compute_obstacle_punishment will be O(n) complexity + # so we just check a square of size (2*avoid) * (2*avoid) + # centered at a, which is O(1) complexity + for i in range(-avoid, avoid + 1): + for j in range(-avoid, avoid + 1): + if self.point_is_occupied(a[0] + i, a[1] + j): + b = [a[0] + i, a[1] + j] + obs_dis = ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5 + obstacle_punishment = max((weight / max(obs_dis, 1)), obstacle_punishment) + return obstacle_punishment + + # A* heuristic + def compute_heuristic(self, a: Tuple[int, int], b: Tuple[int, int], weight=6, avoid=3) -> float: + return ( + self.compute_dis(a, b) + + weight * self.compute_obstacle_punishment(a, weight, avoid) + + self.compute_obstacle_punishment(b, weight, avoid) + ) + + def is_in_line_of_sight(self, start_pt: Tuple[int, int], end_pt: Tuple[int, int]) -> bool: + """Checks if there is a line-of-sight between two points. 
+ + Implements using Bresenham's line algorithm. + + Args: + start_pt: The starting point. + end_pt: The ending point. + + Returns: + Whether there is a line-of-sight between the two points. + """ + + dx = end_pt[0] - start_pt[0] + dy = end_pt[1] - start_pt[1] + + if abs(dx) > abs(dy): + if dx < 0: + start_pt, end_pt = end_pt, start_pt + for x in range(start_pt[0], end_pt[0] + 1): + yf = start_pt[1] + (x - start_pt[0]) / dx * dy + for y in list({math.floor(yf), math.ceil(yf)}): + if self.point_is_occupied(x, y): + return False + + else: + if dy < 0: + start_pt, end_pt = end_pt, start_pt + for y in range(start_pt[1], end_pt[1] + 1): + xf = start_pt[0] + (y - start_pt[1]) / dy * dx + for x in list({math.floor(xf), math.ceil(xf)}): + if self.point_is_occupied(x, y): + return False + + return True + + def is_a_line(self, a, b, c): + if a[0] == b[0]: + return c[0] == a[0] + if b[0] == c[0]: + return False + return ((c[1] - b[1]) / (c[0] - b[0])) == ((b[1] - a[1]) / (b[0] - a[0])) + + def clean_path_for_xy(self, waypoints): + goal = waypoints[-1] + waypts = [self.to_pt(waypoint) for waypoint in waypoints] + waypts = self.clean_path(waypts) + waypoints = [self.to_xy(waypt) for waypt in waypts] + traj = [] + for i in range(len(waypoints) - 1): + theta = self.compute_theta( + waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1] + ) + traj.append([waypoints[i][0], waypoints[i][1], float(theta)]) + traj.append([waypoints[-1][0], waypoints[-1][1], goal[-1]]) + return traj + + def clean_path(self, path) -> List[Tuple[int, int]]: + """Cleans up the final path. + + This implements a simple algorithm where, given some current position, + we find the last point in the path that is in line-of-sight, and then + we set the current position to that point. This is repeated until we + reach the end of the path. This is not particularly efficient, but + it's simple and works well enough. + + Args: + path: The path to clean up. + + Returns: + The cleaned up path. + """ + cleaned_path = [path[0]] + i = 0 + while i < len(path) - 1: + for j in range(len(path) - 1, i, -1): + if self.is_in_line_of_sight(path[i][:2], path[j][:2]): + break + else: + j = i + 1 + # Include the mid waypoint to avoid the collision + # if j - i >= 2: + # cleaned_path.append(path[(i + j) // 2]) + cleaned_path.append(path[j]) + i = j + return cleaned_path + + def get_unoccupied_neighbor(self, pt: Tuple[int, int], goal_pt=None) -> Tuple[int, int]: + if not self.point_is_occupied(*pt): + return pt + + # This is a sort of hack to deal with points that are close to an edge. + # If the start point is occupied, we check adjacent points until we get + # one which is unoccupied. If we can't find one, we throw an error. + neighbor_pts = neighbors(pt) + if goal_pt is not None: + goal_pt_non_null = goal_pt + neighbor_pts.sort(key=lambda n: self.compute_heuristic(n, goal_pt_non_null)) + for neighbor_pt in neighbor_pts: + if not self.point_is_occupied(*neighbor_pt): + return neighbor_pt + print( + "The robot might stand on a non navigable point, so check obstacle map and restart roslaunch" + ) + return None + # raise ValueError("The robot might stand on a non navigable point, so check obstacle map and restart roslaunch") + + def get_reachable_points(self, start_pt: Tuple[int, int]) -> Set[Tuple[int, int]]: + """Gets all reachable points from a given starting point. 
+ + Args: + start_pt: The starting point + + Returns: + The set of all reachable points + """ + + self.reset() + start_pt = self.get_unoccupied_neighbor(start_pt) + if start_pt is None: + return set() + + reachable_points: set[Tuple[int, int]] = set() + to_visit = [start_pt] + while to_visit: + pt = to_visit.pop() + if pt in reachable_points: + continue + reachable_points.add(pt) + for new_pt in neighbors(pt): + if new_pt in reachable_points: + continue + if self.point_is_occupied(new_pt[0], new_pt[1]): + continue + to_visit.append(new_pt) + return reachable_points + + def run_astar( + self, + start_xy: Tuple[float, float], + end_xy: Tuple[float, float], + remove_line_of_sight_points: bool = False, + ) -> List[Tuple[float, float]]: + + start_pt, end_pt = self.to_pt(start_xy), self.to_pt(end_xy) + + # Checks that both points are unoccupied. + # start_pt = self.get_unoccupied_neighbor(start_pt, end_pt) + start_pt = self.get_unoccupied_neighbor(start_pt) + end_pt = self.get_unoccupied_neighbor(end_pt, start_pt) + # print('A* formally starts ', time.time() - self.start_time, ' seconds after path planning starts') + if start_pt is None or end_pt is None: + return None + + # Implements A* search. + q = [(0, start_pt)] + came_from: dict = {start_pt: None} + cost_so_far: dict[Tuple[int, int], float] = {start_pt: 0.0} + while q: + _, current = heapq.heappop(q) + + if current == end_pt: + break + + for nxt in neighbors(current): + if self.point_is_occupied(nxt[0], nxt[1]): + continue + new_cost = cost_so_far[current] + self.compute_heuristic(current, nxt) + if nxt not in cost_so_far or new_cost < cost_so_far[nxt]: + cost_so_far[nxt] = new_cost + priority = new_cost + self.compute_heuristic(end_pt, nxt) + heapq.heappush(q, (priority, nxt)) # type: ignore + came_from[nxt] = current + + else: + return None + + # Reconstructs the path. + path = [] + current = end_pt + while current != start_pt: + path.append(current) + prev = came_from[current] + if prev is None: + break + current = prev + path.append(start_pt) + path.reverse() + + # Clean up the path. + if remove_line_of_sight_points: + path = self.clean_path(path) + + # return [start_xy] + [self.to_xy(pt) for pt in path[1:-1]] + [end_xy] + return [start_xy] + [self.to_xy(pt) for pt in path[1:]] + + def plan(self, start, goal, verbose: bool = True) -> PlanResult: + """plan from start to goal. creates a new tree. 
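run_astar above is a fairly standard best-first loop over grid neighbors driven by a priority queue, a came_from map, and a cost_so_far map. A compact, fully self-contained version of that loop on a toy grid, using plain Euclidean edge costs instead of the obstacle-aware heuristic; it is purely illustrative.

import heapq
import numpy as np

occupied = np.zeros((15, 15), dtype=bool)
occupied[7, 2:13] = True  # a wall with gaps at both borders

def neighbors(pt):
    return [(pt[0] + dx, pt[1] + dy)
            for dx in (-1, 0, 1) for dy in (-1, 0, 1) if (dx, dy) != (0, 0)]

def dist(a, b):
    return ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5

def astar(start, goal):
    q = [(0.0, start)]
    came_from = {start: None}
    cost_so_far = {start: 0.0}
    while q:
        _, current = heapq.heappop(q)
        if current == goal:
            break
        for nxt in neighbors(current):
            if not (0 <= nxt[0] < 15 and 0 <= nxt[1] < 15) or occupied[nxt]:
                continue
            new_cost = cost_so_far[current] + dist(current, nxt)
            if nxt not in cost_so_far or new_cost < cost_so_far[nxt]:
                cost_so_far[nxt] = new_cost
                heapq.heappush(q, (new_cost + dist(goal, nxt), nxt))
                came_from[nxt] = current
    else:
        return None  # queue exhausted without reaching the goal
    path, cur = [], goal
    while cur is not None:
        path.append(cur)
        cur = came_from[cur]
    return path[::-1]

print(astar((3, 7), (12, 7)))  # routes around the wall through one of the gaps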
+ + Based on Caelan Garrett's code (MIT licensed): + https://github.com/caelan/motion-planners/blob/master/motion_planners/rrt.py + """ + # assert len(start) == self.space.dof, "invalid start dimensions" + # assert len(goal) == self.space.dof, "invalid goal dimensions" + # self.start_time = time.time() + self.reset() + if not self.space.is_valid(goal): + if verbose: + print("[Planner] invalid goal") + return PlanResult(False, reason="[Planner] invalid goal") + # Add start to the tree + # print('Start running A* ', time.time() - self.start_time, ' seconds after path planning starts') + waypoints = self.run_astar(start[:2], goal[:2]) + # print('Finish running A* ', time.time() - self.start_time, ' seconds after path planning starts') + + if waypoints is None: + if verbose: + print("A* fails, check obstacle map") + return PlanResult(False, reason="A* fails, check obstacle map") + trajectory = [] + for i in range(len(waypoints) - 1): + theta = self.compute_theta( + waypoints[i][0], waypoints[i][1], waypoints[i + 1][0], waypoints[i + 1][1] + ) + trajectory.append(Node([waypoints[i][0], waypoints[i][1], float(theta)])) + trajectory.append(Node([waypoints[-1][0], waypoints[-1][1], goal[-1]])) + # print('Finish computing theta ', time.time() - self.start_time, ' seconds after path planning starts') + return PlanResult(True, trajectory=trajectory) diff --git a/src/stretch/dynav/mapping_utils/voxel.py b/src/stretch/dynav/mapping_utils/voxel.py new file mode 100644 index 000000000..8f39f87a6 --- /dev/null +++ b/src/stretch/dynav/mapping_utils/voxel.py @@ -0,0 +1,968 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import math +import pickle +from collections import namedtuple +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import open3d as open3d +import scipy +import skimage +import torch +from scipy.ndimage import maximum_filter +from torch import Tensor + +from stretch.core.interfaces import Observations +from stretch.dynav.mapping_utils.voxelized_pcd import VoxelizedPointcloud, scatter3d +from stretch.motion import Footprint, PlanResult, RobotModel +from stretch.utils.morphology import binary_dilation, binary_erosion, get_edges +from stretch.utils.point_cloud import create_visualization_geometries, numpy_to_pcd +from stretch.utils.point_cloud_torch import unproject_masked_depth_to_xyz_coordinates +from stretch.utils.visualization import create_disk + +Frame = namedtuple( + "Frame", + [ + "camera_pose", + "camera_K", + "xyz", + "rgb", + "feats", + "depth", + "base_pose", + "info", + "full_world_xyz", + "xyz_frame", + ], +) + +VALID_FRAMES = ["camera", "world"] + +DEFAULT_GRID_SIZE = [180, 180] + +logger = logging.getLogger(__name__) + + +def ensure_tensor(arr): + if isinstance(arr, np.ndarray): + return Tensor(arr) + if not isinstance(arr, Tensor): + raise ValueError(f"arr of unknown type ({type(arr)}) cannot be cast to Tensor") + + +class SparseVoxelMap(object): + """Create a voxel map object which captures 3d information. 
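A hypothetical caller-side sketch for the planner above: build it from an existing navigation space and walk the returned trajectory. The module path in the first import and the pre-populated `voxel_map` are assumptions made only for illustration; AStar, SparseVoxelMapNavigationSpace, and PlanResult themselves come from this patch.

import numpy as np
from stretch.dynav.mapping_utils.astar import AStar          # module path assumed
from stretch.dynav.mapping_utils.voxel_map import SparseVoxelMapNavigationSpace

space = SparseVoxelMapNavigationSpace(voxel_map, step_size=0.1, rotation_step_size=0.2)
planner = AStar(space)

start = np.array([0.0, 0.0, 0.0])      # x, y, theta in the map frame
goal = np.array([1.5, 0.5, np.pi / 2])

res = planner.plan(start, goal)
if res.success:
    for node in res.trajectory:
        x, y, theta = node.state        # each waypoint already carries a heading
        print(f"waypoint: {x:.2f}, {y:.2f}, {theta:.2f}")
else:
    print("planning failed:", res.reason)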
+ + This class represents a 3D voxel map used for capturing environmental information. It provides various parameters + for configuring the map's properties, such as resolution, feature dimensions. + + Attributes: + resolution (float): The size of a voxel in meters. + feature_dim (int): The size of feature embeddings to capture per-voxel point. + grid_size (Tuple[int, int]): The dimensions of the voxel grid (optional). + grid_resolution (float): The resolution of the grid (optional). + obs_min_height (float): The minimum height for observations. + obs_max_height (float): The maximum height for observations. + obs_min_density (float): The minimum density for observations. + smooth_kernel_size (int): The size of the smoothing kernel. + add_local_radius_points (bool): Whether to add local radius points. + remove_visited_from_obstacles(bool): subtract out observations potentially of the robot + local_radius (float): The radius for local points. + min_depth (float): The minimum depth for observations. + max_depth (float): The maximum depth for observations. + pad_obstacles (int): Padding for obstacles. + voxel_kwargs (Dict[str, Any]): Additional voxel configuration. + map_2d_device (str): The device for 2D mapping. + """ + + DEFAULT_INSTANCE_MAP_KWARGS = dict( + du_scale=1, + instance_association="bbox_iou", + log_dir_overwrite_ok=True, + mask_cropped_instances="False", + ) + + def __init__( + self, + resolution: float = 0.1, + feature_dim: int = 3, + grid_size: Tuple[int, int] = None, + grid_resolution: float = 0.1, + obs_min_height: float = 0.1, + obs_max_height: float = 1.5, + obs_min_density: float = 50, + exp_min_density: float = 5, + smooth_kernel_size: int = 2, + add_local_radius_points: bool = True, + remove_visited_from_obstacles: bool = False, + local_radius: float = 0.8, + min_depth: float = 0.25, + max_depth: float = 2.0, + pad_obstacles: int = 0, + voxel_kwargs: Dict[str, Any] = {}, + map_2d_device: str = "cpu", + use_median_filter: bool = False, + median_filter_size: int = 5, + median_filter_max_error: float = 0.01, + use_derivative_filter: bool = False, + derivative_filter_threshold: float = 0.5, + point_update_threshold: float = 0.9, + ): + """ + Args: + resolution(float): in meters, size of a voxel + feature_dim(int): size of feature embeddings to capture per-voxel point + """ + # TODO: We an use fastai.store_attr() to get rid of this boilerplate code + self.resolution = resolution + self.feature_dim = feature_dim + self.obs_min_height = obs_min_height + self.obs_max_height = obs_max_height + self.obs_min_density = obs_min_density + self.exp_min_density = exp_min_density + + # Smoothing kernel params + self.smooth_kernel_size = smooth_kernel_size + if self.smooth_kernel_size > 0: + self.smooth_kernel = torch.nn.Parameter( + torch.from_numpy(skimage.morphology.disk(smooth_kernel_size)) + .unsqueeze(0) + .unsqueeze(0) + .float(), + requires_grad=False, + ) + else: + self.smooth_kernel = None + + # Median filter params + self.median_filter_size = median_filter_size + self.use_median_filter = use_median_filter + self.median_filter_max_error = median_filter_max_error + + # Derivative filter params + self.use_derivative_filter = use_derivative_filter + self.derivative_filter_threshold = derivative_filter_threshold + self.point_update_threshold = point_update_threshold + + self.grid_resolution = grid_resolution + self.voxel_resolution = resolution + self.min_depth = min_depth + self.max_depth = max_depth + self.pad_obstacles = int(pad_obstacles) + + self.voxel_kwargs = voxel_kwargs + 
self.map_2d_device = map_2d_device + + if self.pad_obstacles > 0: + self.dilate_obstacles_kernel = torch.nn.Parameter( + torch.from_numpy(skimage.morphology.disk(self.pad_obstacles)) + .unsqueeze(0) + .unsqueeze(0) + .float(), + requires_grad=False, + ) + else: + self.dilate_obstacles_kernel = None + + # Add points with local_radius to the voxel map at (0,0,0) unless we receive lidar points + self._add_local_radius_points = add_local_radius_points + self._remove_visited_from_obstacles = remove_visited_from_obstacles + self.local_radius = local_radius + + # Create disk for mapping explored areas near the robot - since camera can't always see it + self._disk_size = np.ceil(self.local_radius / self.grid_resolution) + + self._visited_disk = torch.from_numpy( + create_disk(self._disk_size, (2 * self._disk_size) + 1) + ).to(map_2d_device) + + if grid_size is not None: + self.grid_size = [grid_size[0], grid_size[1]] + else: + self.grid_size = DEFAULT_GRID_SIZE + # Track the center of the grid - (0, 0) in our coordinate system + # We then just need to update everything when we want to track obstacles + self.grid_origin = Tensor(self.grid_size + [0], device=map_2d_device) // 2 + # Used to track the offset from our observations so maps dont use too much space + + # Used for tensorized bounds checks + self._grid_size_t = Tensor(self.grid_size, device=map_2d_device) + + # Init variables + self.reset() + + def reset(self) -> None: + """Clear out the entire voxel map.""" + self.observations: List[Frame] = [] + # Create voxelized pointcloud + self.voxel_pcd = VoxelizedPointcloud( + voxel_size=self.voxel_resolution, + dim_mins=None, + dim_maxs=None, + feature_pool_method="mean", + **self.voxel_kwargs, + ) + + self._seq = 0 + self._2d_last_updated = -1 + self.reset_cache() + + def reset_cache(self): + """Clear some tracked things""" + # Stores points in 2d coords where robot has been + self._visited = torch.zeros(self.grid_size, device=self.map_2d_device) + + self.voxel_pcd.reset() + + # Store 2d map information + # This is computed from our various point clouds + self._map2d = None + self._history_soft = None + + # def add_obs( + # self, + # obs: Observations, + # camera_K: Optional[torch.Tensor] = None, + # *args, + # **kwargs, + # ): + # """Unpack an observation and convert it properly, then add to memory. Pass all other inputs into the add() function as provided.""" + # rgb = self.fix_data_type(obs.rgb) + # depth = self.fix_data_type(obs.depth) + # xyz = self.fix_data_type(obs.xyz) + # camera_pose = self.fix_data_type(obs.camera_pose) + # base_pose = torch.from_numpy(np.array([obs.gps[0], obs.gps[1], obs.compass[0]])).float() + # K = self.fix_data_type(obs.camera_K) if camera_K is None else camera_K + + # self.add( + # camera_pose=camera_pose, + # xyz=xyz, + # rgb=rgb, + # depth=depth, + # base_pose=base_pose, + # camera_K=K, + # *args, + # **kwargs, + # ) + + def add( + self, + camera_pose: Tensor, + rgb: Tensor, + xyz: Optional[Tensor] = None, + camera_K: Optional[Tensor] = None, + feats: Optional[Tensor] = None, + depth: Optional[Tensor] = None, + base_pose: Optional[Tensor] = None, + instance_image: Optional[Tensor] = None, + instance_classes: Optional[Tensor] = None, + instance_scores: Optional[Tensor] = None, + obs: Optional[Observations] = None, + xyz_frame: str = "camera", + **info, + ): + """Add this to our history of observations. Also update the current running map. 
+ + Parameters: + camera_pose(Tensor): [4,4] cam_to_world matrix + rgb(Tensor): N x 3 color points + camera_K(Tensor): [3,3] camera instrinsics matrix -- usually pinhole model + xyz(Tensor): N x 3 point cloud points in camera coordinates + feats(Tensor): N x D point cloud features; D == 3 for RGB is most common + base_pose(Tensor): optional location of robot base + instance_image(Tensor): [H,W] image of ints where values at a pixel correspond to instance_id + instance_classes(Tensor): [K] tensor of ints where class = instance_classes[instance_id] + instance_scores: [K] of detection confidence score = instance_scores[instance_id] + # obs: observations + """ + # TODO: we should remove the xyz/feats maybe? just use observations as input? + # TODO: switch to using just Obs struct? + # Shape checking + assert rgb.ndim == 3 or rgb.ndim == 2, f"{rgb.ndim=}: must be 2 or 3" + if isinstance(rgb, np.ndarray): + rgb = torch.from_numpy(rgb) + if isinstance(camera_pose, np.ndarray): + camera_pose = torch.from_numpy(camera_pose) + if depth is not None: + assert ( + rgb.shape[:-1] == depth.shape + ), f"depth and rgb image sizes must match; got {rgb.shape=} {depth.shape=}" + assert xyz is not None or (camera_K is not None and depth is not None) + if xyz is not None: + assert ( + xyz.shape[-1] == 3 + ), "xyz must have last dimension = 3 for x, y, z position of points" + assert rgb.shape == xyz.shape, "rgb shape must match xyz" + # Make sure shape is correct here for xyz and any passed-in features + if feats is not None: + assert ( + feats.shape[-1] == self.feature_dim + ), f"features must match voxel feature dimenstionality of {self.feature_dim}" + assert xyz.shape[0] == feats.shape[0], "features must be available for each point" + else: + pass + if isinstance(xyz, np.ndarray): + xyz = torch.from_numpy(xyz) + if depth is not None: + assert depth.ndim == 2 or xyz_frame == "world" + if camera_K is not None: + assert camera_K.ndim == 2, "camera intrinsics K must be a 3x3 matrix" + assert ( + camera_pose.ndim == 2 and camera_pose.shape[0] == 4 and camera_pose.shape[1] == 4 + ), "Camera pose must be a 4x4 matrix representing a pose in SE(3)" + assert ( + xyz_frame in VALID_FRAMES + ), f"frame {xyz_frame} was not valid; should one one of {VALID_FRAMES}" + + # Apply a median filter to remove bad depth values when mapping and exploring + # This is not strictly necessary but the idea is to clean up bad pixels + if depth is not None and self.use_median_filter: + median_depth = torch.from_numpy( + scipy.ndimage.median_filter(depth, size=self.median_filter_size) + ) + median_filter_error = (depth - median_depth).abs() + + # Get full_world_xyz + if xyz is not None: + if xyz_frame == "camera": + full_world_xyz = ( + torch.cat([xyz, torch.ones_like(xyz[..., [0]])], dim=-1) @ camera_pose.T + )[..., :3] + elif xyz_frame == "world": + full_world_xyz = xyz + else: + raise NotImplementedError(f"Unknown xyz_frame {xyz_frame}") + else: + full_world_xyz = unproject_masked_depth_to_xyz_coordinates( # Batchable! 
+ depth=depth.unsqueeze(0).unsqueeze(1), + pose=camera_pose.unsqueeze(0), + inv_intrinsics=torch.linalg.inv(camera_K[:3, :3]).unsqueeze(0), + ) + # add observations before we start changing things + self.observations.append( + Frame( + camera_pose, + camera_K, + xyz, + rgb, + feats, + depth, + base_pose, + info, + full_world_xyz, + xyz_frame=xyz_frame, + ) + ) + + valid_depth = torch.full_like(rgb[:, 0], fill_value=True, dtype=torch.bool) + if depth is not None: + valid_depth = (depth > self.min_depth) & (depth < self.max_depth) + + if self.use_derivative_filter: + edges = get_edges(depth, threshold=self.derivative_filter_threshold) + valid_depth = valid_depth & ~edges + + if self.use_median_filter: + valid_depth = ( + valid_depth & (median_filter_error < self.median_filter_max_error).bool() + ) + + # Add to voxel grid + if feats is not None: + feats = feats[valid_depth].reshape(-1, feats.shape[-1]) + rgb = rgb[valid_depth].reshape(-1, 3) + world_xyz = full_world_xyz.view(-1, 3)[valid_depth.flatten()] + + # TODO: weights could also be confidence, inv distance from camera, etc + if world_xyz.nelement() > 0: + selected_indices = torch.randperm(len(world_xyz))[ + : int((1 - self.point_update_threshold) * len(world_xyz)) + ] + if len(selected_indices) == 0: + return + if world_xyz is not None: + world_xyz = world_xyz[selected_indices] + if feats is not None: + feats = feats[selected_indices] + if rgb is not None: + rgb = rgb[selected_indices] + self.voxel_pcd.add(world_xyz, features=feats, rgb=rgb, weights=None) + + if self._add_local_radius_points: + # TODO: just get this from camera_pose? + self._update_visited(camera_pose[:3, 3].to(self.map_2d_device)) + if base_pose is not None: + self._update_visited(base_pose.to(self.map_2d_device)) + + # Increment sequence counter + self._seq += 1 + + def _update_visited(self, base_pose: Tensor): + """Update 2d map of where robot has visited""" + # Add exploration here + # Base pose can be whatever, going to assume xyt for now + map_xy = ((base_pose[:2] / self.grid_resolution) + self.grid_origin[:2]).int() + x0 = int(map_xy[0] - self._disk_size) + x1 = int(map_xy[0] + self._disk_size + 1) + y0 = int(map_xy[1] - self._disk_size) + y1 = int(map_xy[1] + self._disk_size + 1) + assert x0 >= 0 + assert y0 >= 0 + self._visited[x0:x1, y0:y1] += self._visited_disk + + def write_to_pickle(self, filename: str): + """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" + data: dict[str, Any] = {} + data["camera_poses"] = [] + data["camera_K"] = [] + data["base_poses"] = [] + data["xyz"] = [] + data["world_xyz"] = [] + data["rgb"] = [] + data["depth"] = [] + data["feats"] = [] + for frame in self.observations: + # add it to pickle + # TODO: switch to using just Obs struct? 
+ data["camera_poses"].append(frame.camera_pose) + data["base_poses"].append(frame.base_pose) + data["camera_K"].append(frame.camera_K) + data["xyz"].append(frame.xyz) + data["world_xyz"].append(frame.full_world_xyz) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + data["feats"].append(frame.feats) + for k, v in frame.info.items(): + if k not in data: + data[k] = [] + data[k].append(v) + ( + data["combined_xyz"], + data["combined_feats"], + data["combined_weights"], + data["combined_rgb"], + ) = self.voxel_pcd.get_pointcloud() + with open(filename, "wb") as f: + pickle.dump(data, f) + + def write_to_pickle_add_data(self, filename: str, newdata: dict): + """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" + data: dict[str, Any] = {} + data["camera_poses"] = [] + data["base_poses"] = [] + data["xyz"] = [] + data["rgb"] = [] + data["depth"] = [] + data["feats"] = [] + for key, value in newdata.items(): + data[key] = value + for frame in self.observations: + # add it to pickle + # TODO: switch to using just Obs struct? + data["camera_poses"].append(frame.camera_pose) + data["base_poses"].append(frame.base_pose) + data["xyz"].append(frame.xyz) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + data["feats"].append(frame.feats) + for k, v in frame.info.items(): + if k not in data: + data[k] = [] + data[k].append(v) + ( + data["combined_xyz"], + data["combined_feats"], + data["combined_weights"], + data["combined_rgb"], + ) = self.voxel_pcd.get_pointcloud() + with open(filename, "wb") as f: + pickle.dump(data, f) + + def fix_data_type(self, tensor) -> torch.Tensor: + """make sure tensors are in the right format for this model""" + # If its empty just hope we're handling that somewhere else + if tensor is None: + return None + # Conversions + if isinstance(tensor, np.ndarray): + tensor = torch.from_numpy(tensor) + # Data types + if isinstance(tensor, torch.Tensor): + return tensor.float() + else: + raise NotImplementedError("unsupported data type for tensor:", tensor) + + def read_from_pickle(self, filename: Union[Path, str], num_frames: int = -1): + """Read from a pickle file as above. Will clear all currently stored data first.""" + self.reset_cache() + if isinstance(filename, str): + filename = Path(filename) + assert filename.exists(), f"No file found at {filename}" + with filename.open("rb") as f: + data = pickle.load(f) + for i, (camera_pose, xyz, rgb, feats, depth, base_pose, K, world_xyz,) in enumerate( + zip( + data["camera_poses"], + data["xyz"], + data["rgb"], + data["feats"], + data["depth"], + data["base_poses"], + data["camera_K"], + data["world_xyz"], + ) + ): + # Handle the case where we dont actually want to load everything + if num_frames > 0 and i >= num_frames: + break + + camera_pose = self.fix_data_type(camera_pose) + xyz = self.fix_data_type(xyz) + rgb = self.fix_data_type(rgb) + depth = self.fix_data_type(depth) + if feats is not None: + feats = self.fix_data_type(feats) + base_pose = self.fix_data_type(base_pose) + self.add( + camera_pose=camera_pose, + xyz=xyz, + rgb=rgb, + feats=feats, + depth=depth, + base_pose=base_pose, + camera_K=K, + ) + + def recompute_map(self): + """Recompute the entire map from scratch instead of doing incremental updates. + This is a helper function which recomputes everything from the beginning. 
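A hypothetical round trip through the pickle helpers above: dump the frames accumulated in one map and replay them into a fresh instance. `voxel_map` is assumed to be an already-populated SparseVoxelMap and the file name is a placeholder.

from stretch.dynav.mapping_utils.voxel import SparseVoxelMap

voxel_map.write_to_pickle("debug_map.pkl")

replayed = SparseVoxelMap(resolution=0.1, obs_min_height=0.1, obs_max_height=1.5)
replayed.read_from_pickle("debug_map.pkl", num_frames=-1)  # -1 replays every stored frame

obstacles, explored = replayed.get_2d_map()
print(obstacles.shape, int(explored.sum()))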
+ + Currently this will be slightly inefficient since it recreates all the objects incrementally. + """ + old_observations = self.observations + self.reset_cache() + for frame in old_observations: + self.add( + frame.camera_pose, + frame.xyz, + frame.rgb, + frame.feats, + frame.depth, + frame.base_pose, + frame.obs, + **frame.info, + ) + + def get_2d_alignment_heuristics(self, voxel_map_localizer, text, debug: bool = False): + if voxel_map_localizer.voxel_pcd._points is None: + return None + # Convert metric measurements to discrete + # Gets the xyz correctly - for now everything is assumed to be within the correct distance of origin + xyz, _, _, _ = voxel_map_localizer.voxel_pcd.get_pointcloud() + xyz = xyz.detach().cpu() + if xyz is None: + xyz = torch.zeros((0, 3)) + + device = xyz.device + xyz = ((xyz / self.grid_resolution) + self.grid_origin).long() + xyz[xyz[:, -1] < 0, -1] = 0 + + # Crop to robot height + min_height = int(self.obs_min_height / self.grid_resolution) + max_height = int(self.obs_max_height / self.grid_resolution) + grid_size = self.grid_size + [max_height] + + # Mask out obstacles only above a certain height + obs_mask = xyz[:, -1] < max_height + xyz = xyz[obs_mask, :] + alignments = voxel_map_localizer.find_alignment_over_model(text)[0].detach().cpu() + alignments = alignments[obs_mask][:, None] + + alignment_heuristics = scatter3d(xyz, alignments, grid_size, "max") + alignment_heuristics = torch.max(alignment_heuristics, dim=-1).values + alignment_heuristics = torch.from_numpy( + maximum_filter(alignment_heuristics.numpy(), size=7) + ) + return alignment_heuristics + + def get_2d_map( + self, debug: bool = False, return_history_id: bool = False + ) -> Tuple[Tensor, ...]: + """Get 2d map with explored area and frontiers.""" + + # Is this already cached? 
If so we don't need to go to all this work + if ( + self._map2d is not None + and self._history_soft is not None + and self._seq == self._2d_last_updated + ): + return self._map2d if not return_history_id else (*self._map2d, self._history_soft) + + # Convert metric measurements to discrete + # Gets the xyz correctly - for now everything is assumed to be within the correct distance of origin + xyz, _, counts, _ = self.voxel_pcd.get_pointcloud() + # print(counts) + # if xyz is not None: + # counts = torch.ones(xyz.shape[0]) + obs_ids = self.voxel_pcd._obs_counts + if xyz is None: + xyz = torch.zeros((0, 3)) + counts = torch.zeros((0)) + obs_ids = torch.zeros((0)) + + device = xyz.device + xyz = ((xyz / self.grid_resolution) + self.grid_origin + 0.5).long() + xyz[xyz[:, -1] < 0, -1] = 0 + + # Crop to robot height + min_height = int(self.obs_min_height / self.grid_resolution) + max_height = int(self.obs_max_height / self.grid_resolution) + # print('min_height', min_height, 'max_height', max_height) + grid_size = self.grid_size + [max_height] + voxels = torch.zeros(grid_size, device=device) + + # Mask out obstacles only above a certain height + obs_mask = xyz[:, -1] < max_height + xyz = xyz[obs_mask, :] + counts = counts[obs_mask][:, None] + # print(counts) + obs_ids = obs_ids[obs_mask][:, None] + + # voxels[x_coords, y_coords, z_coords] = 1 + voxels = scatter3d(xyz, counts, grid_size) + history_ids = scatter3d(xyz, obs_ids, grid_size, "max") + + # Compute the obstacle voxel grid based on what we've seen + obstacle_voxels = voxels[:, :, min_height:max_height] + obstacles_soft = torch.sum(obstacle_voxels, dim=-1) + obstacles = obstacles_soft > self.obs_min_density + + # history_ids = history_ids[:, :, min_height:max_height] + history_soft = torch.max(history_ids, dim=-1).values + history_soft = torch.from_numpy(maximum_filter(history_soft.float().numpy(), size=5)) + + if self._remove_visited_from_obstacles: + # Remove "visited" points containing observations of the robot + obstacles *= (1 - self._visited).bool() + + if self.dilate_obstacles_kernel is not None: + obstacles = binary_dilation( + obstacles.float().unsqueeze(0).unsqueeze(0), + self.dilate_obstacles_kernel, + )[0, 0].bool() + + # Explored area = only floor mass + # floor_voxels = voxels[:, :, :min_height] + explored_soft = torch.sum(voxels, dim=-1) + + # Add explored radius around the robot, up to min depth + # TODO: make sure lidar is supported here as well; if we do not have lidar assume a certain radius is explored + explored = explored_soft > self.exp_min_density + explored = (torch.zeros_like(explored) + self._visited).to(torch.bool) | explored + + # Also shrink the explored area to build more confidence + # That we will not collide with anything while moving around + # if self.dilate_obstacles_kernel is not None: + # explored = binary_erosion( + # explored.float().unsqueeze(0).unsqueeze(0), + # self.dilate_obstacles_kernel, + # )[0, 0].bool() + if self.smooth_kernel_size > 0: + # Opening and closing operations here on explore + explored = binary_erosion( + binary_dilation(explored.float().unsqueeze(0).unsqueeze(0), self.smooth_kernel), + self.smooth_kernel, + ) # [0, 0].bool() + explored = binary_dilation( + binary_erosion(explored, self.smooth_kernel), + self.smooth_kernel, + )[0, 0].bool() + + # Obstacles just get dilated and eroded + # This might influence the obstacle size + # obstacles = binary_erosion( + # binary_dilation( + # obstacles.float().unsqueeze(0).unsqueeze(0), self.smooth_kernel + # ), + # self.smooth_kernel, + 
# )[0, 0].bool() + if debug: + import matplotlib.pyplot as plt + + plt.subplot(2, 2, 1) + plt.imshow(obstacles_soft.detach().cpu().numpy()) + plt.title("obstacles soft") + plt.axis("off") + plt.subplot(2, 2, 2) + plt.imshow(explored_soft.detach().cpu().numpy()) + plt.title("explored soft") + plt.axis("off") + plt.subplot(2, 2, 3) + plt.imshow(obstacles.detach().cpu().numpy()) + plt.title("obstacles") + plt.axis("off") + plt.subplot(2, 2, 4) + plt.imshow(explored.detach().cpu().numpy()) + plt.axis("off") + plt.title("explored") + plt.show() + + # Update cache + obstacles[0:30, :] = True + obstacles[-30:, :] = True + obstacles[:, 0:30] = True + obstacles[:, -30:] = True + self._map2d = (obstacles, explored) + self._2d_last_updated = self._seq + self._history_soft = history_soft + if not return_history_id: + return obstacles, explored + else: + return obstacles, explored, history_soft + + def xy_to_grid_coords(self, xy: np.ndarray) -> Optional[np.ndarray]: + """convert xy point to grid coords""" + # Handle conversion + # # type: ignore comments used to bypass mypy check + if isinstance(xy, np.ndarray): + xy = torch.from_numpy(xy).float() # type: ignore + assert xy.shape[-1] == 2, "coords must be Nx2 or 2d array" + grid_xy = (xy / self.grid_resolution) + self.grid_origin[:2] + if torch.any(grid_xy >= self._grid_size_t) or torch.any(grid_xy < torch.zeros(2)): + return None + else: + return grid_xy.int() + + def plan_to_grid_coords(self, plan_result: PlanResult) -> Optional[List]: + """Convert a plan properly into grid coordinates""" + if not plan_result.success: + return None + else: + traj = [] + for node in plan_result.trajectory: + traj.append(self.xy_to_grid_coords(node.state[:2])) + return traj + + def grid_coords_to_xy(self, grid_coords: np.ndarray) -> np.ndarray: + """convert grid coordinate point to metric world xy point""" + assert grid_coords.shape[-1] == 2, "grid coords must be an Nx2 or 2d array" + if isinstance(grid_coords, np.ndarray): + grid_coords = torch.from_numpy(grid_coords) # type: ignore + return (grid_coords - self.grid_origin[:2]) * self.grid_resolution + + def grid_coords_to_xyt(self, grid_coords: np.ndarray) -> np.ndarray: + """convert grid coordinate point to metric world xyt point""" + res = np.ones(3) + res[:2] = self.grid_coords_to_xy(grid_coords) + return res + + def show(self, backend: str = "open3d", **backend_kwargs) -> Tuple[np.ndarray, np.ndarray]: + """Display the aggregated point cloud.""" + if backend == "open3d": + return self._show_open3d(**backend_kwargs) + else: + raise NotImplementedError(f"Unknown backend {backend}, must be 'open3d' or 'pytorch3d") + + def get_xyz_rgb(self) -> Tuple[torch.Tensor, torch.Tensor]: + """Return xyz and rgb of the current map""" + points, _, _, rgb = self.voxel_pcd.get_pointcloud() + return points, rgb + + def sample_from_mask(self, mask: torch.Tensor) -> Optional[np.ndarray]: + """Sample from any mask""" + valid_indices = torch.nonzero(mask, as_tuple=False) + if valid_indices.size(0) > 0: + random_index = torch.randint(valid_indices.size(0), (1,)) + return self.grid_coords_to_xy(valid_indices[random_index].numpy()) + else: + return None + + def xyt_is_safe(self, xyt: np.ndarray, robot: Optional[RobotModel] = None) -> bool: + """Check to see if a given xyt position is known to be safe.""" + if robot is not None: + raise NotImplementedError("not currently checking against robot base geometry") + obstacles, explored = self.get_2d_map() + # Convert xy to grid coords + grid_xy = self.xy_to_grid_coords(xyt[:2]) + # Check to see 
if grid coords are explored and obstacle free + if grid_xy is None: + # Conversion failed - probably out of bounds + return False + obstacles, explored = self.get_2d_map() + # Convert xy to grid coords + grid_xy = self.xy_to_grid_coords(xyt[:2]) + # Check to see if grid coords are explored and obstacle free + if grid_xy is None: + # Conversion failed - probably out of bounds + return False + navigable = ~obstacles & explored + return bool(navigable[grid_xy[0], grid_xy[1]]) + # if robot is not None: + # # TODO: check against robot geometry + # raise NotImplementedError( + # "not currently checking against robot base geometry" + # ) + # return True + + def _get_boxes_from_points( + self, + traversible: Union[Tensor, np.ndarray], + color: List[float], + is_map: bool = True, + height: float = 0.0, + offset: Optional[np.ndarray] = None, + ): + """Get colored boxes for a mask""" + # Get indices for all traversible locations + traversible_indices = np.argwhere(traversible) + # Traversible indices will be a 2xN array, so we need to transpose it. + # Set to floor/max obs height and bright red + if is_map: + traversible_pts = self.grid_coords_to_xy(traversible_indices.T) + else: + traversible_pts = ( + traversible_indices.T - np.ceil([d / 2 for d in traversible.shape]) + ) * self.grid_resolution + if offset is not None: + traversible_pts += offset + + geoms = [] + for i in range(traversible_pts.shape[0]): + center = np.array( + [ + traversible_pts[i, 0], + traversible_pts[i, 1], + self.obs_min_height + height, + ] + ) + dimensions = np.array( + [self.grid_resolution, self.grid_resolution, self.grid_resolution] + ) + + # Create a custom geometry with red color + mesh_box = open3d.geometry.TriangleMesh.create_box( + width=dimensions[0], height=dimensions[1], depth=dimensions[2] + ) + mesh_box.paint_uniform_color(color) # Set color to red + mesh_box.translate(center) + + # Visualize the red box + geoms.append(mesh_box) + return geoms + + def _get_open3d_geometries( + self, + orig: Optional[np.ndarray] = None, + norm: float = 255.0, + xyt: Optional[np.ndarray] = None, + footprint: Optional[Footprint] = None, + **backend_kwargs, + ): + """Show and return bounding box information and rgb color information from an explored point cloud. Uses open3d.""" + + # Create a combined point cloud + # pc_xyz, pc_rgb, pc_feats = self.get_data() + points, _, _, rgb = self.voxel_pcd.get_pointcloud() + pcd = numpy_to_pcd(points.detach().cpu().numpy(), (rgb / norm).detach().cpu().numpy()) + if orig is None: + orig = np.zeros(3) + geoms = create_visualization_geometries(pcd=pcd, orig=orig) + + # Get the explored/traversible area + obstacles, explored = self.get_2d_map() + traversible = explored & ~obstacles + + geoms += self._get_boxes_from_points(traversible, [0, 1, 0]) + geoms += self._get_boxes_from_points(obstacles, [1, 0, 0]) + + if xyt is not None and footprint is not None: + geoms += self._get_boxes_from_points( + footprint.get_rotated_mask(self.grid_resolution, float(xyt[2])), + [0, 0, 1], + is_map=False, + height=0.1, + offset=xyt[:2], + ) + + return geoms + + def _get_o3d_robot_footprint_geometry( + self, + xyt: np.ndarray, + dimensions: Optional[np.ndarray] = None, + length_offset: float = 0, + ): + """Get a 3d mesh cube for the footprint of the robot. 
But this does not work very well for whatever reason.""" + # Define the dimensions of the cube + if dimensions is None: + dimensions = np.array([0.2, 0.2, 0.2]) # Cube dimensions (length, width, height) + + x, y, theta = xyt + # theta = theta - np.pi/2 + + # Create a custom mesh cube with the specified dimensions + mesh_cube = open3d.geometry.TriangleMesh.create_box( + width=dimensions[0], height=dimensions[1], depth=dimensions[2] + ) + + # Define the transformation matrix for position and orientation + rotation_matrix = np.array( + [ + [math.cos(theta), -math.sin(theta), 0], + [math.sin(theta), math.cos(theta), 0], + [0, 0, 1], + ] + ) + # Calculate the translation offset based on theta + length_offset = 0 + x_offset = length_offset - (dimensions[0] / 2) + y_offset = -1 * dimensions[1] / 2 + dx = (math.cos(theta) * x_offset) + (math.cos(theta - np.pi / 2) * y_offset) + dy = (math.sin(theta + np.pi / 2) * y_offset) + (math.sin(theta) * x_offset) + translation_vector = np.array([x + dx, y + dy, 0]) # Apply offset based on theta + transformation_matrix = np.identity(4) + transformation_matrix[:3, :3] = rotation_matrix + transformation_matrix[:3, 3] = translation_vector + + # Apply the transformation to the cube + mesh_cube.transform(transformation_matrix) + + # Set the color of the cube to blue + mesh_cube.paint_uniform_color([0.0, 0.0, 1.0]) # Set color to blue + + return mesh_cube + + def _show_open3d( + self, + orig: Optional[np.ndarray] = None, + norm: float = 255.0, + xyt: Optional[np.ndarray] = None, + footprint: Optional[Footprint] = None, + **backend_kwargs, + ): + """Show and return bounding box information and rgb color information from an explored point cloud. Uses open3d.""" + + # get geometries so we can use them + geoms = self._get_open3d_geometries(orig, norm, xyt=xyt, footprint=footprint) + + # Show the geometries of where we have explored + open3d.visualization.draw_geometries(geoms) + + # Returns xyz and rgb for further inspection + points, _, _, rgb = self.voxel_pcd.get_pointcloud() + return points, rgb diff --git a/src/stretch/dynav/mapping_utils/voxel_map.py b/src/stretch/dynav/mapping_utils/voxel_map.py new file mode 100644 index 000000000..46a609568 --- /dev/null +++ b/src/stretch/dynav/mapping_utils/voxel_map.py @@ -0,0 +1,820 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
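Before the navigation-space wrapper defined in the file below, a sketch of how the SparseVoxelMap above might be fed from a single RGB-D observation. Every tensor here is a synthetic placeholder chosen only to satisfy the shape checks in add(); it is not data produced by this patch.

import torch
from stretch.dynav.mapping_utils.voxel import SparseVoxelMap

svm = SparseVoxelMap(resolution=0.05, local_radius=0.8)

H, W = 480, 640
rgb = torch.randint(0, 255, (H, W, 3)).float()
depth = torch.rand(H, W) * 2.0 + 0.3             # meters; most pixels inside (min_depth, max_depth)
camera_K = torch.tensor([[500.0, 0.0, W / 2], [0.0, 500.0, H / 2], [0.0, 0.0, 1.0]])
camera_pose = torch.eye(4)                        # camera placed at the map origin
base_pose = torch.tensor([0.0, 0.0, 0.0])         # x, y, theta

svm.add(camera_pose=camera_pose, rgb=rgb, depth=depth,
        camera_K=camera_K, base_pose=base_pose)

obstacles, explored = svm.get_2d_map()
print("obstacle cells:", int(obstacles.sum()), "explored cells:", int(explored.sum()))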
+import math +from typing import List, Optional, Tuple, Union + +import matplotlib.pyplot as plt +import numpy as np +import skfmm +import skimage +import skimage.morphology +import torch + +from stretch.dynav.mapping_utils.voxel import SparseVoxelMap +from stretch.motion import XYT, Footprint +from stretch.utils.geometry import angle_difference, interpolate_angles +from stretch.utils.morphology import ( + binary_dilation, + binary_erosion, + expand_mask, + find_closest_point_on_mask, + get_edges, +) + + +class SparseVoxelMapNavigationSpace(XYT): + """subclass for sampling XYT states from explored space""" + + # Used for making sure we do not divide by zero anywhere + tolerance: float = 1e-8 + + def __init__( + self, + voxel_map: SparseVoxelMap, + step_size: float = 0.1, + rotation_step_size: float = 0.5, + use_orientation: bool = False, + orientation_resolution: int = 64, + dilate_frontier_size: int = 12, + dilate_obstacle_size: int = 2, + extend_mode: str = "separate", + ): + self.step_size = step_size + self.rotation_step_size = rotation_step_size + self.voxel_map = voxel_map + self.create_collision_masks(orientation_resolution) + self.extend_mode = extend_mode + + # Always use 3d states + self.use_orientation = use_orientation + if self.use_orientation: + self.dof = 3 + else: + self.dof = 2 + + # # type: ignore comments used to bypass mypy check + self._kernels = {} # type: ignore + + if dilate_frontier_size > 0: + self.dilate_explored_kernel = torch.nn.Parameter( + torch.from_numpy(skimage.morphology.disk(dilate_frontier_size)) + .unsqueeze(0) + .unsqueeze(0) + .float(), + requires_grad=False, + ) + else: + self.dilate_explored_kernel = None + if dilate_obstacle_size > 0: + self.dilate_obstacles_kernel = torch.nn.Parameter( + torch.from_numpy(skimage.morphology.disk(dilate_obstacle_size)) + .unsqueeze(0) + .unsqueeze(0) + .float(), + requires_grad=False, + ) + else: + self.dilate_obstacles_kernel = None + + def draw_state_on_grid( + self, img: np.ndarray, state: np.ndarray, weight: int = 10 + ) -> np.ndarray: + """Helper function to draw masks on image""" + grid_xy = self.voxel_map.xy_to_grid_coords(state[:2]) + mask = self.get_oriented_mask(state[2]) + x0 = int(np.round(float(grid_xy[0] - mask.shape[0] // 2))) + x1 = x0 + mask.shape[0] + y0 = int(np.round(float(grid_xy[1] - mask.shape[1] // 2))) + y1 = y0 + mask.shape[1] + img[x0:x1, y0:y1] += mask * weight + return img + + def create_collision_masks(self, orientation_resolution: int, show_all: bool = False): + """Create a set of orientation masks + + Args: + orientation_resolution: number of bins to break it into + """ + self._footprint = Footprint(width=0.34, length=0.33, width_offset=0.0, length_offset=-0.1) + self._orientation_resolution = 64 + self._oriented_masks = [] + + # NOTE: this is just debug code - lets you see what the masks look like + assert not show_all or orientation_resolution == 64 + + for i in range(orientation_resolution): + theta = i * 2 * np.pi / orientation_resolution + mask = self._footprint.get_rotated_mask( + self.voxel_map.grid_resolution, angle_radians=theta + ) + if show_all: + plt.subplot(8, 8, i + 1) + plt.axis("off") + plt.imshow(mask.cpu().numpy()) + self._oriented_masks.append(mask) + if show_all: + plt.show() + + def distance(self, q0: np.ndarray, q1: np.ndarray) -> float: + """Return distance between q0 and q1.""" + assert len(q0) == 3, "must use 3 dimensions for current state" + assert len(q1) == 3 or len(q1) == 2, "2 or 3 dimensions for goal" + if len(q1) == 3: + # Measure to the final position 
exactly + return np.linalg.norm(q0 - q1).item() + else: + # Measure only to the final goal x/y position + return np.linalg.norm(q0[:2] - q1[:2]).item() + + def extend(self, q0: np.ndarray, q1: np.ndarray) -> np.ndarray: + """extend towards another configuration in this space. Will be either separate or joint depending on if the robot can "strafe": + separate: move then rotate + joint: move and rotate all at once.""" + assert len(q0) == 3, f"initial configuration must be 3d, was {q0}" + assert len(q1) == 3 or len(q1) == 2, f"final configuration can be 2d or 3d, was {q1}" + if self.extend_mode == "separate": + return self._extend_separate(q0, q1) + elif self.extend_mode == "joint": + # Just default to linear interpolation, does not use rotation_step_size + return super().extend(q0, q1) + else: + raise NotImplementedError(f"not supported: {self.extend_mode=}") + + def _extend_separate(self, q0: np.ndarray, q1: np.ndarray, xy_tol: float = 1e-8): + """extend towards another configuration in this space. + TODO: we can set the classes here, right now assuming still np.ndarray""" + assert len(q0) == 3, f"initial configuration must be 3d, was {q0}" + assert len(q1) == 3 or len(q1) == 2, f"final configuration can be 2d or 3d, was {q1}" + dxy = q1[:2] - q0[:2] + step = dxy / np.linalg.norm(dxy + self.tolerance) * self.step_size + xy = np.copy(q0[:2]) + goal_dxy = np.linalg.norm(q1[:2] - q0[:2]) + if ( + goal_dxy + > xy_tol + # or goal_dxy > self.step_size + # or angle_difference(q1[-1], q0[-1]) > self.rotation_step_size + ): + # Turn to new goal + # Compute theta looking at new goal point + new_theta = math.atan2(dxy[1], dxy[0]) + if new_theta < 0: + new_theta += 2 * np.pi + + # TODO: orient towards the new theta + cur_theta = q0[-1] + angle_diff = angle_difference(new_theta, cur_theta) + while angle_diff > self.rotation_step_size: + # Interpolate + cur_theta = interpolate_angles(cur_theta, new_theta, self.rotation_step_size) + # print("interp ang =", cur_theta, "from =", cur_theta, "to =", new_theta) + yield np.array([xy[0], xy[1], cur_theta]) + angle_diff = angle_difference(new_theta, cur_theta) + + # First, turn in the right direction + next_pt = np.array([xy[0], xy[1], new_theta]) + # After this we should have finished turning + yield next_pt + + # Now take steps towards the right goal + while np.linalg.norm(xy - q1[:2]) > self.step_size: + xy = xy + step + yield np.array([xy[0], xy[1], new_theta]) + + # Update current angle + cur_theta = new_theta + + # Finish stepping to goal + xy[:2] = q1[:2] + yield np.array([xy[0], xy[1], cur_theta]) + else: + cur_theta = q0[-1] + + # now interpolate to goal angle + angle_diff = angle_difference(q1[-1], cur_theta) + while angle_diff > self.rotation_step_size: + # Interpolate + cur_theta = interpolate_angles(cur_theta, q1[-1], self.rotation_step_size) + yield np.array([xy[0], xy[1], cur_theta]) + angle_diff = angle_difference(q1[-1], cur_theta) + + # Get to final angle + yield np.array([xy[0], xy[1], q1[-1]]) + + # At the end, rotate into the correct orientation + yield q1 + + def _get_theta_index(self, theta: float) -> int: + """gets the index associated with theta here""" + if theta < 0: + theta += 2 * np.pi + if theta >= 2 * np.pi: + theta -= 2 * np.pi + assert theta >= 0 and theta <= 2 * np.pi, "only angles between 0 and 2*PI allowed" + theta_idx = np.round((theta / (2 * np.pi) * self._orientation_resolution) - 0.5) + if theta_idx == self._orientation_resolution: + theta_idx = 0 + return int(theta_idx) + + def get_oriented_mask(self, theta: float) -> 
torch.Tensor: + theta_idx = self._get_theta_index(theta) + return self._oriented_masks[theta_idx] + + def is_valid( + self, + state: Union[np.ndarray, torch.Tensor, List], + is_safe_threshold=1.0, + debug: bool = False, + verbose: bool = False, + ) -> bool: + """Check to see if state is valid; i.e. if there's any collisions if mask is at right place""" + assert len(state) == 3 + if isinstance(state, torch.Tensor): + state = state.float().numpy() + state = np.array(state) + ok = bool(self.voxel_map.xyt_is_safe(state[:2])) + # if verbose: + # print('is navigable:', ok) + if not ok: + # This was + return False + + # Now sample mask at this location + mask = self.get_oriented_mask(state[-1]) + assert mask.shape[0] == mask.shape[1], "square masks only for now" + dim = mask.shape[0] + half_dim = dim // 2 + grid_xy = self.voxel_map.xy_to_grid_coords(state[:2]) + x0 = int(grid_xy[0]) - half_dim + x1 = x0 + dim + y0 = int(grid_xy[1]) - half_dim + y1 = y0 + dim + + obstacles, explored = self.voxel_map.get_2d_map() + + crop_obs = obstacles[x0:x1, y0:y1] + crop_exp = explored[x0:x1, y0:y1] + assert mask.shape == crop_obs.shape + assert mask.shape == crop_exp.shape + + collision = torch.any(crop_obs & mask) + + p_is_safe = (torch.sum((crop_exp & mask) | ~mask) / (mask.shape[0] * mask.shape[1])).item() + is_safe = p_is_safe >= is_safe_threshold + if verbose: + print(f"{collision=}, {is_safe=}, {p_is_safe=}, {is_safe_threshold=}") + + valid = bool((not collision) and is_safe) + if debug: + if collision: + print("- state in collision") + if not is_safe: + print("- not safe") + + print(f"{valid=}") + obs = obstacles.cpu().numpy().copy() + exp = explored.cpu().numpy().copy() + obs[x0:x1, y0:y1] = 1 + plt.subplot(321) + plt.imshow(obs) + plt.subplot(322) + plt.imshow(exp) + plt.subplot(323) + plt.imshow(crop_obs.cpu().numpy()) + plt.title("obstacles") + plt.subplot(324) + plt.imshow(crop_exp.cpu().numpy()) + plt.title("explored") + plt.subplot(325) + plt.imshow(mask.cpu().numpy()) + plt.show() + + return valid + + def compute_theta(self, cur_x, cur_y, end_x, end_y): + theta = 0 + if end_x == cur_x and end_y >= cur_y: + theta = np.pi / 2 + elif end_x == cur_x and end_y < cur_y: + theta = -np.pi / 2 + else: + theta = np.arctan((end_y - cur_y) / (end_x - cur_x)) + if end_x < cur_x: + theta = theta + np.pi + if theta > np.pi: + theta = theta - 2 * np.pi + if theta < -np.pi: + theta = theta + 2 * np.pi + return theta + + def sample_target_point( + self, start: torch.Tensor, point: torch.Tensor, planner, exploration: bool = False + ) -> Optional[np.ndarray]: + """Sample a position near the mask and return. 
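The validity check above crops the 2D obstacle and explored maps around the candidate pose and compares the crops against an oriented footprint mask. A minimal standalone sketch of that crop-and-compare step on toy tensors; the shapes, positions, and the square stand-in mask are illustrative only.

import torch

obstacles = torch.zeros(100, 100, dtype=torch.bool)
explored = torch.ones(100, 100, dtype=torch.bool)
obstacles[52:56, 50:60] = True                     # an obstacle strip

mask = torch.ones(7, 7, dtype=torch.bool)          # stand-in for the rotated footprint mask
grid_x, grid_y = 50, 48                            # candidate pose in grid coordinates
half = mask.shape[0] // 2

crop_obs = obstacles[grid_x - half:grid_x - half + 7, grid_y - half:grid_y - half + 7]
crop_exp = explored[grid_x - half:grid_x - half + 7, grid_y - half:grid_y - half + 7]

collision = torch.any(crop_obs & mask)
p_is_safe = (torch.sum((crop_exp & mask) | ~mask) / mask.numel()).item()
valid = (not collision) and p_is_safe >= 1.0
print(collision.item(), p_is_safe, valid)          # -> True 1.0 False (pose overlaps the strip)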
+ + Args: + look_at_any_point(bool): robot should look at the closest point on target mask instead of average pt + """ + + obstacles, explored = self.voxel_map.get_2d_map() + + # Extract edges from our explored mask + start_pt = planner.to_pt(start) + reachable_points = planner.get_reachable_points(start_pt) + if len(reachable_points) == 0: + print("No target point find, maybe no point is reachable") + return None + reachable_xs, reachable_ys = zip(*reachable_points) + # # type: ignore comments used to bypass mypy check + reachable_xs = torch.tensor(reachable_xs) # type: ignore + reachable_ys = torch.tensor(reachable_ys) # type: ignore + reachable = torch.empty(obstacles.shape, dtype=torch.bool).fill_(False) + reachable[reachable_xs, reachable_ys] = True + + obstacles, explored = self.voxel_map.get_2d_map() + # if self.dilate_obstacles_kernel is not None: + # obstacles = binary_dilation( + # obstacles.float().unsqueeze(0).unsqueeze(0), self.dilate_obstacles_kernel + # )[0, 0].bool() + reachable = reachable & ~obstacles + + target_x, target_y = planner.to_pt(point) + + xs, ys = torch.where(reachable) + if len(xs) < 1: + print("No target point find, maybe no point is reachable") + return None + selected_targets = torch.stack([xs, ys], dim=-1)[ + torch.linalg.norm( + (torch.stack([xs, ys], dim=-1) - torch.tensor([target_x, target_y])).float(), dim=-1 + ) + .topk(k=len(xs), largest=False) + .indices + ] + + # TODO: was this: + # expanded_mask = expanded_mask & less_explored & ~obstacles + + for selected_target in selected_targets: + selected_x, selected_y = planner.to_xy([selected_target[0], selected_target[1]]) + theta = self.compute_theta(selected_x, selected_y, point[0], point[1]) + + # if debug and self.is_valid([selected_x, selected_y, theta]): + # import matplotlib.pyplot as plt + + # obstacles, explored = self.voxel_map.get_2d_map() + # plt.scatter(ys, xs, s = 1) + # plt.scatter(selected_target[1], selected_target[0], s = 10) + # plt.scatter(target_y, target_x, s = 10) + # plt.imshow(obstacles) + target_is_valid = self.is_valid([selected_x, selected_y, theta]) + # print('Target:', [selected_x, selected_y, theta]) + # print('Target is valid:', target_is_valid) + if not target_is_valid: + continue + if np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.35: + continue + elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.45: + # print('OBSTACLE AVOIDANCE') + # print(selected_target[0].int(), selected_target[1].int()) + i = (point[0] - selected_target[0]) // abs(point[0] - selected_target[0]) + j = (point[1] - selected_target[1]) // abs(point[1] - selected_target[1]) + index_i = int(selected_target[0].int() + i) + index_j = int(selected_target[1].int() + j) + if obstacles[index_i][index_j]: + target_is_valid = False + # elif np.linalg.norm([selected_x - point[0], selected_y - point[1]]) <= 0.5: + # for i in [-1, 0, 1]: + # for j in [-1, 0, 1]: + # if obstacles[selected_target[0] + i][selected_target[1] + j]: + # target_is_valid = False + if not target_is_valid: + continue + + return np.array([selected_x, selected_y, theta]) + return None + + def sample_near_mask( + self, + mask: torch.Tensor, + radius_m: float = 0.6, + max_tries: int = 1000, + verbose: bool = False, + debug: bool = False, + look_at_any_point: bool = False, + ): + """Sample a position near the mask and return. 
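A hypothetical call into sample_target_point above: given the current robot pose and a target location already expressed in map coordinates, ask for a standoff navigation goal facing the target. `space` is assumed to be this navigation space and `planner` an AStar built on it (see earlier in this patch).

import torch

robot_xyt = torch.tensor([0.0, 0.0, 0.0])      # current x, y, theta
object_xy = torch.tensor([1.2, 0.4])           # localized target, map coordinates

goal = space.sample_target_point(robot_xyt, object_xy, planner)
if goal is None:
    print("no reachable standoff pose near the target")
else:
    x, y, theta = goal
    print(f"navigate to ({x:.2f}, {y:.2f}) facing {theta:.2f} rad")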
+ + Args: + look_at_any_point(bool): robot should look at the closest point on target mask instead of average pt + """ + + obstacles, explored = self.voxel_map.get_2d_map() + + # Extract edges from our explored mask + + # Radius computed from voxel map measurements + radius = np.ceil(radius_m / self.voxel_map.grid_resolution) + expanded_mask = expand_mask(mask, radius) + + # TODO: was this: + # expanded_mask = expanded_mask & less_explored & ~obstacles + expanded_mask = expanded_mask & explored & ~obstacles + # print(torch.where(explored & ~obstacles)) + # print(torch.where(expanded_mask)) + + if debug: + import matplotlib.pyplot as plt + + plt.imshow( + mask.int() * 20 + expanded_mask.int() * 10 + explored.int() + obstacles.int() * 5 + ) + # import datetime + # current_datetime = datetime.datetime.now() + # formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + # plt.savefig('debug_' + formatted_datetime + '.png') + + # Where can the robot go? + valid_indices = torch.nonzero(expanded_mask, as_tuple=False) + if valid_indices.size(0) == 0: + print("[VOXEL MAP: sampling] No valid goals near mask!") + return None + if not look_at_any_point: + mask_indices = torch.nonzero(mask, as_tuple=False) + outside_point = mask_indices.float().mean(dim=0) + + # maximum number of tries + for i in range(max_tries): + random_index = torch.randint(valid_indices.size(0), (1,)) + point_grid_coords = valid_indices[random_index] + + if look_at_any_point: + outside_point = find_closest_point_on_mask(mask, point_grid_coords.float()) + + # convert back + point = self.voxel_map.grid_coords_to_xy(point_grid_coords.numpy()) + if point is None: + print("[VOXEL MAP: sampling] ERR:", point, point_grid_coords) + continue + if outside_point is None: + print( + "[VOXEL MAP: sampling] ERR finding closest pt:", + point, + point_grid_coords, + "closest =", + outside_point, + ) + continue + theta = math.atan2( + outside_point[1] - point_grid_coords[0, 1], + outside_point[0] - point_grid_coords[0, 0], + ) + + # Ensure angle is in 0 to 2 * PI + if theta < 0: + theta += 2 * np.pi + + xyt = torch.zeros(3) + # # type: ignore to bypass mypy check + xyt[:2] = point # type: ignore + xyt[2] = theta + + # Check to see if this point is valid + if verbose: + print("[VOXEL MAP: sampling]", radius, i, "sampled", xyt) + if self.is_valid(xyt, verbose=verbose): + yield xyt + + # We failed to find anything useful + return None + + def has_zero_contour(self, phi): + """ + Check if a zero contour exists in the given phi array. + + Parameters: + - phi: 2D NumPy array with boolean values. + + Returns: + - True if a zero contour exists, False otherwise. 
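sample_near_mask above is a generator that keeps yielding valid poses near an instance mask; a common pattern is to take the first one it produces. `space` and `instance_mask` (a boolean tensor on the 2D grid) are assumed to exist already.

goal_xyt = next(space.sample_near_mask(instance_mask, radius_m=0.6), None)
if goal_xyt is None:
    print("no valid goal near this instance")
else:
    print("goal x, y, theta:", goal_xyt.tolist())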
+ """ + # Check if there are True and False values in the array + has_true_values = np.any(phi) + has_false_values = np.any(~phi) + + # Return True if both True and False values are present + return has_true_values and has_false_values + + def _get_kernel(self, size: int): + """Return a kernel for expanding/shrinking areas.""" + if size <= 0: + return None + if size not in self._kernels: + kernel = torch.nn.Parameter( + torch.from_numpy(skimage.morphology.disk(size)).unsqueeze(0).unsqueeze(0).float(), + requires_grad=False, + ) + self._kernels[size] = kernel + return self._kernels[size] + + def get_frontier( + self, expand_size: int = 5, debug: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute frontier regions of the map""" + + obstacles, explored = self.voxel_map.get_2d_map() + # Extract edges from our explored mask + # if self.dilate_obstacles_kernel is not None: + # obstacles = binary_dilation( + # obstacles.float().unsqueeze(0).unsqueeze(0), self.dilate_obstacles_kernel + # )[0, 0].bool() + if self.dilate_explored_kernel is not None: + less_explored = binary_erosion( + explored.float().unsqueeze(0).unsqueeze(0), self.dilate_explored_kernel + )[0, 0] + else: + less_explored = explored + + # Get the masks from our 3d map + edges = get_edges(less_explored) + + # Do not explore obstacles any more + traversible = explored & ~obstacles + frontier_edges = edges & ~obstacles + + kernel = self._get_kernel(expand_size) + if kernel is not None: + expanded_frontier = binary_dilation( + frontier_edges.float().unsqueeze(0).unsqueeze(0), + kernel, + )[0, 0].bool() + else: + # This is a bad idea, planning will probably fail + expanded_frontier = frontier_edges + + outside_frontier = expanded_frontier & ~explored + frontier = expanded_frontier & ~obstacles & explored + + if debug: + import matplotlib.pyplot as plt + + plt.subplot(321) + plt.imshow(obstacles.cpu().numpy()) + plt.title("obstacles") + plt.subplot(322) + plt.imshow(explored.bool().cpu().numpy()) + plt.title("explored") + plt.subplot(323) + plt.imshow((traversible & frontier).cpu().numpy()) + plt.title("traversible & frontier") + plt.subplot(324) + plt.imshow((expanded_frontier).cpu().numpy()) + plt.title("just frontiers") + plt.subplot(325) + plt.imshow((edges).cpu().numpy()) + plt.title("edges") + plt.subplot(326) + plt.imshow((frontier_edges).cpu().numpy()) + plt.title("frontier_edges") + # plt.show() + + return frontier, outside_frontier, traversible + + def sample_exploration( + self, + xyt, + planner, + voxel_map_localizer=None, + text=None, + debug=False, + ): + obstacles, explored, history_soft = self.voxel_map.get_2d_map(return_history_id=True) + if len(xyt) == 3: + xyt = xyt[:2] + reachable_points = planner.get_reachable_points(planner.to_pt(xyt)) + reachable_xs, reachable_ys = zip(*reachable_points) + reachable_xs = torch.tensor(reachable_xs) + reachable_ys = torch.tensor(reachable_ys) + + reachable_map = torch.zeros_like(obstacles) + reachable_map[reachable_xs, reachable_ys] = 1 + reachable_map = reachable_map.to(torch.bool) + edges = get_edges(reachable_map) + # kernel = self._get_kernel(expand_size) + kernel = None + if kernel is not None: + expanded_frontier = binary_dilation( + edges.float().unsqueeze(0).unsqueeze(0), + kernel, + )[0, 0].bool() + else: + expanded_frontier = edges + outside_frontier = expanded_frontier & ~reachable_map + time_heuristics = self._time_heuristic(history_soft, outside_frontier, debug=debug) + if voxel_map_localizer is not None: + alignments_heuristics = 
self.voxel_map.get_2d_alignment_heuristics( + voxel_map_localizer, text + ) + alignments_heuristics = self._alignment_heuristic( + alignments_heuristics, outside_frontier, debug=debug + ) + total_heuristics = time_heuristics + alignments_heuristics + else: + alignments_heuristics = None + total_heuristics = time_heuristics + + rounded_heuristics = np.ceil(total_heuristics * 100) / 100 + max_heuristic = rounded_heuristics.max() + indices = np.column_stack(np.where(rounded_heuristics == max_heuristic)) + closest_index = np.argmin( + np.linalg.norm(indices - np.asarray(planner.to_pt([0, 0, 0])), axis=-1) + ) + index = indices[closest_index] + # index = np.unravel_index(np.argmax(total_heuristics), total_heuristics.shape) + # debug = True + if debug: + from matplotlib import pyplot as plt + + plt.subplot(221) + plt.imshow(obstacles.int() * 5 + outside_frontier.int() * 10) + plt.subplot(222) + plt.imshow(explored.int() * 5) + plt.subplot(223) + plt.imshow(total_heuristics) + plt.scatter(index[1], index[0], s=15, c="g") + plt.show() + return index, time_heuristics, alignments_heuristics, total_heuristics + + def _alignment_heuristic( + self, + alignments, + outside_frontier, + alignment_smooth=50, + alignment_threshold=0.12, + debug=False, + ): + alignments = np.ma.masked_array(alignments, ~outside_frontier) + alignment_heuristics = 1 / ( + 1 + np.exp(-alignment_smooth * (alignments - alignment_threshold)) + ) + index = np.unravel_index(np.argmax(alignment_heuristics), alignments.shape) + if debug: + plt.clf() + plt.title("alignment") + plt.imshow(alignment_heuristics) + plt.scatter(index[1], index[0], s=15, c="g") + plt.show() + return alignment_heuristics + + def _time_heuristic( + self, history_soft, outside_frontier, time_smooth=0.1, time_threshold=15, debug=False + ): + history_soft = np.ma.masked_array(history_soft, ~outside_frontier) + time_heuristics = history_soft.max() - history_soft + time_heuristics[history_soft < 1] = float("inf") + time_heuristics = 1 / (1 + np.exp(-time_smooth * (time_heuristics - time_threshold))) + index = np.unravel_index(np.argmax(time_heuristics), history_soft.shape) + # return index + # debug = True + if debug: + # plt.clf() + plt.title("time") + plt.imshow(time_heuristics) + plt.scatter(index[1], index[0], s=15, c="r") + plt.show() + return time_heuristics + + def sample_closest_frontier( + self, + xyt: np.ndarray, + max_tries: int = 1000, + expand_size: int = 5, + debug: bool = False, + verbose: bool = False, + step_dist: float = 0.1, + min_dist: float = 0.1, + ): + """Sample a valid location on the current frontier using FMM planner to compute geodesic distance. Returns points in order until it finds one that's valid. + + Args: + xyt(np.ndrray): [x, y, theta] of the agent; must be of size 2 or 3. 
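The exploration scoring above combines two logistic terms: one favoring frontier cells that have not been observed recently, and one favoring cells whose alignment with the query text is high. A small numeric sketch of those formulas with the same default constants, applied to scalar inputs; the function names here are illustrative.

import numpy as np

def time_score(age, time_smooth=0.1, time_threshold=15):
    # larger "age" (observations since the cell was last seen) -> score approaches 1
    return 1.0 / (1.0 + np.exp(-time_smooth * (age - time_threshold)))

def alignment_score(alignment, alignment_smooth=50, alignment_threshold=0.12):
    # alignment above roughly 0.12 -> score approaches 1
    return 1.0 / (1.0 + np.exp(-alignment_smooth * (alignment - alignment_threshold)))

for age in (0, 15, 40):
    print("time score @", age, "=", round(float(time_score(age)), 3))
for a in (0.05, 0.12, 0.2):
    print("alignment score @", a, "=", round(float(alignment_score(a)), 3))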
+ max_tries(int): number of attempts to make for rejection sampling + debug(bool): show visualizations of frontiers + step_dist(float): how far apart in geo dist these points should be + """ + assert len(xyt) == 2 or len(xyt) == 3, f"xyt must be of size 2 or 3 instead of {len(xyt)}" + + frontier, outside_frontier, traversible = self.get_frontier( + expand_size=expand_size, debug=debug + ) + + # from scipy.ndimage.morphology import distance_transform_edt + m = np.ones_like(traversible) + start_x, start_y = self.voxel_map.xy_to_grid_coords(xyt[:2]).int().cpu().numpy() + if verbose or debug: + print("--- Coordinates ---") + print(f"{xyt=}") + print(f"{start_x=}, {start_y=}") + + m[start_x, start_y] = 0 + m = np.ma.masked_array(m, ~traversible) + + if not self.has_zero_contour(m): + if verbose: + print("traversible frontier had zero contour! no where to go.") + return None + + distance_map = skfmm.distance(m, dx=1) + frontier_map = distance_map.copy() + # Masks are the areas we are ignoring - ignore everything but the frontiers + frontier_map.mask = np.bitwise_or(frontier_map.mask, ~frontier.cpu().numpy()) + + # Get distances of frontier points + distances = frontier_map.compressed() + xs, ys = np.where(~frontier_map.mask) + + if verbose or debug: + print(f"-> found {len(distances)} items") + + assert len(xs) == len(ys) and len(xs) == len(distances) + tries = 1 + prev_dist = -1 * float("Inf") + for x, y, dist in sorted(zip(xs, ys, distances), key=lambda x: x[2]): + if dist < min_dist: + continue + + # Don't explore too close to where we are + if dist < prev_dist + step_dist: + continue + prev_dist = dist + + point_grid_coords = torch.FloatTensor([[x, y]]) + outside_point = find_closest_point_on_mask(outside_frontier, point_grid_coords) + + if outside_point is None: + print( + "[VOXEL MAP: sampling] ERR finding closest pt:", + point_grid_coords, + "closest =", + outside_point, + ) + continue + + yield self.voxel_map.grid_coords_to_xy(outside_point) + # # convert back to real-world coordinates + # point = self.voxel_map.grid_coords_to_xy(point_grid_coords) + # if point is None: + # print("[VOXEL MAP: sampling] ERR:", point, point_grid_coords) + # continue + + # theta = math.atan2( + # outside_point[1] - point_grid_coords[0, 1], + # outside_point[0] - point_grid_coords[0, 0], + # ) + # if debug: + # print(f"{dist=}, {x=}, {y=}, {theta=}") + + # # Ensure angle is in 0 to 2 * PI + # if theta < 0: + # theta += 2 * np.pi + + # xyt = torch.zeros(3) + # xyt[:2] = point + # xyt[2] = theta + + # # Check to see if this point is valid + # if verbose: + # print("[VOXEL MAP: sampling] sampled", xyt) + # if self.is_valid(xyt, debug=debug): + # yield xyt + + # tries += 1 + # if tries > max_tries: + # break + yield None + + def show( + self, + orig: Optional[np.ndarray] = None, + norm: float = 255.0, + backend: str = "open3d", + ): + """Tool for debugging map representations that we have created""" + geoms = self.voxel_map._get_open3d_geometries(orig, norm) + + # lazily import open3d - it's a tough dependency + import open3d + + # Show the geometries of where we have explored + open3d.visualization.draw_geometries(geoms) diff --git a/src/stretch/dynav/mapping_utils/voxelized_pcd.py b/src/stretch/dynav/mapping_utils/voxelized_pcd.py new file mode 100644 index 000000000..e2898434b --- /dev/null +++ b/src/stretch/dynav/mapping_utils/voxelized_pcd.py @@ -0,0 +1,634 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. 
+# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional, Tuple, Union + +import torch +from torch import Tensor + +USE_TORCH_GEOMETRIC = True +if USE_TORCH_GEOMETRIC: + from torch_geometric.nn.pool.consecutive import consecutive_cluster + from torch_geometric.nn.pool.voxel_grid import voxel_grid + from torch_geometric.utils import scatter +else: + from stretch.utils.torch_geometric import consecutive_cluster, voxel_grid + from stretch.utils.torch_scatter import scatter + +from sklearn.cluster import DBSCAN + + +def project_points(points_3d, K, pose): + if not isinstance(K, torch.Tensor): + K = torch.Tensor(K) + K = K.to(points_3d) + if not isinstance(pose, torch.Tensor): + pose = torch.Tensor(pose) + pose = pose.to(points_3d) + # Convert points to homogeneous coordinates + points_3d_homogeneous = torch.hstack( + (points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d)) + ) + + # Transform points into camera coordinate system + points_camera_homogeneous = torch.matmul(torch.linalg.inv(pose), points_3d_homogeneous.T).T + points_camera_homogeneous = points_camera_homogeneous[:, :3] + + # Project points into image plane + points_2d_homogeneous = torch.matmul(K, points_camera_homogeneous.T).T + points_2d = points_2d_homogeneous[:, :2] / points_2d_homogeneous[:, 2:] + + return points_2d + + +def get_depth_values(points_3d, pose): + # Convert points to homogeneous coordinates + if not isinstance(pose, torch.Tensor): + pose = torch.Tensor(pose) + pose = pose.to(points_3d) + points_3d_homogeneous = torch.hstack( + (points_3d, torch.ones((points_3d.shape[0], 1)).to(points_3d)) + ) + + # Transform points into camera coordinate system + points_camera_homogeneous = torch.matmul(torch.linalg.inv(pose), points_3d_homogeneous.T).T + + # Extract depth values (z-coordinates) + depth_values = points_camera_homogeneous[:, 2] + + return depth_values + + +class VoxelizedPointcloud: + _INTERNAL_TENSORS = [ + "_points", + "_features", + "_weights", + "_rgb", + "dim_mins", + "dim_maxs", + "_mins", + "_maxs", + ] + + _INIT_ARGS = ["voxel_size", "dim_mins", "dim_maxs", "feature_pool_method"] + + def __init__( + self, + voxel_size: float = 0.05, + dim_mins: Optional[Tensor] = None, + dim_maxs: Optional[Tensor] = None, + feature_pool_method: str = "mean", + ): + """ + + Args: + voxel_size (Tensor): float, voxel size in each dim + dim_mins (Tensor): 3, tensor of minimum coords possible in voxel grid + dim_maxs (Tensor): 3, tensor of maximum coords possible in voxel grid + feature_pool_method (str, optional): How to pool features within a voxel. One of 'mean', 'max', 'sum'. Defaults to 'mean'. 
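For reference, `project_points` and `get_depth_values` above treat `pose` as a camera-to-world transform and invert it before projecting with the intrinsics `K`. A small sanity check with a hypothetical 640x480 pinhole camera follows; the import path is assumed from this file's location in the patch:

```python
import torch

from stretch.dynav.mapping_utils.voxelized_pcd import get_depth_values, project_points

K = torch.tensor([[500.0, 0.0, 320.0],
                  [0.0, 500.0, 240.0],
                  [0.0, 0.0, 1.0]])
pose = torch.eye(4)  # camera at the world origin, looking down +z

points_world = torch.tensor([[0.0, 0.0, 1.0],   # 1 m straight ahead
                             [0.1, 0.0, 2.0]])
print(project_points(points_world, K, pose))  # first row lands on the principal point (320, 240)
print(get_depth_values(points_world, pose))   # camera-frame depths: 1.0 and 2.0
```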
+ """ + + assert (dim_mins is None) == (dim_maxs is None) + self.dim_mins = dim_mins + self.dim_maxs = dim_maxs + self.voxel_size = voxel_size + self.feature_pool_method = feature_pool_method + assert self.feature_pool_method in [ + "mean", + "max", + "sum", + ], f"Unknown feature pool method {feature_pool_method}" + + self.reset() + + def reset(self): + """Resets internal tensors""" + self._points, self._features, self._weights, self._rgb = None, None, None, None + self._obs_counts = None + self._entity_ids = None + self._mins = self.dim_mins + self._maxs = self.dim_maxs + self.obs_count = 1 + + def clear_points(self, depth, intrinsics, pose, depth_is_valid=None, min_samples_clear=None): + if self._points is not None: + xys = project_points(self._points.detach().cpu(), intrinsics, pose).int() + xys = xys[:, [1, 0]] + proj_depth = get_depth_values(self._points.detach().cpu(), pose) + H, W = depth.shape + + # Some points are projected to (i, j) on image plane and i, j might be smaller than 0 or greater than image size + # which will lead to Index Error. + valid_xys = xys.clone() + valid_xys[ + torch.any( + torch.stack( + [ + xys[:, 0] < 0, + xys[:, 0] >= H, + xys[:, 1] < 0, + xys[:, 1] >= W, + ], + dim=0, + ), + dim=0, + ) + ] = 0 + indices = torch.any( + torch.stack( + [ + # the points are projected outside image frame + xys[:, 0] < 0, + xys[:, 0] >= H, + xys[:, 1] < 0, + xys[:, 1] >= W, + # the points are projected to the image frame but is blocked by some obstacles + depth[valid_xys[:, 0], valid_xys[:, 1]] < (proj_depth - 0.1), + # the points are projected to the image frame but they are behind camera + depth[valid_xys[:, 0], valid_xys[:, 1]] < 0.01, + proj_depth < 0.01, + # depth is too large + # (~depth_is_valid)[valid_xys[:, 0], valid_xys[:, 1]], + proj_depth > 2.0, + ], + dim=0, + ), + dim=0, + ) + + # if self._entity_ids is not None: + # removed_entities, removed_counts = torch.unique(self._entity_ids.detach().cpu()[~indices], return_counts = True) + # total_counts = torch.bincount(self._entity_ids.detach().cpu()) + # entities_to_be_removed = removed_entities[(removed_counts > total_counts[removed_entities] * 0.6) | (total_counts[removed_entities] - removed_counts < 5)] + # indices = indices & ~torch.isin(self._entity_ids.detach().cpu(), entities_to_be_removed) + + # print('Clearing non valid points...') + # print('Removing ' + str((~indices).sum().item()) + ' points.') + indices = indices.to(self._points.device) + self._points = self._points[indices] + if self._features is not None: + self._features = self._features[indices] + if self._weights is not None: + self._weights = self._weights[indices] + if self._rgb is not None: + self._rgb = self._rgb[indices] + if self._obs_counts is not None: + self._obs_counts = self._obs_counts[indices] + if self._entity_ids is not None: + self._entity_ids = self._entity_ids[indices] + + if self._entity_ids is not None and min_samples_clear is not None: + dbscan = DBSCAN(eps=self.voxel_size * 4, min_samples=min_samples_clear) + cluster_vertices = torch.cat( + ( + self._points.detach().cpu(), + self._entity_ids.detach().cpu().reshape(-1, 1) * 1000, + ), + -1, + ).numpy() + clusters = dbscan.fit(cluster_vertices) + labels = clusters.labels_ + indices = labels != -1 + self._points = self._points[indices] + if self._features is not None: + self._features = self._features[indices] + if self._weights is not None: + self._weights = self._weights[indices] + if self._rgb is not None: + self._rgb = self._rgb[indices] + if self._obs_counts is not None: + 
self._obs_counts = self._obs_counts[indices] + if self._entity_ids is not None: + self._entity_ids = self._entity_ids[indices] + + def add( + self, + points: Tensor, + features: Optional[Tensor], + rgb: Optional[Tensor], + weights: Optional[Tensor] = None, + obs_count: Optional[int] = None, + entity_id: Optional[int] = None, + ): + """Add a feature pointcloud to the voxel grid. + + Args: + points (Tensor): N x 3 points to add to the voxel grid + features (Tensor): N x D features associated with each point. + Reduction method can be set with feature_reduciton_method in init + rgb (Tensor): N x 3 colors s associated with each point. + weights (Optional[Tensor], optional): Weights for each point. + Can be detection confidence, distance to camera, etc. + Defaults to None. + """ + + if weights is None: + weights = torch.ones_like(points[..., 0]) + + if obs_count is None: + obs_counts = torch.ones_like(weights) * self.obs_count + else: + obs_counts = torch.ones_like(weights) * obs_count + if entity_id is None: + entity_ids = torch.ones_like(weights) * self.obs_count + else: + entity_ids = torch.ones_like(weights) * entity_id + self.obs_count += 1 + + # Update voxel grid bounds + # This isn't strictly necessary since the functions below can infer the bounds + # But we might want to do this anyway to enforce that bounds are a multiple of self.voxel_size + # And to enforce that the added points are within user-defined boundaries, if those were specified. + pos_mins, _ = points.min(dim=0) + pos_maxs, _ = points.max(dim=0) + if self.dim_mins is not None: + assert torch.all( + self.dim_mins <= pos_mins + ), "Got points outside of user-defined 3D bounds" + if self.dim_maxs is not None: + assert torch.all( + pos_maxs <= self.dim_maxs + ), "Got points outside of user-defined 3D bounds" + + if self._mins is None: + self._mins, self._maxs = pos_mins, pos_maxs + # recompute_voxels = True + else: + assert self._maxs is not None, "How did self._mins get set without self._maxs?" + # recompute_voxels = torch.any(pos_mins < self._mins) or torch.any(self._maxs < pos_maxs) + self._mins = torch.min(self._mins, pos_mins) + self._maxs = torch.max(self._maxs, pos_maxs) + + if self._points is None: + assert self._features is None, "How did self._points get unset while _features is set?" + # assert self._rgbs is None, "How did self._points get unset while _rgbs is set?" + assert self._weights is None, "How did self._points get unset while _weights is set?" + all_points, all_features, all_weights, all_rgb = ( + points, + features, + weights, + rgb, + ) + all_obs_counts = obs_counts + all_entity_ids = entity_ids + else: + assert (self._features is None) == (features is None) + all_points = torch.cat([self._points, points], dim=0) + all_weights = torch.cat([self._weights, weights], dim=0) + all_features = ( + torch.cat([self._features, features], dim=0) if (features is not None) else None + ) + all_rgb = torch.cat([self._rgb, rgb], dim=0) if (rgb is not None) else None + all_obs_counts = torch.cat([self._obs_counts, obs_counts], dim=0) + all_entity_ids = torch.cat([self._entity_ids, entity_ids], dim=0) + # Future optimization: + # If there are no new voxels, then we could save a bit of compute time + # by only recomputing the voxel/cluster for the new points + # e.g. 
if recompute_voxels: + # raise NotImplementedError + cluster_voxel_idx, cluster_consecutive_idx, _ = voxelize( + all_points, voxel_size=self.voxel_size, start=self._mins, end=self._maxs + ) + ( + self._points, + self._features, + self._weights, + self._rgb, + self._obs_counts, + self._entity_ids, + ) = reduce_pointcloud( + cluster_consecutive_idx, + pos=all_points, + features=all_features, + weights=all_weights, + rgbs=all_rgb, + obs_counts=all_obs_counts, + entity_ids=all_entity_ids, + feature_reduce=self.feature_pool_method, + ) + self._obs_counts, self._entity_ids = self._obs_counts.int(), self._entity_ids.int() + return + + def get_idxs(self, points: Tensor) -> Tuple[Tensor, ...]: + """Returns voxel index (long tensor) for each point in points + + Args: + points (Tensor): N x 3 + + Returns: + cluster_voxel_idx (Tensor): The voxel grid index (long tensor) for each point in points + cluster_consecutive_idx (Tensor): Voxel grid reindexed to be consecutive (packed) + """ + ( + cluster_voxel_idx, + cluster_consecutive_idx, + _, + ) = voxelize(points, self.voxel_size, start=self._mins, end=self._maxs) + return cluster_voxel_idx, cluster_consecutive_idx + + def get_voxel_idx(self, points: Tensor) -> Tensor: + """Returns voxel index (long tensor) for each point in points + + Args: + points (Tensor): N x 3 + + Returns: + Tensor: voxel index (long tensor) for each point in points + """ + ( + cluster_voxel_idx, + _, + ) = self.get_idxs(points) + return cluster_voxel_idx + + def get_consecutive_cluster_idx(self, points: Tensor) -> Tensor: + """Returns voxel index (long tensor) for each point in points + + Args: + points (Tensor): N x 3 + + Returns: + Tensor: voxel index (long tensor) for each point in points + """ + ( + _, + cluster_consecutive_idx, + ) = self.get_idxs(points) + return cluster_consecutive_idx + + def get_pointcloud(self) -> Tuple[Tensor, ...]: + """Returns pointcloud (1 point per occupied voxel) + + Returns: + points (Tensor): N x 3 + features (Tensor): N x D + weights (Tensor): N + """ + return self._points, self._features, self._weights, self._rgb + + def clone(self): + """ + Deep copy of object. All internal tensors are cloned individually. + + Returns: + new VoxelizedPointcloud object. + """ + other = self.__class__(**{k: getattr(self, k) for k in self._INIT_ARGS}) + for k in self._INTERNAL_TENSORS: + v = getattr(self, k) + if torch.is_tensor(v): + setattr(other, k, v.clone()) + return other + + def to(self, device: Union[str, torch.device]): + """ + + Args: + device: Device (as str or torch.device) for the new tensor. + + Returns: + self + """ + other = self.clone() + for k in self._INTERNAL_TENSORS: + v = getattr(self, k) + if torch.is_tensor(v): + setattr(other, k, v.to(device)) + return other + + def cpu(self): + return self.to("cpu") + + def cuda(self): + return self.to("cuda") + + def detach(self): + """ + Detach object. All internal tensors are detached individually. + + Returns: + new VoxelizedPointcloud object. 
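As a usage sketch of the class above (not part of the patch; the module path is assumed from the file location), two points that fall into the same 5 cm voxel are pooled into a single weighted-mean point, so only two points survive:

```python
import torch

from stretch.dynav.mapping_utils.voxelized_pcd import VoxelizedPointcloud

pcd = VoxelizedPointcloud(voxel_size=0.05)
pts = torch.tensor([[0.01, 0.01, 0.01],
                    [0.02, 0.01, 0.01],   # same voxel as the first point
                    [1.00, 1.00, 1.00]])  # a different voxel
rgb = torch.tensor([[255.0, 0.0, 0.0],
                    [0.0, 255.0, 0.0],
                    [0.0, 0.0, 255.0]])
pcd.add(points=pts, features=None, rgb=rgb)
points, features, weights, colors = pcd.get_pointcloud()
print(points.shape, weights)  # expect 2 pooled points, each carrying the summed weight of its members
```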
+ """ + other = self.__class__({k: getattr(self, k) for k in self._INIT_ARGS}) + for k in self._INTERNAL_TENSORS: + v = getattr(self, k) + if torch.is_tensor(v): + setattr(other, k, v.detach()) + return other + + +def voxelize( + pos: Tensor, + voxel_size: float, + batch: Optional[Tensor] = None, + start: Optional[Union[float, Tensor]] = None, + end: Optional[Union[float, Tensor]] = None, +) -> Tuple[Tensor, ...]: + """Returns voxel indices and packed (consecutive) indices for points + + Args: + pos (Tensor): [N, 3] locations + voxel_size (float): Size (resolution) of each voxel in the grid + batch (Optional[Tensor], optional): Batch index of each point in pos. Defaults to None. + start (Optional[Union[float, Tensor]], optional): Mins along each coordinate for the voxel grid. + Defaults to None, in which case the starts are inferred from min values in pos. + end (Optional[Union[float, Tensor]], optional): Maxes along each coordinate for the voxel grid. + Defaults to None, in which case the starts are inferred from max values in pos. + Returns: + voxel_idx (LongTensor): Idx of each point's voxel coordinate. E.g. [0, 0, 4, 3, 3, 4] + cluster_consecutive_idx (LongTensor): Packed idx -- contiguous in cluster ID. E.g. [0, 0, 2, 1, 1, 2] + batch_sample: See https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/pool/max_pool.html + """ + voxel_cluster = voxel_grid(pos=pos, batch=batch, size=voxel_size, start=start, end=end) + cluster_consecutive_idx, perm = consecutive_cluster(voxel_cluster) + batch_sample = batch[perm] if batch is not None else None + cluster_idx = voxel_cluster + return cluster_idx, cluster_consecutive_idx, batch_sample + + +def scatter_weighted_mean( + features: Tensor, + weights: Tensor, + cluster: Tensor, + weights_cluster: Tensor, + dim: int, +) -> Tensor: + """_summary_ + + Args: + features (Tensor): [N, D] features at each point + weights (Optional[Tensor], optional): [N,] weights of each point. Defaults to None. + cluster (LongTensor): [N] IDs of each point (clusters.max() should be <= N, or you'll OOM) + weights_cluster (Tensor): [N,] aggregated weights of each cluster, used to normalize + dim (int): Dimension along which to do the reduction -- should be 0 + + Returns: + Tensor: Agggregated features, weighted by weights and normalized by weights_cluster + """ + assert dim == 0, "Dim != 0 not yet implemented" + feature_cluster = scatter(features * weights[:, None], cluster, dim=dim, reduce="sum") + feature_cluster = feature_cluster / weights_cluster[:, None] + return feature_cluster + + +def reduce_pointcloud( + voxel_cluster: Tensor, + pos: Tensor, + features: Tensor, + weights: Optional[Tensor] = None, + rgbs: Optional[Tensor] = None, + obs_counts: Optional[Tensor] = None, + entity_ids: Optional[Tensor] = None, + feature_reduce: str = "mean", +) -> Tuple[Tensor, ...]: + """Pools values within each voxel + + Args: + voxel_cluster (LongTensor): [N] IDs of each point + pos (Tensor): [N, 3] position of each point + features (Tensor): [N, D] features at each point + weights (Optional[Tensor], optional): [N,] weights of each point. Defaults to None. + rgbs (Optional[Tensor], optional): [N, 3] colors of each point. Defaults to None. + feature_reduce (str, optional): Feature reduction method. Defaults to 'mean'. 
+ + Raises: + NotImplementedError: if unknown reduction method + + Returns: + pos_cluster (Tensor): weighted average position within each voxel + feature_cluster (Tensor): aggregated feature of each voxel + weights_cluster (Tensor): aggregated weights of each voxel + rgb_cluster (Tensor): colors of each voxel + """ + if weights is None: + weights = torch.ones_like(pos[..., 0]) + weights_cluster = scatter(weights, voxel_cluster, dim=0, reduce="sum") + + pos_cluster = scatter_weighted_mean(pos, weights, voxel_cluster, weights_cluster, dim=0) + + if rgbs is not None: + rgb_cluster = scatter_weighted_mean(rgbs, weights, voxel_cluster, weights_cluster, dim=0) + else: + rgb_cluster = None + + if obs_counts is not None: + obs_count_cluster = scatter(obs_counts, voxel_cluster, dim=0, reduce="max") + else: + obs_count_cluster = None + + if entity_ids is not None: + entity_ids_cluster = scatter(entity_ids, voxel_cluster, dim=0, reduce="max") + else: + entity_ids_cluster = None + + if features is None: + return ( + pos_cluster, + None, + weights_cluster, + rgb_cluster, + obs_count_cluster, + entity_ids_cluster, + ) + + if feature_reduce == "mean": + feature_cluster = scatter_weighted_mean( + features, weights, voxel_cluster, weights_cluster, dim=0 + ) + elif feature_reduce == "max": + feature_cluster = scatter(features, voxel_cluster, dim=0, reduce="max") + elif feature_reduce == "sum": + feature_cluster = scatter(features * weights[:, None], voxel_cluster, dim=0, reduce="sum") + else: + raise NotImplementedError(f"Unknown feature reduction method {feature_reduce}") + + return ( + pos_cluster, + feature_cluster, + weights_cluster, + rgb_cluster, + obs_count_cluster, + entity_ids_cluster, + ) + + +def scatter3d( + voxel_indices: Tensor, + weights: Tensor, + grid_dimensions: List[int], + method: str = "mean", +) -> Tensor: + """Scatter weights into a 3d voxel grid of the appropriate size. + + Args: + voxel_indices (LongTensor): [N, 3] indices to scatter values to. + weights (FloatTensor): [N] values of equal size to scatter through voxel map. + grid_dimenstions (List[int]): sizes of the resulting voxel map, should be 3d. 
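The unique index computed below is just row-major flattening of the 3-D voxel coordinate, `x * (D1 * D2) + y * D2 + z`. A tiny illustration with plain torch (not part of the patch) for a 2 x 3 x 4 grid:

```python
import torch

grid_dims = [2, 3, 4]
vox = torch.tensor([[0, 0, 0],
                    [1, 2, 3],
                    [0, 1, 2]])
flat = vox[:, 0] * (grid_dims[1] * grid_dims[2]) + vox[:, 1] * grid_dims[2] + vox[:, 2]
print(flat.tolist())  # [0, 23, 6]

# Scattering a weight of 1 per point and reshaping yields per-voxel hit counts.
counts = torch.zeros(2 * 3 * 4).scatter_add(0, flat, torch.ones(3)).reshape(*grid_dims)
print(counts[1, 2, 3].item())  # 1.0
```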
+
+    Returns:
+        voxels (FloatTensor): [grid_dimensions] voxel map containing combined weights."""
+
+    assert voxel_indices.shape[0] == weights.shape[0], "weights and indices must match"
+    assert len(grid_dimensions) == 3, "this is designed to work only in 3d"
+    assert voxel_indices.shape[-1] == 3, "3d points expected for indices"
+
+    # Calculate a unique index for each voxel in the 3D grid
+    unique_voxel_indices = (
+        voxel_indices[:, 0] * (grid_dimensions[1] * grid_dimensions[2])
+        + voxel_indices[:, 1] * grid_dimensions[2]
+        + voxel_indices[:, 2]
+    )
+
+    # Use scatter to accumulate weights into voxels
+    voxel_weights = scatter(
+        weights,
+        unique_voxel_indices,
+        dim=0,
+        reduce=method,
+        dim_size=grid_dimensions[0] * grid_dimensions[1] * grid_dimensions[2],
+    )
+    return voxel_weights.reshape(*grid_dimensions)
+
+
+def drop_smallest_weight_points(
+    points: Tensor,
+    voxel_size: float = 0.01,
+    drop_prop: float = 0.1,
+    min_points_after_drop: int = 3,
+):
+    voxel_pcd = VoxelizedPointcloud(
+        voxel_size=voxel_size,
+        dim_mins=None,
+        dim_maxs=None,
+        feature_pool_method="mean",
+    )
+    voxel_pcd.add(
+        points=points,
+        features=None,  # instance.point_cloud_features,
+        rgb=None,  # instance.point_cloud_rgb,
+    )
+    orig_points = points
+    points = voxel_pcd._points
+    weights = voxel_pcd._weights
+    assert len(points) > 0, points.shape
+    weights_sorted, sort_idxs = torch.sort(weights, dim=0)
+    points_sorted = points[sort_idxs]
+    weights_cumsum = torch.cumsum(weights_sorted, dim=0)
+    above_cutoff = weights_cumsum >= (drop_prop * weights_cumsum[-1])
+    cutoff_idx = int(above_cutoff.max(dim=0).indices)
+    if len(points_sorted[cutoff_idx:]) < min_points_after_drop:
+        return orig_points
+    # print(f"Reduced {len(orig_points)} -> {len(points)} -> {above_cutoff.sum()}")
+    return points_sorted[cutoff_idx:]
diff --git a/src/stretch/dynav/ok_robot_hw/README.md b/src/stretch/dynav/ok_robot_hw/README.md
new file mode 100644
index 000000000..ab93a5c63
--- /dev/null
+++ b/src/stretch/dynav/ok_robot_hw/README.md
@@ -0,0 +1,43 @@
+# Robot Side Code
+
+Most of the heavy code runs on the workstation and communicates with the robot through sockets.
+
+## Preparation to run the robot-side code
+
+Our hardware code relies heavily on the robot controller provided by [home-robot](https://github.com/facebookresearch/home-robot).
+
+You need to install home-robot on the Stretch robot by following the instructions provided by [home-robot-hw](https://github.com/facebookresearch/home-robot/blob/main/docs/install_robot.md) before running any robot-side code on the robot.
+
+To check that home-robot is installed properly and to get familiar with running home-robot based code, we recommend trying [these test scripts](https://github.com/facebookresearch/home-robot/blob/main/tests/hw_manual_test.py).
+
+Once you have finished installing home-robot, follow [these steps](../docs/robot-installation.md) to enable OK-Robot to use the home-robot controllers.
+
+The success of the OK-Robot system also depends on robot calibration and an accurate URDF file, so make sure you follow [these calibration steps](../docs/robot-calibration.md) to obtain an accurate URDF file for your robot.
+
+## Start home-robot
+
+```
+stretch_robot_home.py
+roslaunch home_robot_hw startup_stretch_hector_slam.launch
+```
+
+## Start Robot Control
+
+```
+python run.py -x1 [x1] -y1 [y1] -x2 [x2] -y2 [y2] -ip [your workstation ip]
+```
+
+- **\[x1, y1\]** - Coordinates of the tape on which the base of the robot sits
+- **\[x2, y2\]** - Coordinates of the secondary tape.
+- **ip** - Your workstation ip, the robot will try to communicate with this ip +- **np** - Navigation port number, the robot will listen to this port number to get planned navigation paths (default: 5555) +- **mp** - Manipulation port number, the robot will listen to this port number to get estimated manipulation pose (default: 5556) + +After running run.py it will go through 4 states in each cycle. + +- Picking Navigation +- Manipulation +- Placing Navigation +- Placing + +For each navigation stage it asks A \[Object Name\], B \[Near by Object Name\] diff --git a/src/stretch/dynav/ok_robot_hw/camera.py b/src/stretch/dynav/ok_robot_hw/camera.py new file mode 100644 index 000000000..d6fa8df07 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/camera.py @@ -0,0 +1,68 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +## License: Apache 2.0. See LICENSE file in root directory. +## Copyright(c) 2017 Intel Corporation. All Rights Reserved. + +##################################################### +## Align Depth to Color ## +##################################################### + +import cv2 +import matplotlib.pyplot as plt +import numpy as np + + +class RealSenseCamera: + def __init__(self, robot): + self.robot = robot + self.depth_scale = 0.001 + + # Camera intrinsics + intrinsics = self.robot.get_camera_K() + self.fy = intrinsics[0, 0] + self.fx = intrinsics[1, 1] + self.cy = intrinsics[0, 2] + self.cx = intrinsics[1, 2] + print(self.fx, self.fy, self.cx, self.cy) + + # selected ix and iy coordinates + self.ix, self.iy = None, None + + def capture_image(self, visualize=False): + self.rgb_image, self.depth_image, self.points = self.robot.get_images(compute_xyz=True) + self.rgb_image = np.rot90(self.rgb_image, k=1)[:, :, [2, 1, 0]] + self.depth_image = np.rot90(self.depth_image, k=1) + self.points = np.rot90(self.points, k=1) + + cv2.imwrite("./images/input.jpg", np.rot90(self.rgb_image, k=-1)) + # cv2.imwrite("depth.jpg", np.rot90(self.depth_image, k=-1)/np.max(self.depth_image)) + self.rgb_image = cv2.cvtColor(self.rgb_image, cv2.COLOR_BGR2RGB) + + if visualize: + fig, ax = plt.subplots(1, 2, figsize=(10, 5)) + timer = fig.canvas.new_timer( + interval=5000 + ) # creating a timer object and setting an interval of 3000 milliseconds + timer.add_callback(lambda: plt.close()) + + ax[0].imshow(np.rot90(self.rgb_image, k=-1)) + ax[0].set_title("Color Image") + + ax[1].imshow(np.rot90(self.depth_image, k=-1)) + ax[1].set_title("Depth Image") + + plt.savefig("./images/rgb_dpt.png") + plt.pause(3) + plt.close() + + return self.rgb_image, self.depth_image, self.points + + def pixel2d_to_point3d(self, ix, iy): + return self.points[iy, ix][[1, 0, 2]] diff --git a/src/stretch/dynav/ok_robot_hw/global_parameters.py b/src/stretch/dynav/ok_robot_hw/global_parameters.py new file mode 100644 index 000000000..574e04959 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/global_parameters.py @@ -0,0 +1,23 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. 
Original +# license information maybe found below, if so. + +POS_TOL = 0.1 +YAW_TOL = 0.2 + +TOP_CAMERA_NODE = "camera_depth_optical_frame" +GRIPPER_FINGERTIP_LEFT_NODE = "link_gripper_fingertip_left" +GRIPPER_FINGERTIP_RIGHT_NODE = "link_gripper_fingertip_right" + +INIT_LIFT_POS = 0.45 +INIT_WRIST_PITCH = -1.57 +INIT_ARM_POS = 0 +INIT_WRIST_ROLL = 0 +INIT_WRIST_YAW = 0 +INIT_HEAD_PAN = -1.57 +INIT_HEAD_TILT = -0.65 diff --git a/src/stretch/dynav/ok_robot_hw/image_publisher.py b/src/stretch/dynav/ok_robot_hw/image_publisher.py new file mode 100644 index 000000000..c373e4d45 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/image_publisher.py @@ -0,0 +1,83 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import numpy as np +from PIL import Image as PILImage + +from stretch.dynav.ok_robot_hw.utils.communication_utils import ( + recv_array, + send_array, + send_depth_img, + send_rgb_img, +) + + +class ImagePublisher: + def __init__(self, camera, socket): + self.camera = camera + self.socket = socket + + def publish_image(self, text, mode, head_tilt=-1): + image, depth, points = self.camera.capture_image() + # camera_pose = self.camera.robot.head.get_pose_in_base_coords() + + rotated_image = np.rot90(image, k=-1) + rotated_depth = np.rot90(depth, k=-1) + rotated_point = np.rot90(points, k=-1) + PILImage.fromarray(rotated_image).save("./test_rgb.png") + np.save("./test_depth", rotated_depth) + + ## Send RGB, depth and camera intrinsics data + send_rgb_img(self.socket, rotated_image) + print(self.socket.recv_string()) + send_depth_img(self.socket, rotated_depth) + print(self.socket.recv_string()) + # send_array(self.socket, rotated_image) + # print(self.socket.recv_string()) + # send_array(self.socket, rotated_depth) + # print(self.socket.recv_string()) + send_array( + self.socket, + np.array( + [ + self.camera.fy, + self.camera.fx, + self.camera.cy, + self.camera.cx, + int(head_tilt * 100), + ] + ), + ) + print(self.socket.recv_string()) + + ## Sending Object text and Manipulation mode + self.socket.send_string(text) + print(self.socket.recv_string()) + self.socket.send_string(mode) + print(self.socket.recv_string()) + + ## Waiting for the base and camera transforms to center the object vertically and horizontally + self.socket.send_string("Waiting for gripper pose/ base and head trans from workstation") + translation = recv_array(self.socket) + self.socket.send_string("translation received by robot") + rotation = recv_array(self.socket) + self.socket.send_string("rotation received by robot") + add_data = recv_array(self.socket) + self.socket.send_string(f"Additional data received robot") + + depth = add_data[0] + width = add_data[1] + retry = add_data[2] + print(f"Additional data received - {add_data}") + print("translation: ") + print(translation) + print("rotation: ") + print(rotation) + print(self.socket.recv_string()) + return translation, rotation, depth, width, retry diff --git a/src/stretch/dynav/ok_robot_hw/robot.py b/src/stretch/dynav/ok_robot_hw/robot.py new file mode 100644 index 000000000..d65b11cdd --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/robot.py @@ -0,0 +1,305 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. 
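To make the request/reply handshake in `ImagePublisher.publish_image` above concrete, the following is a hypothetical workstation-side counterpart (not part of this patch): a zmq REP socket that consumes the robot's messages in order and replies with the estimated grasp. The port, acknowledgement strings, and returned values are placeholders; only the message ordering is taken from the code above.

```python
import numpy as np
import zmq

from stretch.dynav.ok_robot_hw.utils.communication_utils import (
    recv_array,
    recv_depth_img,
    recv_rgb_img,
    send_array,
)

ctx = zmq.Context()
rep = ctx.socket(zmq.REP)
rep.bind("tcp://*:5556")  # illustrative port

rgb = recv_rgb_img(rep)
rep.send_string("RGB received")
depth = recv_depth_img(rep)
rep.send_string("Depth received")
intrinsics_and_tilt = recv_array(rep)
rep.send_string("Intrinsics received")
text = rep.recv_string()
rep.send_string("Text received")
mode = rep.recv_string()
rep.send_string("Mode received")

_ = rep.recv_string()                         # "Waiting for gripper pose..." from the robot
send_array(rep, np.zeros(3))                  # translation
_ = rep.recv_string()
send_array(rep, np.zeros(3))                  # rotation
_ = rep.recv_string()
send_array(rep, np.array([0.5, 0.08, 0.0]))   # [depth, width, retry]; retry == 0 ends the retry loop
_ = rep.recv_string()
rep.send_string("Done")
```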
+# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import os + +import numpy as np +import pinocchio as pin + +# from urdf_parser_py.urdf import URDF +from scipy.spatial.transform import Rotation as R + +from stretch.dynav.ok_robot_hw.global_parameters import * +from stretch.dynav.ok_robot_hw.utils import transform_joint_array +from stretch.motion.kinematics import HelloStretchIdx + +OVERRIDE_STATES: dict[str, float] = {} + + +class HelloRobot: + def __init__( + self, + robot, + stretch_client_urdf_file="src/stretch/config/urdf", + gripper_threshold=7.0, + stretch_gripper_max=0.64, + stretch_gripper_min=0, + end_link="link_gripper_s3_body", + ): + self.STRETCH_GRIPPER_MAX = stretch_gripper_max + self.STRETCH_GRIPPER_MIN = stretch_gripper_min + self.urdf_path = os.path.join(stretch_client_urdf_file, "stretch.urdf") + self.joints_pin = {"joint_fake": 0} + + self.GRIPPER_THRESHOLD = gripper_threshold + + print("hello robot starting") + self.head_joint_list = ["joint_fake", "joint_head_pan", "joint_head_tilt"] + self.init_joint_list = [ + "joint_fake", + "joint_lift", + "3", + "2", + "1", + "0", + "joint_wrist_yaw", + "joint_wrist_pitch", + "joint_wrist_roll", + "joint_gripper_finger_left", + ] + + # end_link is the frame of reference node + self.end_link = end_link + self.set_end_link(end_link) + + # Initialize StretchClient controller + self.robot = robot + self.robot.switch_to_manipulation_mode() + # time.sleep(2) + + # Constraining the robots movement + self.clamp = lambda n, minn, maxn: max(min(maxn, n), minn) + self.pan, self.tilt = self.robot.get_pan_tilt() + + def set_end_link(self, link): + if link == GRIPPER_FINGERTIP_LEFT_NODE or GRIPPER_FINGERTIP_RIGHT_NODE: + self.joint_list = self.init_joint_list + else: + self.joint_list = self.init_joint_list[:-1] + + def get_joints(self): + """ + Returns all the joint names and values involved in forward kinematics of head and gripper + """ + ## Names of all 13 joints + joint_names = ( + self.init_joint_list + ["joint_gripper_finger_right"] + self.head_joint_list[1:] + ) + self.updateJoints() + joint_values = list(self.joints.values()) + [0] + list(self.head_joints.values())[1:] + + return joint_names, joint_values + + def move_to_position( + self, + lift_pos=None, + arm_pos=None, + base_trans=0.0, + wrist_yaw=None, + wrist_pitch=None, + wrist_roll=None, + gripper_pos=None, + base_theta=None, + head_tilt=None, + head_pan=None, + blocking=True, + ): + """ + Moves the robots, base, arm, lift, wrist and head to a desired position. 
+ """ + if base_theta is not None: + self.robot.navigate_to([0, 0, base_theta], relative=True, blocking=True) + return + + # Base, arm and lift state update + target_state = self.robot.get_six_joints() + if not gripper_pos is None: + self.CURRENT_STATE = ( + gripper_pos * (self.STRETCH_GRIPPER_MAX - self.STRETCH_GRIPPER_MIN) + + self.STRETCH_GRIPPER_MIN + ) + self.robot.gripper_to(self.CURRENT_STATE, blocking=blocking) + if not arm_pos is None: + target_state[2] = arm_pos + if not lift_pos is None: + target_state[1] = lift_pos + if base_trans is None: + base_trans = 0 + target_state[0] = base_trans + target_state[0] + + # Wrist state update + if not wrist_yaw is None: + target_state[3] = wrist_yaw + if not wrist_pitch is None: + target_state[4] = min(wrist_pitch, 0.1) + if not wrist_roll is None: + target_state[5] = wrist_roll + + # Actual Movement + print("Expected", target_state) + print("Actual", self.robot.get_six_joints()) + print("Error", target_state - self.robot.get_six_joints()) + # print('Target Position', target_state) + # print('pan tilt before', self.robot.get_pan_tilt()) + self.robot.arm_to(target_state, blocking=blocking, head=np.array([self.pan, self.tilt])) + # print('pan tilt after', self.robot.get_pan_tilt()) + # print('Actual location', self.robot.get_six_joints()) + + # Head state update and Movement + # target_head_pan, target_head_tilt = self.robot.get_pan_tilt() + target_head_pan = self.pan + target_head_tilt = self.tilt + if not head_tilt is None: + target_head_tilt = head_tilt + self.tilt = head_tilt + if not head_pan is None: + target_head_pan = head_pan + self.pan = head_pan + self.robot.head_to(head_tilt=target_head_tilt, head_pan=target_head_pan, blocking=blocking) + # self.pan, self.tilt = self.robot.get_pan_tilt() + # time.sleep(0.7) + + def pickup(self, width): + """ + Code for grasping the object + Gripper closes gradually until it encounters resistance + """ + next_gripper_pos = width + while True: + self.robot.gripper_to( + max(next_gripper_pos * self.STRETCH_GRIPPER_MAX, -0.2), blocking=True + ) + curr_gripper_pose = self.robot.get_gripper_position() + # print('Robot means to move gripper to', next_gripper_pos * self.STRETCH_GRIPPER_MAX) + # print('Robot actually moves gripper to', curr_gripper_pose) + if next_gripper_pos == -1: + break + + if next_gripper_pos > 0: + next_gripper_pos -= 0.35 + else: + next_gripper_pos = -1 + + def updateJoints(self): + """ + update all the current positions of joints + """ + state = self.robot.get_six_joints() + origin_dist = state[0] + + # Head Joints + pan, tilt = self.robot.get_pan_tilt() + + self.joints_pin["joint_fake"] = origin_dist + self.joints_pin["joint_lift"] = state[1] + armPos = state[2] + self.joints_pin["joint_arm_l3"] = armPos / 4.0 + self.joints_pin["joint_arm_l2"] = armPos / 4.0 + self.joints_pin["joint_arm_l1"] = armPos / 4.0 + self.joints_pin["joint_arm_l0"] = armPos / 4.0 + self.joints_pin["joint_wrist_yaw"] = state[3] + self.joints_pin["joint_wrist_roll"] = state[5] + self.joints_pin["joint_wrist_pitch"] = OVERRIDE_STATES.get("wrist_pitch", state[4]) + self.joints_pin["joint_gripper_finger_left"] = 0 + + self.joints_pin["joint_head_pan"] = pan + self.joints_pin["joint_head_tilt"] = tilt + + # following function is used to move the robot to a desired joint configuration + def move_to_joints(self, joints, gripper, mode=0): + """ + Given the desired joints movement this function will the joints accordingly + """ + state = self.robot.get_six_joints() + + # clamp rotational joints between -1.57 to 1.57 
+ joints["joint_wrist_pitch"] = (joints["joint_wrist_pitch"] + 1.57) % 3.14 - 1.57 + joints["joint_wrist_yaw"] = (joints["joint_wrist_yaw"] + 1.57) % 3.14 - 1.57 + joints["joint_wrist_roll"] = (joints["joint_wrist_roll"] + 1.57) % 3.14 - 1.57 + joints["joint_wrist_pitch"] = self.clamp(joints["joint_wrist_pitch"], -1.57, 0.1) + target_state = [ + joints["joint_fake"], + joints["joint_lift"], + joints["3"] + joints["2"] + joints["1"] + joints["0"], + joints["joint_wrist_yaw"], + joints["joint_wrist_pitch"], + joints["joint_wrist_roll"], + ] + + # print('pan tilt before', self.robot.get_pan_tilt()) + + # Moving only the lift first + if mode == 1: + target1 = state + target1[0] = target_state[0] + target1[1] = min(1.1, target_state[1] + 0.2) + self.robot.arm_to(target1, blocking=True, head=np.array([self.pan, self.tilt])) + + self.robot.arm_to(target_state, blocking=True, head=np.array([self.pan, self.tilt])) + self.robot.head_to(head_tilt=self.tilt, head_pan=self.pan, blocking=True) + + self.robot.arm_to(target_state, blocking=True, head=np.array([self.pan, self.tilt])) + self.robot.head_to(head_tilt=self.tilt, head_pan=self.pan, blocking=True) + + # print('pan tilt after', self.robot.get_pan_tilt()) + # print(f"current state {self.robot.get_six_joints()}") + # print(f"target state {target_state}") + # time.sleep(1) + + # NOTE: below code is to fix the pitch drift issue in current hello-robot. Remove it if there is no pitch drift issue + OVERRIDE_STATES["wrist_pitch"] = joints["joint_wrist_pitch"] + + def get_joint_transform(self, node1, node2): + """ + This function takes two nodes from a robot URDF file as input and + outputs the coordinate frame of node2 relative to the coordinate frame of node1. + + Mainly used for transforming coordinates from camera frame to gripper frame. 
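The wrapping applied at the top of `move_to_joints` above, `(angle + 1.57) % 3.14 - 1.57`, folds any wrist angle into [-1.57, 1.57) before the pitch is additionally clamped. A quick numeric check (illustrative only):

```python
def wrap_half_pi(angle: float) -> float:
    return (angle + 1.57) % 3.14 - 1.57

for a in [0.3, 1.8, -2.0, 2.0]:
    print(a, "->", round(wrap_half_pi(a), 2))
# 0.3 -> 0.3, 1.8 -> -1.34, -2.0 -> 1.14, 2.0 -> -1.14
```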
+ """ + + # return frame_transform, frame2, frame1 + self.updateJoints() + frame_pin = self.robot.get_frame_pose(self.joints_pin, node1, node2) + + return frame_pin + + def move_to_pose(self, translation_tensor, rotational_tensor, gripper, move_mode=0): + """ + Function to move the gripper to a desired translation and rotation + """ + translation = [translation_tensor[0], translation_tensor[1], translation_tensor[2]] + rotation = rotational_tensor + + self.updateJoints() + + q = self.robot.get_joint_positions() + q[HelloStretchIdx.WRIST_PITCH] = OVERRIDE_STATES.get( + "wrist_pitch", q[HelloStretchIdx.WRIST_PITCH] + ) + pin_pose = self.robot.get_ee_pose(matrix=True, link_name=self.end_link, q=q) + pin_rotation, pin_translation = pin_pose[:3, :3], pin_pose[:3, 3] + pin_curr_pose = pin.SE3(pin_rotation, pin_translation) + + rot_matrix = R.from_euler("xyz", rotation, degrees=False).as_matrix() + + pin_del_pose = pin.SE3(np.array(rot_matrix), np.array(translation)) + pin_goal_pose_new = pin_curr_pose * pin_del_pose + + final_pos = pin_goal_pose_new.translation.tolist() + final_quat = pin.Quaternion(pin_goal_pose_new.rotation).coeffs().tolist() + # print(f"final pos and quat {final_pos}\n {final_quat}") + + full_body_cfg = self.robot.solve_ik( + final_pos, final_quat, None, False, custom_ee_frame=self.end_link + ) + if full_body_cfg is None: + print("Warning: Cannot find an IK solution for desired EE pose!") + return False + + pin_joint_pos = self.robot._extract_joint_pos(full_body_cfg) + transform_joint_pos = transform_joint_array(pin_joint_pos) + + self.joint_array1 = transform_joint_pos + + ik_joints = {} + for joint_index in range(len(self.joint_array1)): + ik_joints[self.joint_list[joint_index]] = self.joint_array1[joint_index] + + # Actual Movement of joints + self.move_to_joints(ik_joints, gripper, move_mode) diff --git a/src/stretch/dynav/ok_robot_hw/utils/__init__.py b/src/stretch/dynav/ok_robot_hw/utils/__init__.py new file mode 100644 index 000000000..5baa346ae --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/utils/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +from .communication_utils import * +from .grasper_utils import * +from .utils import * diff --git a/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py b/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py new file mode 100644 index 000000000..6e9775237 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/utils/communication_utils.py @@ -0,0 +1,65 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
+ +import cv2 +import numpy as np +import zmq + + +# use zmq to send a numpy array +def send_array(socket, A, flags=0, copy=True, track=False): + """send a numpy array with metadata""" + A = np.array(A) + md = dict( + dtype=str(A.dtype), + shape=A.shape, + ) + socket.send_json(md, flags | zmq.SNDMORE) + return socket.send(np.ascontiguousarray(A), flags, copy=copy, track=track) + + +# use zmq to receive a numpy array +def recv_array(socket, flags=0, copy=True, track=False): + """recv a numpy array""" + md = socket.recv_json(flags=flags) + msg = socket.recv(flags=flags, copy=copy, track=track) + A = np.frombuffer(msg, dtype=md["dtype"]) + return A.reshape(md["shape"]) + + +def send_rgb_img(socket, img): + img = img.astype(np.uint8) + encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90] + _, img_encoded = cv2.imencode(".jpg", img, encode_param) + socket.send(img_encoded.tobytes()) + + +def recv_rgb_img(socket): + img = socket.recv() + img = np.frombuffer(img, dtype=np.uint8) + img = cv2.imdecode(img, cv2.IMREAD_COLOR) + return img + + +def send_depth_img(socket, depth_img): + depth_img = (depth_img * 1000).astype(np.uint16) + encode_param = [ + int(cv2.IMWRITE_PNG_COMPRESSION), + 3, + ] # Compression level from 0 (no compression) to 9 (max compression) + _, depth_img_encoded = cv2.imencode(".png", depth_img, encode_param) + socket.send(depth_img_encoded.tobytes()) + + +def recv_depth_img(socket): + depth_img = socket.recv() + depth_img = np.frombuffer(depth_img, dtype=np.uint8) + depth_img = cv2.imdecode(depth_img, cv2.IMREAD_UNCHANGED) + depth_img = depth_img / 1000.0 + return depth_img diff --git a/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py new file mode 100644 index 000000000..2b0c7e279 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/utils/grasper_utils.py @@ -0,0 +1,248 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
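A quick round-trip check (illustrative only, not part of the patch) of the image helpers defined in `communication_utils.py` above, using an in-process zmq PAIR socket pair; because depth is encoded as 16-bit PNG in millimetres, values survive to within 1 mm:

```python
import numpy as np
import zmq

from stretch.dynav.ok_robot_hw.utils.communication_utils import recv_depth_img, send_depth_img

ctx = zmq.Context()
a, b = ctx.socket(zmq.PAIR), ctx.socket(zmq.PAIR)
a.bind("inproc://depth-demo")
b.connect("inproc://depth-demo")

depth = np.random.rand(120, 160).astype(np.float32) * 3.0  # metres
send_depth_img(a, depth)
depth_back = recv_depth_img(b)
print(np.abs(depth_back - depth).max() <= 1e-3)  # True: quantised to 1 mm
```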
+ +import time + +import numpy as np +import pinocchio as pin + +from stretch.dynav.ok_robot_hw.global_parameters import * +from stretch.dynav.ok_robot_hw.image_publisher import ImagePublisher +from stretch.dynav.ok_robot_hw.utils.utils import apply_se3_transform + + +def capture_and_process_image(camera, mode, obj, socket, hello_robot): + + print("Currently in " + mode + " mode and the robot is about to manipulate " + obj + ".") + + image_publisher = ImagePublisher(camera, socket) + + # Centering the object + head_tilt_angles = [0, -0.1, 0.1] + tilt_retries, side_retries = 1, 0 + retry_flag = True + head_tilt = INIT_HEAD_TILT + head_pan = INIT_HEAD_PAN + + while retry_flag: + translation, rotation, depth, width, retry_flag = image_publisher.publish_image( + obj, mode, head_tilt=head_tilt + ) + + print(f"retry flag : {retry_flag}") + if retry_flag == 1: + base_trans = translation[0] + head_tilt += rotation[0] + + hello_robot.move_to_position( + base_trans=base_trans, head_pan=head_pan, head_tilt=head_tilt + ) + time.sleep(1) + + elif retry_flag != 0 and side_retries == 3: + print("Tried in all angles but couldn't succeed") + time.sleep(1) + return None, None, None, None + + elif side_retries == 2 and tilt_retries == 3: + hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) + side_retries = 3 + + elif retry_flag == 2: + if tilt_retries == 3: + if side_retries == 0: + hello_robot.move_to_position(base_trans=0.1, head_tilt=head_tilt) + side_retries = 1 + else: + hello_robot.move_to_position(base_trans=-0.2, head_tilt=head_tilt) + side_retries = 2 + tilt_retries = 1 + else: + print(f"retrying with head tilt : {head_tilt + head_tilt_angles[tilt_retries]}") + hello_robot.move_to_position( + head_pan=head_pan, head_tilt=head_tilt + head_tilt_angles[tilt_retries] + ) + tilt_retries += 1 + time.sleep(1) + + if mode == "place": + translation = np.array([-translation[1], -translation[0], -translation[2]]) + + if mode == "pick": + return rotation, translation, depth, width + else: + return rotation, translation + + +def move_to_point(robot, point, base_node, gripper_node, move_mode=1, pitch_rotation=0): + """ + Function for moving the gripper to a specific point + """ + rotation = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + + dest_frame = pin.SE3(rotation, point) + transform = robot.get_joint_transform(base_node, gripper_node) + + # Rotation from gripper frame frame to gripper frame + transformed_frame = transform * dest_frame + + transformed_frame.translation[2] -= 0.2 + + robot.move_to_pose( + [ + transformed_frame.translation[0], + transformed_frame.translation[1], + transformed_frame.translation[2], + ], + [pitch_rotation, 0, 0], + [1], + move_mode=move_mode, + ) + + +def pickup( + robot, + rotation, + translation, + base_node, + gripper_node, + gripper_height=0.03, + gripper_depth=0.03, + gripper_width=1, +): + """ + rotation: Relative rotation of gripper pose w.r.t camera + translation: Relative translation of gripper pose w.r.t camera + base_node: Camera Node + + Supports home robot top down grasping as well + + Graping trajectory steps + 1. Rotation of gripper + 2. Lift the gripper + 3. Move the base such gripper in line with the grasp + 4. 
Gradually Move the gripper to the desired position + """ + # Transforming the final point from Model camera frame to robot camera frame + pin_point = np.array([-translation[1], -translation[0], translation[2]]) + + # Rotation from Camera frame to Model frame + rotation1_bottom_mat = np.array( + [ + [0.0000000, -1.0000000, 0.0000000], + [-1.0000000, 0.0000000, 0.0000000], + [0.0000000, 0.0000000, 1.0000000], + ] + ) + + # Rotation from model frame to pose frame + rotation1_mat = np.array( + [ + [rotation[0][0], rotation[0][1], rotation[0][2]], + [rotation[1][0], rotation[1][1], rotation[1][2]], + [rotation[2][0], rotation[2][1], rotation[2][2]], + ] + ) + + # Rotation from camera frame to pose frame + pin_rotation = np.dot(rotation1_bottom_mat, rotation1_mat) + # print(f"pin rotation{pin_rotation}") + + # Relative rotation and translation of grasping point relative to camera + pin_dest_frame = pin.SE3(np.array(pin_rotation), np.array(pin_point)) + # print(f"pin dest frame {pin_dest_frame}") + + # Camera to gripper frame transformation + pin_cam2gripper_transform = robot.get_joint_transform(base_node, gripper_node) + + # transformed_frame = del_pose * dest_frame + pin_transformed_frame = pin_cam2gripper_transform * pin_dest_frame + # print(f"pin_transformed frame {pin_transformed_frame}") + + # Lifting the arm to high position as part of pregrasping position + print("pan, tilt before", robot.robot.get_pan_tilt()) + robot.move_to_position(gripper_pos=gripper_width) + robot.move_to_position(lift_pos=1.05, head_pan=None, head_tilt=None) + print("pan, tilt after", robot.robot.get_pan_tilt()) + + # Rotation for aligning Robot gripper frame to Model gripper frame + rotation2_top_mat = np.array([[0, 0, 1], [1, 0, 0], [0, -1, 0]]) + + # final Rotation of gripper to hold the objcet + pin_final_rotation = np.dot(pin_transformed_frame.rotation, rotation2_top_mat) + print(f"pin final rotation {pin_final_rotation}") + + rpy_angles = pin.rpy.matrixToRpy(pin_final_rotation) + print("pan, tilt before", robot.robot.get_pan_tilt()) + robot.move_to_pose( + [0, 0, 0], + [rpy_angles[0], rpy_angles[1], rpy_angles[2]], + [1], + ) + print("pan, tilt after", robot.robot.get_pan_tilt()) + + # Final grasping point relative to camera + pin_cam2gripper_transform = robot.get_joint_transform(base_node, gripper_node) + pin_transformed_point1 = apply_se3_transform(pin_cam2gripper_transform, pin_point) + # print(f"pin transformed point1 {pin_transformed_point1}") + + # Final grasping point relative to base + pin_cam2base_transform = robot.get_joint_transform(base_node, "base_link") + pin_base_point = apply_se3_transform(pin_cam2base_transform, pin_point) + # print(f"pin base point {pin_base_point}") + + diff_value = ( + 0.226 - gripper_depth - gripper_height + ) # 0.228 is the distance between link_Straight_gripper node and the gripper tip + pin_transformed_point1[2] -= diff_value + ref_diff = diff_value + + # Moving gripper to a point that is 0.2m away from the pose center in the line of gripper + print("pan, tilt before", robot.robot.get_pan_tilt()) + robot.move_to_pose( + [pin_transformed_point1[0], pin_transformed_point1[1], pin_transformed_point1[2] - 0.2], + [0, 0, 0], + [1], + move_mode=1, + ) + print("pan, tilt after", robot.robot.get_pan_tilt()) + + # Z-Axis of link_straight_gripper points in line of gripper + # So, the z co-ordiante of point w.r.t gripper gives the distance of point from gripper + pin_base2gripper_transform = robot.get_joint_transform("base_link", gripper_node) + pin_transformed_point2 = 
apply_se3_transform(pin_base2gripper_transform, pin_base_point) + curr_diff = pin_transformed_point2[2] + + # The distance between gripper and point is covered gradullay to allow for velocity control when it approaches the object + # Lower velocity helps is not topping the light objects + diff = abs(curr_diff - ref_diff) + if diff > 0.08: + dist = diff - 0.08 + state = robot.robot.get_six_joints() + state[1] += 0.02 + robot.robot.arm_to(state, blocking=True) + robot.move_to_pose([0, 0, dist], [0, 0, 0], [1]) + diff = diff - dist + + while diff > 0.01: + dist = min(0.03, diff) + robot.move_to_pose([0, 0, dist], [0, 0, 0], [1]) + diff = diff - dist + + # Now the gripper reached the grasping point and starts picking procedure + robot.pickup(gripper_width) + + # Lifts the arm + robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1] + 0.2, 1.1)) + + # Tucks the gripper so that while moving to place it won't collide with any obstacles + robot.move_to_position(arm_pos=0.01) + robot.move_to_position(wrist_pitch=0.0) + robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1], 0.9), wrist_yaw=2.5) + robot.move_to_position(lift_pos=min(robot.robot.get_six_joints()[1], 0.55)) diff --git a/src/stretch/dynav/ok_robot_hw/utils/utils.py b/src/stretch/dynav/ok_robot_hw/utils/utils.py new file mode 100644 index 000000000..27ace6fb0 --- /dev/null +++ b/src/stretch/dynav/ok_robot_hw/utils/utils.py @@ -0,0 +1,32 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +import numpy as np + + +def apply_se3_transform(se3_obj, point): + homogeneous_point = np.append(point.flatten(), 1) + print(homogeneous_point) + transformed_homogeneous_point = se3_obj.homogeneous.dot(homogeneous_point) + transformed_point = transformed_homogeneous_point[:3] + + return transformed_point + + +def transform_joint_array(joint_array): + n = len(joint_array) + new_joint_array = [] + for i in range(n + 3): + if i < 2: + new_joint_array.append(joint_array[i]) + elif i < 6: + new_joint_array.append(joint_array[2] / 4.0) + else: + new_joint_array.append(joint_array[i - 3]) + return np.array(new_joint_array) diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py new file mode 100644 index 000000000..96ed5cfc3 --- /dev/null +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -0,0 +1,369 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
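For reference, `transform_joint_array` in `utils.py` above expands the single arm-extension value into the four telescoping arm segments expected by the joint list used for IK in `robot.py`. A small illustration with made-up joint values, ordered [base_x, lift, arm, wrist_yaw, wrist_pitch, wrist_roll, gripper]:

```python
import numpy as np

from stretch.dynav.ok_robot_hw.utils import transform_joint_array

pin_joints = np.array([0.10, 0.60, 0.20, 0.00, -0.50, 0.00, 0.00])
print(transform_joint_array(pin_joints))
# base_x and lift pass through; the 0.20 m arm extension becomes four 0.05 m segments,
# followed by the three wrist angles and the gripper value.
```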
+ +import datetime +import time +from typing import Any, Dict + +import numpy as np +import zmq + +from stretch.agent import RobotClient +from stretch.core.parameters import Parameters +from stretch.dynav.communication_util import recv_array, send_array, send_everything +from stretch.dynav.ok_robot_hw.camera import RealSenseCamera +from stretch.dynav.ok_robot_hw.global_parameters import * +from stretch.dynav.ok_robot_hw.robot import HelloRobot as Manipulation_Wrapper +from stretch.dynav.ok_robot_hw.utils.grasper_utils import ( + capture_and_process_image, + move_to_point, + pickup, +) + + +class RobotAgentMDP: + """Basic demo code. Collects everything that we need to make this work.""" + + _retry_on_fail = False + + def __init__( + self, + robot: RobotClient, + parameters: Dict[str, Any], + ip: str, + image_port: int = 5558, + text_port: int = 5556, + manip_port: int = 5557, + re: int = 1, + env_num: int = 1, + test_num: int = 1, + method: str = "dynamem", + ): + print("------------------------YOU ARE NOW RUNNING PEIQI CODES V3-----------------") + self.re = re + if isinstance(parameters, Dict): + self.parameters = Parameters(**parameters) + elif isinstance(parameters, Parameters): + self.parameters = parameters + else: + raise RuntimeError(f"parameters of unsupported type: {type(parameters)}") + self.robot = robot + if re == 1: + stretch_gripper_max = 0.3 + end_link = "link_straight_gripper" + else: + stretch_gripper_max = 0.64 + end_link = "link_gripper_s3_body" + self.transform_node = end_link + self.manip_wrapper = Manipulation_Wrapper( + self.robot, stretch_gripper_max=stretch_gripper_max, end_link=end_link + ) + self.robot.move_to_nav_posture() + + self.normalize_embeddings = True + self.pos_err_threshold = 0.35 + self.rot_err_threshold = 0.4 + self.obs_count = 0 + self.guarantee_instance_is_reachable = parameters.guarantee_instance_is_reachable + + self.image_sender = ImageSender( + ip=ip, image_port=image_port, text_port=text_port, manip_port=manip_port + ) + if method == "dynamem": + from stretch.dynav.voxel_map_server import ImageProcessor as VoxelMapImageProcessor + + self.image_processor = VoxelMapImageProcessor( + rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) + ) # type: ignore + elif method == "mllm": + from stretch.dynav.llm_server import ImageProcessor as mLLMImageProcessor + + self.image_processor = mLLMImageProcessor( + rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) + ) # type: ignore + + self.look_around_times: list[float] = [] + self.execute_times: list[float] = [] + + timestamp = f"{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}" + + def look_around(self): + print("*" * 10, "Look around to check", "*" * 10) + for pan in [0.4, -0.4, -1.2]: + for tilt in [-0.6]: + self.robot.head_to(pan, tilt, blocking=True) + self.update() + + def rotate_in_place(self): + print("*" * 10, "Rotate in place", "*" * 10) + xyt = self.robot.get_base_pose() + self.robot.head_to(head_pan=0, head_tilt=-0.6, blocking=True) + for i in range(8): + xyt[2] += 2 * np.pi / 8 + self.robot.navigate_to(xyt, blocking=True) + self.update() + + def update(self): + """Step the data collector. Get a single observation of the world. Remove bad points, such as those from too far or too near the camera. 
Update the 3d world representation.""" + obs = self.robot.get_observation() + # self.image_sender.send_images(obs) + self.obs_count += 1 + rgb, depth, K, camera_pose = obs.rgb, obs.depth, obs.camera_K, obs.camera_pose + # start_time = time.time() + self.image_processor.process_rgbd_images(rgb, depth, K, camera_pose) + # end_time = time.time() + # print('Image processing takes', end_time - start_time, 'seconds.') + + def execute_action( + self, + text: str, + ): + start_time = time.time() + + self.robot.look_front() + self.look_around() + self.robot.look_front() + self.robot.switch_to_navigation_mode() + + start = self.robot.get_base_pose() + # print(" Start:", start) + # res = self.image_sender.query_text(text, start) + res = self.image_processor.process_text(text, start) + if len(res) == 0 and text != "" and text is not None: + res = self.image_processor.process_text("", start) + + look_around_finish = time.time() + look_around_take = look_around_finish - start_time + print("Path planning takes ", look_around_take, " seconds.") + # self.look_around_times.append(look_around_take) + # print(self.look_around_times) + # print(sum(self.look_around_times) / len(self.look_around_times)) + + if len(res) > 0: + print("Plan successful!") + if len(res) >= 2 and np.isnan(res[-2]).all(): + # blocking = text != '' + if len(res) > 2: + self.robot.execute_trajectory( + res[:-2], + pos_err_threshold=self.pos_err_threshold, + rot_err_threshold=self.rot_err_threshold, + blocking=True, + ) + + execution_finish = time.time() + execution_take = execution_finish - look_around_finish + print("Executing action takes ", execution_take, " seconds.") + self.execute_times.append(execution_take) + print(self.execute_times) + print(sum(self.execute_times) / len(self.execute_times)) + + return True, res[-1] + else: + self.robot.execute_trajectory( + res, + pos_err_threshold=self.pos_err_threshold, + rot_err_threshold=self.rot_err_threshold, + blocking=False, + ) + + execution_finish = time.time() + execution_take = execution_finish - look_around_finish + print("Executing action takes ", execution_take, " seconds.") + self.execute_times.append(execution_take) + print(self.execute_times) + print(sum(self.execute_times) / len(self.execute_times)) + + return False, None + else: + print("Failed. Try again!") + return None, None + + def run_exploration(self): + """Go through exploration. We use the voxel_grid map created by our collector to sample free space, and then use our motion planner (RRT for now) to get there. At the end, we plan back to (0,0,0). + + Args: + visualize(bool): true if we should do intermediate debug visualizations""" + status, _ = self.execute_action("") + if status is None: + print("Exploration failed! Perhaps nowhere to explore!") + return False + return True + + def navigate(self, text, max_step=10): + finished = False + step = 0 + end_point = None + while not finished and step < max_step: + print("*" * 20, step, "*" * 20) + step += 1 + finished, end_point = self.execute_action(text) + if finished is None: + print("Navigation failed! The path might be blocked!") + return None + return end_point + + def place(self, text, init_tilt=INIT_HEAD_TILT, base_node=TOP_CAMERA_NODE): + """ + An API for running placing. 
By calling this API, the user asks the robot to place whatever it is holding
+        onto the object specified by the text query.
+        - hello_robot: a wrapper for the home-robot StretchClient controller
+        - socket: used to communicate with the workstation to get the estimated gripper pose
+        - text: query specifying the target object
+        - transform node: node name for the coordinate system of the target gripper pose (usually the coordinate system on the robot gripper)
+        - base node: node name for the coordinate system of the estimated gripper poses given by anygrasp
+        """
+        self.robot.switch_to_manipulation_mode()
+        self.robot.look_at_ee()
+        self.manip_wrapper.move_to_position(head_pan=INIT_HEAD_PAN, head_tilt=init_tilt)
+        camera = RealSenseCamera(self.robot)
+
+        rotation, translation = capture_and_process_image(
+            camera=camera,
+            mode="place",
+            obj=text,
+            socket=self.image_sender.manip_socket,
+            hello_robot=self.manip_wrapper,
+        )
+
+        if rotation is None:
+            return False
+
+        # Lift the arm to the top before extending it, and prepare the pre-placing gripper pose
+        self.manip_wrapper.move_to_position(lift_pos=1.05)
+        self.manip_wrapper.move_to_position(wrist_yaw=0, wrist_pitch=0)
+
+        # Place the object
+        move_to_point(self.manip_wrapper, translation, base_node, self.transform_node, move_mode=0)
+        self.manip_wrapper.move_to_position(gripper_pos=1, blocking=True)
+
+        # Lift the arm a little and rotate the wrist roll back and forth in case the object is still attached to the gripper
+        self.manip_wrapper.move_to_position(
+            lift_pos=min(self.manip_wrapper.robot.get_six_joints()[1] + 0.3, 1.1)
+        )
+        self.manip_wrapper.move_to_position(wrist_roll=2.5, blocking=True)
+        self.manip_wrapper.move_to_position(wrist_roll=-2.5, blocking=True)
+
+        # Retract the arm back into its stowed configuration
+        self.manip_wrapper.move_to_position(gripper_pos=1, lift_pos=1.05, arm_pos=0)
+        self.manip_wrapper.move_to_position(wrist_pitch=-1.57)
+
+        # Shift the base back to the original point, which we know is navigable in the navigation obstacle map
+        self.manip_wrapper.move_to_position(
+            base_trans=-self.manip_wrapper.robot.get_six_joints()[0]
+        )
+        return True
+
+    def manipulate(self, text, init_tilt=INIT_HEAD_TILT, base_node=TOP_CAMERA_NODE):
+        """
+        An API for running manipulation.
By calling this API, human will ask the robot to pick up objects + specified by text queries A + - hello_robot: a wrapper for home-robot StretchClient controller + - socoket: we use this to communicate with workstation to get estimated gripper pose + - text: queries specifying target object + - transform node: node name for coordinate systems of target gripper pose (usually the coordinate system on the robot gripper) + - base node: node name for coordinate systems of estimated gipper poses given by anygrasp + """ + + self.robot.switch_to_manipulation_mode() + self.robot.look_at_ee() + + gripper_pos = 1 + + self.manip_wrapper.move_to_position( + arm_pos=INIT_ARM_POS, + head_pan=INIT_HEAD_PAN, + head_tilt=init_tilt, + gripper_pos=gripper_pos, + lift_pos=INIT_LIFT_POS, + wrist_pitch=INIT_WRIST_PITCH, + wrist_roll=INIT_WRIST_ROLL, + wrist_yaw=INIT_WRIST_YAW, + ) + + camera = RealSenseCamera(self.robot) + + rotation, translation, depth, width = capture_and_process_image( + camera=camera, + mode="pick", + obj=text, + socket=self.image_sender.manip_socket, + hello_robot=self.manip_wrapper, + ) + + if rotation is None: + return False + + if width < 0.05 and self.re == 3: + gripper_width = 0.45 + elif width < 0.075 and self.re == 3: + gripper_width = 0.6 + else: + gripper_width = 1 + + if input("Do you want to do this manipulation? Y or N ") != "N": + pickup( + self.manip_wrapper, + rotation, + translation, + base_node, + self.transform_node, + gripper_depth=depth, + gripper_width=gripper_width, + ) + + # Shift the base back to the original point as we are certain that original point is navigable in navigation obstacle map + self.manip_wrapper.move_to_position( + base_trans=-self.manip_wrapper.robot.get_six_joints()[0] + ) + + return True + + def save(self): + with self.image_processor.voxel_map_lock: + self.image_processor.write_to_pickle() + + +class ImageSender: + def __init__( + self, + stop_and_photo=False, + ip="100.108.67.79", + image_port=5560, + text_port=5561, + manip_port=5557, + color_name="/camera/color", + depth_name="/camera/aligned_depth_to_color", + camera_name="/camera_pose", + slop_time_seconds=0.05, + queue_size=100, + ): + context = zmq.Context() + self.img_socket = context.socket(zmq.REQ) + self.img_socket.connect("tcp://" + str(ip) + ":" + str(image_port)) + self.text_socket = context.socket(zmq.REQ) + self.text_socket.connect("tcp://" + str(ip) + ":" + str(text_port)) + self.manip_socket = context.socket(zmq.REQ) + self.manip_socket.connect("tcp://" + str(ip) + ":" + str(manip_port)) + + def query_text(self, text, start): + self.text_socket.send_string(text) + self.text_socket.recv_string() + send_array(self.text_socket, start) + return recv_array(self.text_socket) + + def send_images(self, obs): + rgb = obs.rgb + depth = obs.depth + camer_K = obs.camera_K + camera_pose = obs.camera_pose + # data = np.concatenate((depth.shape, rgb.flatten(), depth.flatten(), camer_K.flatten(), camera_pose.flatten())) + # send_array(self.img_socket, data) + send_everything(self.img_socket, rgb, depth, camer_K, camera_pose) + # self.img_socket.recv_string() diff --git a/src/stretch/dynav/run_manip.py b/src/stretch/dynav/run_manip.py new file mode 100644 index 000000000..5bd3e6202 --- /dev/null +++ b/src/stretch/dynav/run_manip.py @@ -0,0 +1,87 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. 
+# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import click +import numpy as np + +from stretch.agent import RobotClient + +# Mapping and perception +from stretch.core.parameters import get_parameters +from stretch.dynav import RobotAgentMDP + + +def compute_tilt(camera_xyz, target_xyz): + """ + a util function for computing robot head tilts so the robot can look at the target object after navigation + - camera_xyz: estimated (x, y, z) coordinates of camera + - target_xyz: estimated (x, y, z) coordinates of the target object + """ + vector = camera_xyz - target_xyz + return -np.arctan2(vector[2], np.linalg.norm(vector[:2])) + + +@click.command() +@click.option("--ip", default="100.108.67.79", type=str) +@click.option("--manual-wait", default=False, is_flag=True) +@click.option("--random-goals", default=False, is_flag=True) +@click.option("--explore-iter", default=-1) +@click.option("--re", default=1, type=int) +@click.option( + "--input-path", + type=click.Path(), + default=None, + help="Input path with default value 'output.npy'", +) +def main( + ip, + manual_wait, + navigate_home: bool = False, + explore_iter: int = 5, + re: int = 1, + input_path: str = None, + **kwargs, +): + """ + Including only some selected arguments here. + + Args: + random_goals(bool): randomly sample frontier goals instead of looking for closest + """ + click.echo("Will connect to a Stretch robot and collect a short trajectory.") + robot = RobotClient(robot_ip="100.79.44.11") + + print("- Load parameters") + parameters = get_parameters("dynav_config.yaml") + # print(parameters) + if explore_iter >= 0: + parameters["exploration_steps"] = explore_iter + + print("- Start robot agent with data collection") + demo = RobotAgentMDP(robot, parameters, ip=ip, re=re) + + while input("STOP? Y/N") != "Y": + if input("You want to run manipulation: y/n") == "y": + text = input("Enter object name: ") + theta = -0.6 + demo.manipulate(text, theta) + + if input("You want to run placing: y/n") == "y": + text = input("Enter receptacle name: ") + theta = -0.6 + demo.place(text, theta) + + +if __name__ == "__main__": + main() diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py new file mode 100644 index 000000000..28260ce9b --- /dev/null +++ b/src/stretch/dynav/run_ok_nav.py @@ -0,0 +1,174 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
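+"""Interactive entry point for the OK-Robot / DynaMem demo.
+
+Connects to a Stretch robot, builds or loads a voxel map, then loops over a
+small text menu: "E" runs frontier exploration, "S" saves the map to a pickle
+file and quits, and anything else runs a navigate -> pick -> navigate -> place
+cycle driven by user-entered object and receptacle names.
+"""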
+ +import click +import cv2 +import numpy as np + +from stretch.agent import RobotClient + +# Mapping and perception +from stretch.core.parameters import get_parameters +from stretch.dynav import RobotAgentMDP + + +def compute_tilt(camera_xyz, target_xyz): + """ + a util function for computing robot head tilts so the robot can look at the target object after navigation + - camera_xyz: estimated (x, y, z) coordinates of camera + - target_xyz: estimated (x, y, z) coordinates of the target object + """ + if not isinstance(camera_xyz, np.ndarray): + camera_xyz = np.array(camera_xyz) + if not isinstance(target_xyz, np.ndarray): + target_xyz = np.array(target_xyz) + vector = camera_xyz - target_xyz + return -np.arctan2(vector[2], np.linalg.norm(vector[:2])) + + +@click.command() +@click.option("--ip", default="100.108.67.79", type=str) +@click.option("--manual-wait", default=False, is_flag=True) +@click.option("--random-goals", default=False, is_flag=True) +@click.option("--explore-iter", default=-1) +@click.option("--re", default=1, type=int) +@click.option("--method", default="dynamem", type=str) +@click.option("--env", default=1, type=int) +@click.option("--test", default=1, type=int) +@click.option( + "--input-path", + type=click.Path(), + default=None, + help="Input path with default value 'output.npy'", +) +def main( + ip, + manual_wait, + navigate_home: bool = False, + explore_iter: int = 5, + re: int = 1, + method: str = "dynamem", + env: int = 1, + test: int = 1, + input_path: str = None, + **kwargs, +): + """ + Including only some selected arguments here. + + Args: + random_goals(bool): randomly sample frontier goals instead of looking for closest + """ + click.echo("Will connect to a Stretch robot and collect a short trajectory.") + robot = RobotClient(robot_ip="100.79.44.11") + + print("- Load parameters") + parameters = get_parameters("dynav_config.yaml") + # print(parameters) + if explore_iter >= 0: + parameters["exploration_steps"] = explore_iter + object_to_find, location_to_place = None, None + robot.move_to_nav_posture() + robot.set_velocity(v=30.0, w=15.0) + + print("- Start robot agent with data collection") + demo = RobotAgentMDP(robot, parameters, ip=ip, re=re, env_num=env, test_num=test, method=method) + + if input_path is None: + demo.rotate_in_place() + else: + demo.image_processor.read_from_pickle(input_path) + + demo.save() + + # def keep_looking_around(): + # while True: + # # We don't want too many images in our memory + # time.sleep(0.8) + # if robot.get_six_joints()[2] > 0.7 or not robot.in_navigation_mode(): + # continue + # demo.update() + + # img_thread = threading.Thread(target=keep_looking_around) + # img_thread.daemon = True + # img_thread.start() + + while True: + mode = input("select mode? E/N/S") + if mode == "S": + demo.image_processor.write_to_pickle() + break + if mode == "E": + robot.switch_to_navigation_mode() + for epoch in range(explore_iter): + print("\n", "Exploration epoch ", epoch, "\n") + if not demo.run_exploration(): + print("Exploration failed! 
Quitting!") + continue + else: + text = None + point = None + if input("You want to run manipulation: y/n") != "n": + robot.move_to_nav_posture() + robot.switch_to_navigation_mode() + text = input("Enter object name: ") + point = demo.navigate(text) + if point is None: + print("Navigation Failure!") + cv2.imwrite(text + ".jpg", robot.get_observation().rgb[:, :, [2, 1, 0]]) + + if input("You want to run manipulation: y/n") != "n": + robot.switch_to_navigation_mode() + xyt = robot.get_base_pose() + xyt[2] = xyt[2] + np.pi / 2 + robot.navigate_to(xyt, blocking=True) + + robot.switch_to_manipulation_mode() + if text is None: + text = input("Enter object name: ") + camera_xyz = robot.get_head_pose()[:3, 3] + if point is not None: + theta = compute_tilt(camera_xyz, point) + else: + theta = -0.6 + demo.manipulate(text, theta) + robot.look_front() + + text = None + point = None + if input("You want to run placing: y/n") != "n": + robot.switch_to_navigation_mode() + text = input("Enter receptacle name: ") + point = demo.navigate(text) + if point is None: + print("Navigation Failure") + cv2.imwrite(text + ".jpg", robot.get_observation().rgb[:, :, [2, 1, 0]]) + + if input("You want to run placing: y/n") != "n": + robot.switch_to_navigation_mode() + xyt = robot.get_base_pose() + xyt[2] = xyt[2] + np.pi / 2 + robot.navigate_to(xyt, blocking=True) + + robot.switch_to_manipulation_mode() + if text is None: + text = input("Enter receptacle name: ") + camera_xyz = robot.get_head_pose()[:3, 3] + if point is not None: + theta = compute_tilt(camera_xyz, point) + else: + theta = -0.6 + demo.place(text, theta) + robot.move_to_nav_posture() + + demo.save() + + +if __name__ == "__main__": + main() diff --git a/src/stretch/dynav/scannet.py b/src/stretch/dynav/scannet.py new file mode 100644 index 000000000..c8009658d --- /dev/null +++ b/src/stretch/dynav/scannet.py @@ -0,0 +1,217 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
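+# Open-vocabulary detection prompts.  voxel_map_server.ImageProcessor passes this
+# tuple to the YOLO-World detector via yolo_model.set_classes(...), so detections
+# are restricted to (roughly) the ScanNet200-style household categories below.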
+ +CLASS_LABELS_200 = ( + "wall", + "background", + "object", + "range hood", + "door", + "stove", + "bathroom stall", + "divider", + "clothes dryer", + "file cabinet", + "cabinet", + "bathroom cabinet", + "blackboard", + "curtain", + "blinds", + "tv", + "mattress", + "bathtub", + "glass", + "shower", + "power outlet", + "water dispenser", + "bulletin board", + "printer", + "windowsill", + "bookshelf", + "dresser", + "storage organizer", + "fireplace", + "fire alarm", + "piano", + "shelf", + "kitchen counter", + "washing machine", + "stairs", + "paper cutter", + "sink", + "window", + "refrigerator", + "counter", + "closet", + "bathroom vanity", + "radiator", + "vent", + "kitchen cabinet", + "water cooler", + "doorframe", + "closet door", + "table", + "shower head", + "spray", + "inhaler", + "guitar case", + "plunger", + "toilet paper dispenser", + "adapter", + "soy sauce", + "pipe", + "bottle", + "scale", + "paper towel", + "paper towel roll", + "scissors", + "tape", + "chopsticks", + "case of water bottles", + "hand sanitizer", + "laptop", + "alcohol disinfection", + "keyboard", + "coffee maker", + "light", + "toaster", + "stuffed animal", + "toilet seat cover dispenser", + "ironing board", + "fire extinguisher", + "fruit", + "container", + "bag", + "oven", + "body wash", + "bucket", + "cd case", + "tray", + "bowl", + "speaker", + "crates", + "projector", + "book", + "school bag", + "laundry detergent", + "clothes", + "candle", + "basket", + "face wash", + "notebook", + "purse", + "trash bin", + "paper bag", + "package", + "disinfecting wipes", + "recycling bin", + "headphones", + "mouse", + "shower gel", + "dustpan", + "cup", + "vacuum cleaner", + "dish rack", + "coffee kettle", + "plants", + "rag", + "can", + "cushion", + "monitor", + "fan", + "tube", + "box", + "ball", + "bicycle", + "guitar", + "trash can", + "hand sanitizers", + "paper towel dispenser", + "whiteboard", + "bin", + "potted plant", + "tennis", + "soap dish", + "structure", + "calendar", + "dumbbell", + "fish oil", + "ottoman", + "stool", + "hand wash", + "lamp", + "toaster oven", + "music stand", + "water bottle", + "clock", + "charger", + "picture", + "bascketball", + "microwave", + "screwdriver", + "rack", + "apple", + "suitcase", + "ladder", + "ping pong ball", + "dishwasher", + "storage container", + "toilet paper holder", + "coat rack", + "soap dispenser", + "banana", + "toilet paper", + "mug", + "marker pen", + "hat", + "aerosol", + "luggage", + "poster", + "bed", + "cart", + "light switch", + "backpack", + "power strip", + "baseball", + "mustard", + "water pitcher", + "couch", + "beverage", + "toy", + "salt", + "plant", + "pillow", + "broom", + "pepper", + "muffins", + "multivitamin", + "towel", + "storage bin", + "nightstand", + "telephone", + "tissue box", + "hair dryer", + "mirror", + "sign", + "plate", + "tripod", + "chair", + "plastic bag", + "umbrella", + "paper", + "laundry hamper", + "food", + "jacket", + "computer tower", + "keyboard piano", + "person", + "machine", + "projector screen", + "shoe", +) diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py new file mode 100644 index 000000000..69bef8bf2 --- /dev/null +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -0,0 +1,418 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. 
Original +# license information maybe found below, if so. + +from typing import Optional + +import clip +import numpy as np +import torch +import torch.nn.functional as F +from sklearn.cluster import DBSCAN +from torch import Tensor + +# from ultralytics import YOLOWorld +from transformers import AutoModel, AutoProcessor, Owlv2ForObjectDetection + +from stretch.dynav.mapping_utils.voxelized_pcd import VoxelizedPointcloud + + +def get_inv_intrinsics(intrinsics): + # return intrinsics.double().inverse().to(intrinsics) + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) + inv_intrinsics = torch.zeros_like(intrinsics) + inv_intrinsics[..., 0, 0] = 1.0 / fx + inv_intrinsics[..., 1, 1] = 1.0 / fy + inv_intrinsics[..., 0, 2] = -ppx / fx + inv_intrinsics[..., 1, 2] = -ppy / fy + inv_intrinsics[..., 2, 2] = 1.0 + return inv_intrinsics + + +def get_xyz(depth, pose, intrinsics): + """Returns the XYZ coordinates for a set of points. + + Args: + depth: The depth array, with shape (B, 1, H, W) + pose: The pose array, with shape (B, 4, 4) + intrinsics: The intrinsics array, with shape (B, 3, 3) + + Returns: + The XYZ coordinates of the projected points, with shape (B, H, W, 3) + """ + if not isinstance(depth, torch.Tensor): + depth = torch.from_numpy(depth) + if not isinstance(pose, torch.Tensor): + pose = torch.from_numpy(pose) + if not isinstance(intrinsics, torch.Tensor): + intrinsics = torch.from_numpy(intrinsics) + while depth.ndim < 4: + depth = depth.unsqueeze(0) + while pose.ndim < 3: + pose = pose.unsqueeze(0) + while intrinsics.ndim < 3: + intrinsics = intrinsics.unsqueeze(0) + (bsz, _, height, width), device, dtype = depth.shape, depth.device, intrinsics.dtype + + # Gets the pixel grid. + xs, ys = torch.meshgrid( + torch.arange(0, width, device=device, dtype=dtype), + torch.arange(0, height, device=device, dtype=dtype), + indexing="xy", + ) + xy = torch.stack([xs, ys], dim=-1).flatten(0, 1).unsqueeze(0).repeat_interleave(bsz, 0) + xyz = torch.cat((xy, torch.ones_like(xy[..., :1])), dim=-1) + + # Applies intrinsics and extrinsics. 
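+    # Unprojection: each homogeneous pixel (u, v, 1) is mapped through K^-1 and scaled
+    # by its depth to get a camera-frame point, which the camera pose (extrinsics) then
+    # rotates and translates into the world frame: p_world = R @ (d * K^-1 @ [u, v, 1]) + t.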
+ # xyz = xyz @ intrinsics.inverse().transpose(-1, -2) + xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) + xyz = xyz * depth.flatten(1).unsqueeze(-1) + xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] + + xyz = xyz.unflatten(1, (height, width)) + + return xyz + + +class VoxelMapLocalizer: + def __init__( + self, + voxel_map_wrapper=None, + exist_model=True, + clip_model=None, + processor=None, + device="cuda", + siglip=True, + ): + print("Localizer V3") + self.voxel_map_wrapper = voxel_map_wrapper + self.device = device + # self.clip_model, self.preprocessor = clip.load(model_config, device=device) + self.siglip = siglip + if clip_model is not None and processor is not None: + self.clip_model, self.preprocessor = clip_model, processor + elif not self.siglip: + self.clip_model, self.preprocessor = clip.load("ViT-B/16", device=self.device) + self.clip_model.eval() + else: + self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to( + self.device + ) + self.preprocessor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384") + self.clip_model.eval() + self.voxel_pcd = VoxelizedPointcloud(voxel_size=0.05).to(self.device) + # self.exist_model = YOLOWorld("yolov8l-worldv2.pt") + self.existence_checking_model = exist_model + if exist_model: + print("WE ARE USING OWLV2!") + self.exist_processor = AutoProcessor.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ) + self.exist_model = Owlv2ForObjectDetection.from_pretrained( + "google/owlv2-base-patch16-ensemble" + ).to(self.device) + else: + print("YOU ARE USING NOTHING!") + + def add( + self, + points: Tensor, + features: Optional[Tensor], + rgb: Optional[Tensor], + weights: Optional[Tensor] = None, + obs_count: Optional[Tensor] = None, + # weight_decay: float = 0.95 + ): + points = points.to(self.device) + if features is not None: + features = features.to(self.device) + if rgb is not None: + rgb = rgb.to(self.device) + if weights is not None: + weights = weights.to(self.device) + # if weight_decay is not None and self.voxel_pcd._weights is not None: + # self.voxel_pcd._weights *= weight_decay + self.voxel_pcd.add( + points=points, features=features, rgb=rgb, weights=weights, obs_count=obs_count + ) + + def calculate_clip_and_st_embeddings_for_queries(self, queries): + if isinstance(queries, str): + queries = [queries] + if self.siglip: + inputs = self.preprocessor(text=queries, padding="max_length", return_tensors="pt") + for input in inputs: + inputs[input] = inputs[input].to(self.clip_model.device) + all_clip_tokens = self.clip_model.get_text_features(**inputs) + else: + text = clip.tokenize(queries).to(self.clip_model.device) + all_clip_tokens = self.clip_model.encode_text(text) + # text = clip.tokenize(queries).to(self.device) + # all_clip_tokens = self.clip_model.encode_text(text) + all_clip_tokens = F.normalize(all_clip_tokens, p=2, dim=-1) + return all_clip_tokens + + def find_alignment_over_model(self, queries): + clip_text_tokens = self.calculate_clip_and_st_embeddings_for_queries(queries) + points, features, weights, _ = self.voxel_pcd.get_pointcloud() + if points is None: + return None + features = F.normalize(features, p=2, dim=-1) + point_alignments = clip_text_tokens.float() @ features.float().T + + # print(point_alignments.shape) + return point_alignments + + def find_alignment_for_A(self, A): + points, features, _, _ = self.voxel_pcd.get_pointcloud() + alignments = self.find_alignment_over_model(A).cpu() + return 
points[alignments.argmax(dim=-1)].detach().cpu() + + def find_obs_id_for_A(self, A): + obs_counts = self.voxel_pcd._obs_counts + alignments = self.find_alignment_over_model(A).cpu() + return obs_counts[alignments.argmax(dim=-1)].detach().cpu() + + def compute_coord(self, text, obs_id, threshold=0.2): + # print(obs_id, len(self.voxel_map_wrapper.observations)) + if obs_id <= 0 or obs_id > len(self.voxel_map_wrapper.observations): + return None + rgb = self.voxel_map_wrapper.observations[obs_id - 1].rgb + pose = self.voxel_map_wrapper.observations[obs_id - 1].camera_pose + depth = self.voxel_map_wrapper.observations[obs_id - 1].depth + K = self.voxel_map_wrapper.observations[obs_id - 1].camera_K + xyzs = get_xyz(depth, pose, K)[0] + # rgb[depth >= 2.5] = 0 + rgb = rgb.permute(2, 0, 1).to(torch.uint8) + inputs = self.exist_processor( + text=[["a photo of a " + text]], images=rgb, return_tensors="pt" + ) + for input in inputs: + inputs[input] = inputs[input].to("cuda") + + with torch.no_grad(): + outputs = self.exist_model(**inputs) + + target_sizes = torch.Tensor([rgb.size()[-2:]]).to(self.device) + results = self.exist_processor.image_processor.post_process_object_detection( + outputs, threshold=threshold, target_sizes=target_sizes + )[0] + # if len(results['scores']) == 0: + # return None + # tl_x, tl_y, br_x, br_y = results['boxes'][torch.argmax(results['scores'])] + # w, h = depth.shape + # tl_x, tl_y, br_x, br_y = int(max(0, tl_x.item())), int(max(0, tl_y.item())), int(min(h, br_x.item())), int(min(w, br_y.item())) + # return torch.median(xyzs[tl_y: br_y, tl_x: br_x].reshape(-1, 3), dim = 0).values + for idx, (score, bbox) in enumerate( + sorted(zip(results["scores"], results["boxes"]), key=lambda x: x[0], reverse=True) + ): + + tl_x, tl_y, br_x, br_y = bbox + w, h = depth.shape + tl_x, tl_y, br_x, br_y = ( + int(max(0, tl_x.item())), + int(max(0, tl_y.item())), + int(min(h, br_x.item())), + int(min(w, br_y.item())), + ) + + if torch.median(depth[tl_y:br_y, tl_x:br_x].reshape(-1)) < 3: + return torch.median(xyzs[tl_y:br_y, tl_x:br_x].reshape(-1, 3), dim=0).values + return None + + def verify_point(self, A, point, distance_threshold=0.1, similarity_threshold=0.14): + if isinstance(point, np.ndarray): + point = torch.from_numpy(point) + points, _, _, _ = self.voxel_pcd.get_pointcloud() + distances = torch.linalg.norm(point - points.detach().cpu(), dim=-1) + if torch.min(distances) > distance_threshold: + print("Points are so far from other points!") + return False + alignments = self.find_alignment_over_model(A).detach().cpu()[0] + if torch.max(alignments[distances <= distance_threshold]) < similarity_threshold: + print("Points close the the point are not similar to the text!") + return torch.max(alignments[distances < distance_threshold]) >= similarity_threshold + + def localize_A(self, A, debug=True, return_debug=False): + # centroids, extends, similarity_max_list, points, obs_max_list, debug_text = self.find_clusters_for_A(A, return_obs_counts = True, debug = debug) + # if len(centroids) == 0: + # if not debug: + # return None + # else: + # return None, debug_text + # target_point = None + # obs = None + # similarity = None + # point = None + # for idx, (centroid, obs, similarity, point) in enumerate(sorted(zip(centroids, obs_max_list, similarity_max_list, points), key=lambda x: x[2], reverse=True)): + # res = self.compute_coord(A, obs) + # if res is not None: + # target_point = res + # debug_text += '#### - Object is detected in observations where instance' + str(idx + 1) + ' comes 
from. **😃** Directly navigate to it.\n' + # break + # if self.siglip: + # cosine_similarity_check = similarity > 0.14 + # else: + # cosine_similarity_check = similarity > 0.3 + # if cosine_similarity_check: + # target_point = centroid + + # debug_text += '#### - Instance ' + str(idx + 1) + ' has high cosine similarity (' + str(round(similarity, 3)) + '). **😃** Directly navigate to it.\n' + + # break + + # debug_text += '#### - Cannot verify whether this instance is the target. **😞** \n' + + # if target_point is None: + # debug_text += '#### - All instances are not the target! Maybe target object has not been observed yet. **😭**\n' + # if not debug: + # return target_point + # elif not return_debug: + # return target_point, debug_text + # else: + # return target_point, debug_text, obs, point + points, _, _, _ = self.voxel_pcd.get_pointcloud() + alignments = self.find_alignment_over_model(A).cpu() + # alignments = alignments[0, points[:, -1] >= 0.1] + # points = points[points[:, -1] >= 0.1] + point = points[alignments.argmax(dim=-1)].detach().cpu().squeeze() + obs_counts = self.voxel_pcd._obs_counts + image_id = obs_counts[alignments.argmax(dim=-1)].detach().cpu() + debug_text = "" + target_point = None + + res = self.compute_coord(A, image_id) + # res = None + if res is not None: + target_point = res + debug_text += ( + "#### - Object is detected in observations . **😃** Directly navigate to it.\n" + ) + else: + # debug_text += '#### - Directly ignore this instance is the target. **😞** \n' + if self.siglip: + cosine_similarity_check = alignments.max().item() > 0.14 + else: + cosine_similarity_check = alignments.max().item() > 0.3 + if cosine_similarity_check: + target_point = point + + debug_text += ( + "#### - The point has high cosine similarity. **😃** Directly navigate to it.\n" + ) + else: + debug_text += "#### - Cannot verify whether this instance is the target. **😞** \n" + # print('target_point', target_point) + if not debug: + return target_point + elif not return_debug: + return target_point, debug_text + else: + return target_point, debug_text, image_id, point + + def find_clusters_for_A(self, A, return_obs_counts=False, debug=False, turning_point=None): + + debug_text = "" + + points, features, _, _ = self.voxel_pcd.get_pointcloud() + alignments = self.find_alignment_over_model(A).cpu().reshape(-1).detach().numpy() + if turning_point is None: + if self.siglip: + turning_point = min(0.14, alignments[np.argsort(alignments)[-20]]) + else: + turning_point = min(0.3, alignments[np.argsort(alignments)[-20]]) + mask = alignments >= turning_point + alignments = alignments[mask] + points = points[mask] + if len(points) == 0: + + debug_text += ( + "### - No instance found! Maybe target object has not been observed yet. 
**😭**\n" + ) + + output = [[], [], [], []] + if return_obs_counts: + output.append([]) + if debug: + output.append(debug_text) + + return output + else: + if return_obs_counts: + obs_ids = self.voxel_pcd._obs_counts.detach().cpu().numpy()[mask] + centroids, extends, similarity_max_list, points, obs_max_list = find_clusters( + points.detach().cpu().numpy(), alignments, obs=obs_ids + ) + output = [centroids, extends, similarity_max_list, points, obs_max_list] + else: + centroids, extends, similarity_max_list, points = find_clusters( + points.detach().cpu().numpy(), alignments, obs=None + ) + output = [centroids, extends, similarity_max_list, points] + + debug_text += ( + "### - Found " + str(len(centroids)) + " instances that might be target object.\n" + ) + if debug: + output.append(debug_text) + + return output + + +def find_clusters(vertices: np.ndarray, similarity: np.ndarray, obs=None): + # Calculate the number of top values directly + top_positions = vertices + # top_values = probability_over_all_points[top_indices].flatten() + + # Apply DBSCAN clustering + dbscan = DBSCAN(eps=0.25, min_samples=5) + clusters = dbscan.fit(top_positions) + labels = clusters.labels_ + + # Initialize empty lists to store centroids and extends of each cluster + centroids = [] + extends = [] + similarity_list = [] + points = [] + obs_max_list = [] + + for cluster_id in set(labels): + if cluster_id == -1: # Ignore noise + continue + + members = top_positions[labels == cluster_id] + centroid = np.mean(members, axis=0) + + similarity_values = similarity[labels == cluster_id] + simiarity = np.max(similarity_values) + + if obs is not None: + obs_values = obs[labels == cluster_id] + obs_max = np.max(obs_values) + + sx = np.max(members[:, 0]) - np.min(members[:, 0]) + sy = np.max(members[:, 1]) - np.min(members[:, 1]) + sz = np.max(members[:, 2]) - np.min(members[:, 2]) + + # Append centroid and extends to the lists + centroids.append(centroid) + extends.append((sx, sy, sz)) + similarity_list.append(simiarity) + points.append(members) + if obs is not None: + obs_max_list.append(obs_max) + + if obs is not None: + return centroids, extends, similarity_list, points, obs_max_list + else: + return centroids, extends, similarity_list, points diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py new file mode 100644 index 000000000..433e4b899 --- /dev/null +++ b/src/stretch/dynav/voxel_map_server.py @@ -0,0 +1,917 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
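+"""Voxel map "server" for DynaMem-style navigation.
+
+ImageProcessor ingests RGB-D frames (in-process or over ZMQ), maintains both an
+obstacle voxel map and a semantic point cloud of CLIP/SigLIP features, and turns
+text queries into trajectories by localizing the query in the semantic map and
+planning towards it with A*.
+"""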
+ +import datetime +import os +import pickle +import threading + +# import wget +import time +from io import BytesIO +from pathlib import Path + +import clip +import cv2 +import numpy as np +import rerun as rr +import scipy +import torch + +# from stretch.utils.morphology import get_edges +import torch.nn.functional as F +import wget +from matplotlib import pyplot as plt +from PIL import Image +from sam2.build_sam import build_sam2 +from sam2.sam2_image_predictor import SAM2ImagePredictor +from torchvision import transforms +from transformers import AutoModel, AutoProcessor + +# from segment_anything import sam_model_registry, SamPredictor +# from transformers import AutoProcessor, OwlViTForObjectDetection +from ultralytics import YOLOWorld + +from stretch.core import get_parameters +from stretch.dynav.communication_util import load_socket, recv_everything +from stretch.dynav.mapping_utils.a_star import AStar +from stretch.dynav.mapping_utils.voxel import SparseVoxelMap +from stretch.dynav.mapping_utils.voxel_map import SparseVoxelMapNavigationSpace +from stretch.dynav.scannet import CLASS_LABELS_200 +from stretch.dynav.voxel_map_localizer import VoxelMapLocalizer + + +def get_inv_intrinsics(intrinsics): + # return intrinsics.double().inverse().to(intrinsics) + fx, fy, ppx, ppy = ( + intrinsics[..., 0, 0], + intrinsics[..., 1, 1], + intrinsics[..., 0, 2], + intrinsics[..., 1, 2], + ) + inv_intrinsics = torch.zeros_like(intrinsics) + inv_intrinsics[..., 0, 0] = 1.0 / fx + inv_intrinsics[..., 1, 1] = 1.0 / fy + inv_intrinsics[..., 0, 2] = -ppx / fx + inv_intrinsics[..., 1, 2] = -ppy / fy + inv_intrinsics[..., 2, 2] = 1.0 + return inv_intrinsics + + +def get_xyz(depth, pose, intrinsics): + """Returns the XYZ coordinates for a set of points. + + Args: + depth: The depth array, with shape (B, 1, H, W) + pose: The pose array, with shape (B, 4, 4) + intrinsics: The intrinsics array, with shape (B, 3, 3) + + Returns: + The XYZ coordinates of the projected points, with shape (B, H, W, 3) + """ + if not isinstance(depth, torch.Tensor): + depth = torch.from_numpy(depth) + if not isinstance(pose, torch.Tensor): + pose = torch.from_numpy(pose) + if not isinstance(intrinsics, torch.Tensor): + intrinsics = torch.from_numpy(intrinsics) + while depth.ndim < 4: + depth = depth.unsqueeze(0) + while pose.ndim < 3: + pose = pose.unsqueeze(0) + while intrinsics.ndim < 3: + intrinsics = intrinsics.unsqueeze(0) + (bsz, _, height, width), device, dtype = depth.shape, depth.device, intrinsics.dtype + + # Gets the pixel grid. + xs, ys = torch.meshgrid( + torch.arange(0, width, device=device, dtype=dtype), + torch.arange(0, height, device=device, dtype=dtype), + indexing="xy", + ) + xy = torch.stack([xs, ys], dim=-1).flatten(0, 1).unsqueeze(0).repeat_interleave(bsz, 0) + xyz = torch.cat((xy, torch.ones_like(xy[..., :1])), dim=-1) + + # Applies intrinsics and extrinsics. 
+ # xyz = xyz @ intrinsics.inverse().transpose(-1, -2) + xyz = xyz @ get_inv_intrinsics(intrinsics).transpose(-1, -2) + xyz = xyz * depth.flatten(1).unsqueeze(-1) + xyz = (xyz[..., None, :] * pose[..., None, :3, :3]).sum(dim=-1) + pose[..., None, :3, 3] + + xyz = xyz.unflatten(1, (height, width)) + + return xyz + + +class ImageProcessor: + def __init__( + self, + vision_method="mask*lip", + siglip=True, + device="cuda", + min_depth=0.25, + max_depth=2.5, + img_port=5558, + text_port=5556, + open_communication=True, + rerun=True, + static=True, + log=None, + image_shape=(400, 300), + ): + self.static = static + self.siglip = siglip + current_datetime = datetime.datetime.now() + if log is None: + self.log = "debug_" + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + else: + self.log = log + self.rerun = rerun + if self.rerun: + if self.static: + rr.init(self.log, spawn=False) + rr.connect("100.108.67.79:9876") + else: + rr.init(self.log, spawn=True) + self.min_depth = min_depth + self.max_depth = max_depth + self.obs_count = 0 + # There are three vision methods: + # 1. 'mask*lip' Following the idea of https://arxiv.org/abs/2112.01071, remove the last layer of any VLM and obtain the dense features + # 2. 'mask&*lip' Following the idea of https://mahis.life/clip-fields/, extract segmentation mask and assign a vision-language feature to it + self.vision_method = vision_method + # If cuda is not available, then device will be forced to be cpu + if not torch.cuda.is_available(): + device = "cpu" + self.device = device + + self.create_obstacle_map() + self.create_vision_model() + + self.voxel_map_lock = ( + threading.Lock() + ) # Create a lock for synchronizing access to `self.voxel_map_localizer` + + self.traj = None + self.image_shape = image_shape + + if open_communication: + self.img_socket = load_socket(img_port) + self.text_socket = load_socket(text_port) + + self.img_thread = threading.Thread(target=self._recv_image) + self.img_thread.daemon = True + self.img_thread.start() + + def create_obstacle_map(self): + print("- Load parameters") + parameters = get_parameters("dynav_config.yaml") + self.default_expand_frontier_size = parameters["default_expand_frontier_size"] + self.voxel_map = SparseVoxelMap( + resolution=parameters["voxel_size"], + local_radius=parameters["local_radius"], + obs_min_height=parameters["obs_min_height"], + obs_max_height=parameters["obs_max_height"], + obs_min_density=parameters["obs_min_density"], + exp_min_density=parameters["exp_min_density"], + min_depth=self.min_depth, + max_depth=self.max_depth, + pad_obstacles=parameters["pad_obstacles"], + add_local_radius_points=parameters.get("add_local_radius_points", default=True), + remove_visited_from_obstacles=parameters.get( + "remove_visited_from_obstacles", default=False + ), + smooth_kernel_size=parameters.get("filters/smooth_kernel_size", -1), + use_median_filter=parameters.get("filters/use_median_filter", False), + median_filter_size=parameters.get("filters/median_filter_size", 5), + median_filter_max_error=parameters.get("filters/median_filter_max_error", 0.01), + use_derivative_filter=parameters.get("filters/use_derivative_filter", False), + derivative_filter_threshold=parameters.get("filters/derivative_filter_threshold", 0.5), + ) + self.space = SparseVoxelMapNavigationSpace( + self.voxel_map, + # step_size=parameters["step_size"], + rotation_step_size=parameters["rotation_step_size"], + dilate_frontier_size=parameters[ + "dilate_frontier_size" + ], # 0.6 meters back from every edge = 12 * 0.02 = 0.24 + 
dilate_obstacle_size=parameters["dilate_obstacle_size"], + ) + + # Create a simple motion planner + self.planner = AStar(self.space) + self.value_map = torch.zeros(self.voxel_map.grid_size) + + def create_vision_model(self): + if not self.siglip: + self.clip_model, self.clip_preprocess = clip.load("ViT-B/16", device=self.device) + self.clip_model.eval() + else: + self.clip_model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384").to( + self.device + ) + self.clip_preprocess = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384") + self.clip_model.eval() + if self.vision_method != "mask*lip": + sam_checkpoint = f"./sam2_hiera_small.pt" + sam_config = "sam2_hiera_s.yaml" + if not os.path.exists(sam_checkpoint): + wget.download( + "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt", + out=sam_checkpoint, + ) + sam2_model = build_sam2( + sam_config, sam_checkpoint, device=self.device, apply_postprocessing=False + ) + self.mask_predictor = SAM2ImagePredictor(sam2_model) + self.yolo_model = YOLOWorld("yolov8s-worldv2.pt") + self.texts = CLASS_LABELS_200 + self.yolo_model.set_classes(self.texts) + self.voxel_map_localizer = VoxelMapLocalizer( + self.voxel_map, + clip_model=self.clip_model, + processor=self.clip_preprocess, + device=self.device, + siglip=self.siglip, + ) + + def process_text(self, text, start_pose): + """ + Process the text query and return the trajectory for the robot to follow. + """ + + if self.rerun: + rr.log("/object", rr.Clear(recursive=True), static=self.static) + rr.log("/robot_start_pose", rr.Clear(recursive=True), static=self.static) + rr.log("/direction", rr.Clear(recursive=True), static=self.static) + rr.log("robot_monologue", rr.Clear(recursive=True), static=self.static) + rr.log( + "/Past_observation_most_similar_to_text", + rr.Clear(recursive=True), + static=self.static, + ) + if not self.static: + rr.connect("100.108.67.79:9876") + debug_text = "" + mode = "navigation" + obs = None + localized_point = None + waypoints = None + + if text is not None and text != "" and self.traj is not None: + print("saved traj", self.traj) + traj_target_point = self.traj[-1] + if self.voxel_map_localizer.verify_point(text, traj_target_point): + localized_point = traj_target_point + debug_text += "## Last visual grounding results looks fine so directly use it.\n" + + if waypoints is None: + # Do visual grounding + if text is not None and text != "" and localized_point is None: + with self.voxel_map_lock: + ( + localized_point, + debug_text, + obs, + pointcloud, + ) = self.voxel_map_localizer.localize_A(text, debug=True, return_debug=True) + if localized_point is not None: + rr.log( + "/object", + rr.Points3D( + [localized_point[0], localized_point[1], 1.5], + colors=torch.Tensor([1, 0, 0]), + radii=0.1, + ), + static=self.static, + ) + # Do Frontier based exploration + if text is None or text == "" or localized_point is None: + debug_text += "## Navigation fails, so robot starts exploring environments.\n" + localized_point = self.sample_frontier(start_pose, text) + mode = "exploration" + rr.log( + "/object", + rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), + static=self.static, + ) + print("\n", localized_point, "\n") + + if localized_point is None: + return [] + + if len(localized_point) == 2: + localized_point = np.array([localized_point[0], localized_point[1], 0]) + + point = self.sample_navigation(start_pose, localized_point) + + if self.rerun: + buf = BytesIO() + plt.savefig(buf, format="png") + buf.seek(0) + 
img = Image.open(buf) + img = np.array(img) + buf.close() + rr.log("2d_map", rr.Image(img), static=self.static) + else: + if text != "": + plt.savefig(self.log + "/debug_" + text + str(self.obs_count) + ".png") + else: + plt.savefig(self.log + "/debug_exploration" + str(self.obs_count) + ".png") + plt.clf() + + if obs is not None and mode == "navigation": + rgb = self.voxel_map.observations[obs - 1].rgb + if not self.rerun: + if isinstance(rgb, torch.Tensor): + rgb = np.array(rgb) + cv2.imwrite(self.log + "/debug_" + text + ".png", rgb[:, :, [2, 1, 0]]) + else: + rr.log( + "/Past_observation_most_similar_to_text", rr.Image(rgb), static=self.static + ) + waypoints = None + + if point is None: + print("Unable to find any target point, some exception might happen") + else: + print("Target point is", point) + res = self.planner.plan(start_pose, point) + if res.success: + waypoints = [pt.state for pt in res.trajectory] + else: + waypoints = None + print("[FAILURE]", res.reason) + # If we are navigating to some object of interest, send (x, y, z) of + # the object so that we can make sure the robot looks at the object after navigation + traj = [] + if waypoints is not None: + finished = len(waypoints) <= 10 and mode == "navigation" + if finished: + self.traj = None + else: + self.traj = waypoints[8:] + [[np.nan, np.nan, np.nan], localized_point] + if not finished: + waypoints = waypoints[:8] + traj = self.planner.clean_path_for_xy(waypoints) + if finished: + traj.append([np.nan, np.nan, np.nan]) + if isinstance(localized_point, torch.Tensor): + localized_point = localized_point.tolist() + traj.append(localized_point) + print("Planned trajectory:", traj) + + if self.rerun: + if text is not None and text != "": + debug_text = "### The goal is to navigate to " + text + ".\n" + debug_text + else: + debug_text = "### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n" + debug_text = "# Robot's monologue: \n" + debug_text + rr.log( + "robot_monologue", + rr.TextDocument(debug_text, media_type=rr.MediaType.MARKDOWN), + static=self.static, + ) + + if traj is not None: + origins = [] + vectors = [] + for idx in range(len(traj)): + if idx != len(traj) - 1: + origins.append([traj[idx][0], traj[idx][1], 1.5]) + vectors.append( + [traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0] + ) + rr.log( + "/direction", + rr.Arrows3D( + origins=origins, vectors=vectors, colors=torch.Tensor([0, 1, 0]), radii=0.1 + ), + static=self.static, + ) + rr.log( + "/robot_start_pose", + rr.Points3D( + [start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1 + ), + static=self.static, + ) + + # self.write_to_pickle() + return traj + + def sample_navigation(self, start, point, mode="navigation"): + # plt.clf() + obstacles, _ = self.voxel_map.get_2d_map() + plt.imshow(obstacles) + if point is None: + start_pt = self.planner.to_pt(start) + # plt.scatter(start_pt[1], start_pt[0], s = 10) + return None + goal = self.space.sample_target_point( + start, point, self.planner, exploration=mode != "navigation" + ) + print("point:", point, "goal:", goal) + obstacles, explored = self.voxel_map.get_2d_map() + start_pt = self.planner.to_pt(start) + plt.scatter(start_pt[1], start_pt[0], s=15, c="b") + point_pt = self.planner.to_pt(point) + plt.scatter(point_pt[1], point_pt[0], s=15, c="r") + if goal is not None: + goal_pt = self.planner.to_pt(goal) + plt.scatter(goal_pt[1], goal_pt[0], s=10, c="g") + return goal + + def 
sample_frontier(self, start_pose=[0, 0, 0], text=None): + with self.voxel_map_lock: + if text is not None and text != "": + ( + index, + time_heuristics, + alignments_heuristics, + total_heuristics, + ) = self.space.sample_exploration( + start_pose, self.planner, self.voxel_map_localizer, text, debug=False + ) + else: + index, time_heuristics, _, total_heuristics = self.space.sample_exploration( + start_pose, self.planner, None, None, debug=False + ) + alignments_heuristics = time_heuristics + + obstacles, explored = self.voxel_map.get_2d_map() + # plt.clf() + # plt.imshow(obstacles * 0.5 + explored * 0.5) + # plt.scatter(index[1], index[0], s = 20, c = 'r') + return self.voxel_map.grid_coords_to_xyt(torch.tensor([index[0], index[1]])) + + def _recv_image(self): + while True: + # data = recv_array(self.img_socket) + rgb, depth, intrinsics, pose = recv_everything(self.img_socket) + print("Image received") + start_time = time.time() + self.process_rgbd_images(rgb, depth, intrinsics, pose) + process_time = time.time() - start_time + print("Image processing takes", process_time, "seconds") + print("processing took " + str(process_time) + " seconds") + + def forward_one_block(self, resblocks, x): + q, k, v = None, None, None + y = resblocks.ln_1(x) + y = F.linear(y, resblocks.attn.in_proj_weight, resblocks.attn.in_proj_bias) + N, L, C = y.shape + y = y.view(N, L, 3, C // 3).permute(2, 0, 1, 3).reshape(3 * N, L, C // 3) + y = F.linear(y, resblocks.attn.out_proj.weight, resblocks.attn.out_proj.bias) + q, k, v = y.tensor_split(3, dim=0) + v += x + v = v + resblocks.mlp(resblocks.ln_2(v)) + + return v + + def forward_one_block_siglip(self, resblocks, x): + q, k, v = None, None, None + x = F.linear(x, resblocks.in_proj_weight, resblocks.in_proj_bias) + N, L, C = x.shape + x = x.view(N, L, 3, C // 3).permute(2, 0, 1, 3).reshape(3 * N, L, C // 3) + x = F.linear(x, resblocks.out_proj.weight, resblocks.out_proj.bias) + q, k, v = x.tensor_split(3, dim=0) + + return v + + def extract_mask_clip_features(self, x, image_shape): + if self.siglip: + with torch.no_grad(): + output = self.clip_model.vision_model(x["pixel_values"], output_hidden_states=True) + feat = output.last_hidden_state + feat = self.forward_one_block_siglip(self.clip_model.vision_model.head.attention, feat) + feat = self.clip_model.vision_model.head.layernorm(feat) + feat = feat + self.clip_model.vision_model.head.mlp(feat) + feat = feat.detach().cpu() + with torch.no_grad(): + N, L, H, W = self.clip_model.vision_model.embeddings.patch_embedding( + x["pixel_values"] + ).shape + feat = feat.reshape(N, H, W, L).permute(0, 3, 1, 2) + else: + with torch.no_grad(): + x = self.clip_model.visual.conv1(x) + N, L, H, W = x.shape + x = x.reshape(x.shape[0], x.shape[1], -1) + x = x.permute(0, 2, 1) + x = torch.cat( + [ + self.clip_model.visual.class_embedding.to(x.dtype) + + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), + x, + ], + dim=1, + ) + x = x + self.clip_model.visual.positional_embedding.to(x.dtype) + x = self.clip_model.visual.ln_pre(x) + x = x.permute(1, 0, 2) + for idx in range(self.clip_model.visual.transformer.layers): + if idx == self.clip_model.visual.transformer.layers - 1: + break + x = self.clip_model.visual.transformer.resblocks[idx](x) + x = self.forward_one_block(self.clip_model.visual.transformer.resblocks[-1], x) + x = x[1:] + x = x.permute(1, 0, 2) + x = self.clip_model.visual.ln_post(x) + x = x @ self.clip_model.visual.proj + feat = x.reshape(N, H, W, -1).permute(0, 3, 1, 2) + feat = 
F.interpolate(feat, image_shape, mode="bilinear", align_corners=True) + feat = F.normalize(feat, dim=1) + return feat.permute(0, 2, 3, 1) + + def run_mask_clip(self, rgb, mask, world_xyz): + + with torch.no_grad(): + results = self.yolo_model.predict( + rgb.permute(1, 2, 0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False + ) + xyxy_tensor = results[0].boxes.xyxy + if len(xyxy_tensor) == 0: + return + bounding_boxes = torch.stack( + sorted( + xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse=True + ), + dim=0, + ) + bbox_mask = torch.zeros_like(mask) + for box in bounding_boxes: + tl_x, tl_y, br_x, br_y = box + bbox_mask[ + max(int(tl_y), 0) : min(int(br_y), rgb.shape[1]), + max(int(tl_x), 0) : min(int(br_x), rgb.shape[2]), + ] = 1 + bbox_mask = bbox_mask.bool() + # print(mask, bbox_mask) + mask = torch.logical_or(mask, ~bbox_mask) + + if not self.siglip: + if self.device == "cpu": + input = ( + self.clip_preprocess(transforms.ToPILImage()(rgb)) + .unsqueeze(0) + .to(self.device) + ) + else: + input = ( + self.clip_preprocess(transforms.ToPILImage()(rgb)) + .unsqueeze(0) + .to(self.device) + .half() + ) + else: + input = self.clip_preprocess(images=rgb, padding="max_length", return_tensors="pt") + for i in input: + input[i] = input[i].to(self.device) + features = self.extract_mask_clip_features(input, rgb.shape[-2:])[0].cpu() + + valid_xyz = world_xyz[~mask] + features = features[~mask] + valid_rgb = rgb.permute(1, 2, 0)[~mask] + if len(valid_xyz) != 0: + self.add_to_voxel_pcd(valid_xyz, features, valid_rgb) + + def run_owl_sam_clip(self, rgb, mask, world_xyz): + with torch.no_grad(): + results = self.yolo_model.predict( + rgb.permute(1, 2, 0)[:, :, [2, 1, 0]].numpy(), conf=0.1, verbose=False + ) + xyxy_tensor = results[0].boxes.xyxy + if len(xyxy_tensor) == 0: + return + + self.mask_predictor.set_image(rgb.permute(1, 2, 0).numpy()) + # bounding_boxes = torch.stack(sorted(results[0]['boxes'], key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse = True), dim = 0) + bounding_boxes = torch.stack( + sorted( + xyxy_tensor, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]), reverse=True + ), + dim=0, + ) + # transformed_boxes = self.mask_predictor.transform.apply_boxes_torch(bounding_boxes.detach().to(self.device), rgb.shape[-2:]) + # masks, _, _= self.mask_predictor.predict_torch( + masks, _, _ = self.mask_predictor.predict( + point_coords=None, + point_labels=None, + # boxes=transformed_boxes, + box=bounding_boxes, + multimask_output=False, + ) + if masks.ndim == 3: + masks = torch.Tensor(masks).bool() + else: + masks = torch.Tensor(masks[:, 0, :, :]).bool() + + # Debug code, visualize all bounding boxes and segmentation masks + + image_vis = np.ascontiguousarray(np.array(rgb.permute(1, 2, 0)).astype(np.uint8)) + segmentation_color_map = np.zeros(image_vis.shape, dtype=np.uint8) + # cv2.imwrite('clean_' + str(self.obs_count) + '.jpg', cv2.cvtColor(image_vis, cv2.COLOR_RGB2BGR)) + for idx, box in enumerate(bounding_boxes): + tl_x, tl_y, br_x, br_y = box + tl_x, tl_y, br_x, br_y = tl_x.item(), tl_y.item(), br_x.item(), br_y.item() + cv2.rectangle( + image_vis, (int(tl_x), int(tl_y)), (int(br_x), int(br_y)), (255, 0, 0), 2 + ) + image_vis = cv2.cvtColor(image_vis, cv2.COLOR_RGB2BGR) + for vis_mask in masks: + segmentation_color_map[vis_mask.detach().cpu().numpy()] = [0, 255, 0] + image_vis = cv2.addWeighted(image_vis, 0.7, segmentation_color_map, 0.3, 0) + if not self.rerun: + cv2.imwrite(self.log + "/seg" + str(self.obs_count) + ".jpg", image_vis) + # 
else: + # rr.log('Segmentation mask', rr.Image(image_vis[:, :, [2, 1, 0]])) + + crops = [] + if not self.siglip: + for (box, mask) in zip(bounding_boxes, masks): + tl_x, tl_y, br_x, br_y = box + crops.append( + self.clip_preprocess( + transforms.ToPILImage()( + rgb[ + :, + max(int(tl_y), 0) : min(int(br_y), rgb.shape[1]), + max(int(tl_x), 0) : min(int(br_x), rgb.shape[2]), + ] + ) + ) + ) + features = self.clip_model.encode_image(torch.stack(crops, dim=0).to(self.device)) + else: + for box in bounding_boxes: + tl_x, tl_y, br_x, br_y = box + crops.append( + rgb[ + :, + max(int(tl_y), 0) : min(int(br_y), rgb.shape[1]), + max(int(tl_x), 0) : min(int(br_x), rgb.shape[2]), + ] + ) + inputs = self.clip_preprocess( + images=crops, padding="max_length", return_tensors="pt" + ).to(self.device) + features = self.clip_model.get_image_features(**inputs) + features = F.normalize(features, dim=-1).cpu() + + for idx, (sam_mask, feature) in enumerate(zip(masks.cpu(), features.cpu())): + valid_mask = torch.logical_and(~mask, sam_mask) + valid_xyz = world_xyz[valid_mask] + if valid_xyz.shape[0] == 0: + continue + feature = feature.repeat(valid_xyz.shape[0], 1) + valid_rgb = rgb.permute(1, 2, 0)[valid_mask] + self.add_to_voxel_pcd(valid_xyz, feature, valid_rgb) + + def add_to_voxel_pcd(self, valid_xyz, feature, valid_rgb, weights=None, threshold=0.95): + # Adding all points to voxelizedPointCloud is useless and expensive, we should exclude threshold of all points + selected_indices = torch.randperm(len(valid_xyz))[: int((1 - threshold) * len(valid_xyz))] + if len(selected_indices) == 0: + return + if valid_xyz is not None: + valid_xyz = valid_xyz[selected_indices] + if feature is not None: + feature = feature[selected_indices] + if valid_rgb is not None: + valid_rgb = valid_rgb[selected_indices] + if weights is not None: + weights = weights[selected_indices] + with self.voxel_map_lock: + self.voxel_map_localizer.add( + points=valid_xyz, + features=feature, + rgb=valid_rgb, + weights=weights, + obs_count=self.obs_count, + ) + + def process_rgbd_images(self, rgb, depth, intrinsics, pose): + # import time + # start_time = time.time() + if not os.path.exists(self.log): + os.mkdir(self.log) + self.obs_count += 1 + + cv2.imwrite(self.log + "/rgb" + str(self.obs_count) + ".jpg", rgb[:, :, [2, 1, 0]]) + np.save(self.log + "/rgb" + str(self.obs_count) + ".npy", rgb) + np.save(self.log + "/depth" + str(self.obs_count) + ".npy", depth) + np.save(self.log + "/intrinsics" + str(self.obs_count) + ".npy", intrinsics) + np.save(self.log + "/pose" + str(self.obs_count) + ".npy", pose) + + rgb, depth = torch.from_numpy(rgb), torch.from_numpy(depth) + rgb = rgb.permute(2, 0, 1).to(torch.uint8) + + if self.image_shape is not None: + h, w = self.image_shape + h_image, w_image = depth.shape + rgb = F.interpolate( + rgb.unsqueeze(0), size=self.image_shape, mode="bilinear", align_corners=False + ).squeeze() + depth = F.interpolate( + depth.unsqueeze(0).unsqueeze(0), + size=self.image_shape, + mode="bilinear", + align_corners=False, + ).squeeze() + intrinsics[0, 0] *= w / w_image + intrinsics[1, 1] *= h / h_image + intrinsics[0, 2] *= w / w_image + intrinsics[1, 2] *= h / h_image + + world_xyz = get_xyz(depth, pose, intrinsics).squeeze() + + median_depth = torch.from_numpy(scipy.ndimage.median_filter(depth, size=5)) + median_filter_error = (depth - median_depth).abs() + # edges = get_edges(depth, threshold=0.5) + valid_depth = torch.logical_and(depth < self.max_depth, depth > self.min_depth) + valid_depth = ( + valid_depth + & 
(median_filter_error < 0.01).bool() + # & ~edges + ) + + with self.voxel_map_lock: + if self.vision_method == "mask&*lip": + min_samples_clear = 3 + else: + min_samples_clear = 10 + self.voxel_map_localizer.voxel_pcd.clear_points( + depth, + torch.from_numpy(intrinsics), + torch.from_numpy(pose), + min_samples_clear=min_samples_clear, + ) + self.voxel_map.voxel_pcd.clear_points( + depth, torch.from_numpy(intrinsics), torch.from_numpy(pose) + ) + + if self.vision_method == "mask&*lip": + self.run_owl_sam_clip(rgb, ~valid_depth, world_xyz) + else: + self.run_mask_clip(rgb, ~valid_depth, world_xyz) + + self.voxel_map.add( + camera_pose=torch.Tensor(pose), + rgb=torch.Tensor(rgb).permute(1, 2, 0), + depth=torch.Tensor(depth), + camera_K=torch.Tensor(intrinsics), + ) + obs, exp = self.voxel_map.get_2d_map() + if self.rerun: + # if not self.static: + # rr.set_time_sequence("frame", self.obs_count) + # rr.log('robot_pov', rr.Image(rgb.permute(1, 2, 0)), static = self.static) + if self.voxel_map.voxel_pcd._points is not None: + rr.log( + "Obstalce_map/pointcloud", + rr.Points3D( + self.voxel_map.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) + if self.voxel_map_localizer.voxel_pcd._points is not None: + rr.log( + "Semantic_memory/pointcloud", + rr.Points3D( + self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), + colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255.0, + radii=0.03, + ), + static=self.static, + ) + # rr.log("Obstalce_map/2D_obs_map", rr.Image(obs.int() * 127 + exp.int() * 127)) + else: + cv2.imwrite( + self.log + "/debug_" + str(self.obs_count) + ".jpg", + np.asarray(obs.int() * 127 + exp.int() * 127), + ) + # end_time = time.time() + # print('image processing takes ' + str(end_time - start_time) + ' seconds.') + + def read_from_pickle(self, pickle_file_name, num_frames: int = -1): + print("Reading from ", pickle_file_name) + rr.init("Debug", spawn=True) + if isinstance(pickle_file_name, str): + pickle_file_name = Path(pickle_file_name) + assert pickle_file_name.exists(), f"No file found at {pickle_file_name}" + with pickle_file_name.open("rb") as f: + data = pickle.load(f) + for i, (camera_pose, xyz, rgb, feats, depth, base_pose, K, world_xyz,) in enumerate( + zip( + data["camera_poses"], + data["xyz"], + data["rgb"], + data["feats"], + data["depth"], + data["base_poses"], + data["camera_K"], + data["world_xyz"], + ) + ): + # Handle the case where we dont actually want to load everything + if num_frames > 0 and i >= num_frames: + break + + camera_pose = self.voxel_map.fix_data_type(camera_pose) + xyz = self.voxel_map.fix_data_type(xyz) + rgb = self.voxel_map.fix_data_type(rgb) + depth = self.voxel_map.fix_data_type(depth) + intrinsics = self.voxel_map.fix_data_type(K) + if feats is not None: + feats = self.voxel_map.fix_data_type(feats) + base_pose = self.voxel_map.fix_data_type(base_pose) + self.voxel_map.voxel_pcd.clear_points(depth, intrinsics, camera_pose) + self.voxel_map.add( + camera_pose=camera_pose, + xyz=xyz, + rgb=rgb, + feats=feats, + depth=depth, + base_pose=base_pose, + camera_K=K, + ) + # points = self.voxel_map.voxel_pcd._points.detach().cpu() + # rgb = self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255. 
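As a self-contained illustration of the depth gating above, the sketch below applies the same median-filter consistency check to a synthetic depth image. The 0.25 m and 2.5 m limits mirror min_depth and max_depth from dynav_config.yaml; the image size and noise level are arbitrary.

import scipy.ndimage
import torch

depth = 1.5 + 0.002 * torch.randn(480, 640)  # nearly flat synthetic depth (meters)
median = torch.from_numpy(scipy.ndimage.median_filter(depth.numpy(), size=5))
median_error = (depth - median).abs()

# Keep pixels that are in range and locally consistent with their neighborhood
valid_depth = (depth > 0.25) & (depth < 2.5) & (median_error < 0.01)
print(f"{valid_depth.sum().item()} / {valid_depth.numel()} pixels kept")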
+ # rr.log("Obstalce_map/pointcloud", rr.Points3D(points[points[:, 2] > 0.1], \ + # colors=rgb[points[:, 2] > 0.1], radii=0.03)) + # plt.clf() + # obstacles, _ = self.voxel_map.get_2d_map() + # plt.imshow(obstacles) + # grid_coord = self.voxel_map.xy_to_grid_coords(camera_pose[:2, 3]) + # plt.scatter(grid_coord[0], grid_coord[1], color = 'r') + # buf = BytesIO() + # plt.savefig(buf, format='png') + # buf.seek(0) + # img = Image.open(buf) + # img = np.array(img) + # buf.close() + # rr.log('2d_map', rr.Image(img)) + # rr.log('robot_point', rr.Points3D(camera_pose[:3, 3], colors=[1, 0, 0], radii=0.2)) + + self.obs_count += 1 + self.voxel_map_localizer.voxel_pcd._points = data["combined_xyz"] + self.voxel_map_localizer.voxel_pcd._features = data["combined_feats"] + self.voxel_map_localizer.voxel_pcd._weights = data["combined_weights"] + self.voxel_map_localizer.voxel_pcd._rgb = data["combined_rgb"] + self.voxel_map_localizer.voxel_pcd._obs_counts = data["obs_id"] + self.voxel_map_localizer.voxel_pcd._entity_ids = data["entity_id"] + self.voxel_map_localizer.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() + self.voxel_map.voxel_pcd.obs_count = max( + self.voxel_map_localizer.voxel_pcd._obs_counts + ).item() + + def write_to_pickle(self): + """Write out to a pickle file. This is a rough, quick-and-easy output for debugging, not intended to replace the scalable data writer in data_tools for bigger efforts.""" + if not os.path.exists("debug"): + os.mkdir("debug") + filename = "debug/" + self.log + ".pkl" + data = {} + data["camera_poses"] = [] + data["camera_K"] = [] + data["base_poses"] = [] + data["xyz"] = [] + data["world_xyz"] = [] + data["rgb"] = [] + data["depth"] = [] + data["feats"] = [] + for frame in self.voxel_map.observations: + # add it to pickle + # TODO: switch to using just Obs struct? 
+ data["camera_poses"].append(frame.camera_pose) + data["base_poses"].append(frame.base_pose) + data["camera_K"].append(frame.camera_K) + data["xyz"].append(frame.xyz) + data["world_xyz"].append(frame.full_world_xyz) + data["rgb"].append(frame.rgb) + data["depth"].append(frame.depth) + data["feats"].append(frame.feats) + for k, v in frame.info.items(): + if k not in data: + data[k] = [] + data[k].append(v) + ( + data["combined_xyz"], + data["combined_feats"], + data["combined_weights"], + data["combined_rgb"], + ) = self.voxel_map_localizer.voxel_pcd.get_pointcloud() + data["obs_id"] = self.voxel_map_localizer.voxel_pcd._obs_counts + data["entity_id"] = self.voxel_map_localizer.voxel_pcd._entity_ids + with open(filename, "wb") as f: + pickle.dump(data, f) + print("write all data to", filename) diff --git a/src/stretch/motion/base/ik_solver_base.py b/src/stretch/motion/base/ik_solver_base.py index cea88080e..40d6ade69 100644 --- a/src/stretch/motion/base/ik_solver_base.py +++ b/src/stretch/motion/base/ik_solver_base.py @@ -31,7 +31,9 @@ def get_num_controllable_joints(self) -> int: """returns number of controllable joints under this solver's purview""" raise NotImplementedError() - def compute_fk(self, q) -> Tuple[np.ndarray, np.ndarray]: + def compute_fk( + self, q, link_name=None, ignore_missing_joints=False + ) -> Tuple[np.ndarray, np.ndarray]: """given joint values return end-effector position and quaternion associated with it""" raise NotImplementedError() diff --git a/src/stretch/motion/constants.py b/src/stretch/motion/constants.py index 7cd1403d5..cd057a882 100644 --- a/src/stretch/motion/constants.py +++ b/src/stretch/motion/constants.py @@ -15,7 +15,8 @@ # Stretch stuff PLANNER_STRETCH_URDF = get_full_config_path("urdf/planner_calibrated.urdf") -MANIP_STRETCH_URDF = get_full_config_path("urdf/stretch_manip_mode.urdf") +# MANIP_STRETCH_URDF = get_full_config_path("urdf/stretch_manip_mode.urdf") +MANIP_STRETCH_URDF = get_full_config_path("urdf/stretch.urdf") # This is the gripper, and the distance in the gripper frame to where the fingers will roughly meet STRETCH_GRASP_FRAME = "link_grasp_center" diff --git a/src/stretch/motion/kinematics.py b/src/stretch/motion/kinematics.py index fdd560e1f..fe1391d1a 100644 --- a/src/stretch/motion/kinematics.py +++ b/src/stretch/motion/kinematics.py @@ -134,7 +134,8 @@ class HelloStretchKinematics: default_ee_link_name = "link_grasp_center" default_manip_mode_controlled_joints = [ - "base_x_joint", + # "base_x_joint", + "joint_fake", "joint_lift", "joint_arm_l3", "joint_arm_l2", @@ -420,14 +421,14 @@ def interpolate(self, q0, qg, step=None, xy_tol=0.05, theta_tol=0.01): for qi, ai in self.interpolate_arm(qi, qg, step): yield qi, ai - def manip_fk(self, q: np.ndarray = None) -> Tuple[np.ndarray, np.ndarray]: + def manip_fk(self, q: np.ndarray = None, node: str = None) -> Tuple[np.ndarray, np.ndarray]: """manipulator specific forward kinematics; uses separate URDF than the full-body fk() method""" assert q.shape == (self.dof,) if "pinocchio" in self._ik_type: q = self._ros_pose_to_pinocchio(q) - ee_pos, ee_quat = self.manip_ik_solver.compute_fk(q) + ee_pos, ee_quat = self.manip_ik_solver.compute_fk(q, node) return ee_pos.copy(), ee_quat.copy() def update_head(self, qi: np.ndarray, look_at) -> np.ndarray: diff --git a/src/stretch/motion/pinocchio_ik_solver.py b/src/stretch/motion/pinocchio_ik_solver.py index 69dada759..9b887463a 100644 --- a/src/stretch/motion/pinocchio_ik_solver.py +++ b/src/stretch/motion/pinocchio_ik_solver.py @@ -99,7 +99,7 @@ 
def get_all_joint_names(self) -> List[str]: return [self.model.names[i + 1] for i in range(self.model.nq)] def _qmap_control2model( - self, q_input: np.ndarray, ignore_missing_joints: bool = False + self, q_input: Union[np.ndarray, dict], ignore_missing_joints: bool = False ) -> np.ndarray: """returns a full joint configuration from a partial joint configuration""" q_out = self.q_neutral.copy() @@ -134,6 +134,35 @@ def _qmap_model2control(self, q_input: np.ndarray) -> np.ndarray: return q_out + def get_frame_pose( + self, + config: Union[np.ndarray, dict], + node_a: str, + node_b: str, + ignore_missing_joints: bool = False, + ): + """ + Get a transformation matrix transforming from node_a frame to node_b frame + """ + q_model = self._qmap_control2model(config, ignore_missing_joints=ignore_missing_joints) + # print('q_model', q_model) + pinocchio.forwardKinematics(self.model, self.data, q_model) + frame_idx1 = [f.name for f in self.model.frames].index(node_a) + frame_idx2 = [f.name for f in self.model.frames].index(node_b) + # print(frame_idx1) + # print(frame_idx2) + # print(self.model.getFrameId(node_a)) + # print(self.model.getFrameId(node_b)) + # frame_idx1 = self.model.getFrameId(node_a) + # frame_idx2 = self.model.getFrameId(node_b) + pinocchio.updateFramePlacement(self.model, self.data, frame_idx1) + placement_frame1 = self.data.oMf[frame_idx1] + pinocchio.updateFramePlacement(self.model, self.data, frame_idx2) + placement_frame2 = self.data.oMf[frame_idx2] + # print('pin 1', placement_frame1) + # print('pin 2', placement_frame2) + return placement_frame2.inverse() * placement_frame1 + def compute_fk( self, config: np.ndarray, link_name: str = None, ignore_missing_joints: bool = False ) -> Tuple[np.ndarray, np.ndarray]: @@ -182,8 +211,11 @@ def compute_ik( max iterations: time budget in number of steps; included for compatibility with pb """ i = 0 - _ee_frame = self.ee_frame_idx if custom_ee_frame is None else custom_ee_frame - _ee_frame_idx = [f.name for f in self.model.frames].index(_ee_frame) + if custom_ee_frame is not None: + _ee_frame_idx = [f.name for f in self.model.frames].index(custom_ee_frame) + else: + _ee_frame_idx = self.ee_frame_idx + if q_init is None: q = self.q_neutral.copy() if num_attempts > 1: @@ -199,7 +231,6 @@ def compute_ik( while True: pinocchio.forwardKinematics(self.model, self.data, q) pinocchio.updateFramePlacement(self.model, self.data, _ee_frame_idx) - dMi = desired_ee_pose.actInv(self.data.oMf[_ee_frame_idx]) err = pinocchio.log(dMi).vector if verbose: diff --git a/src/stretch/requirements.txt b/src/stretch/requirements.txt new file mode 100644 index 000000000..339a82c8a --- /dev/null +++ b/src/stretch/requirements.txt @@ -0,0 +1,66 @@ +torch_cluster == 1.6.2 +torch_geometric +ultralytics +wget +# SAM2 +# conda install pyg -c pyg +google-generativeai + +# openai +# zmq +# torch +# ultralytics +# rerun-sdk +# numpy <1.24 # certain deprecated operations were used in other deps +# scipy +# opencv-python +# pybullet +# trimesh +# pytest +# open3d +# scikit-image +# scikit-fmm +# scikit-learn +# numpy-quaternion +# natsort +# # Neural networks +# openai-clip +# timm +# pandas >= 2.1.3 +# mss +# # visualizations and debugging +# matplotlib +# # Command line tools +# click +# transformers +# data tools +# yacs +# # h5py +# imageio +# pygifsicle +# pynput +# torch_geometric +# torch_cluster +# atomicwrites +# lxml +# transforms3d + +# git+https://github.com/facebookresearch/segment-anything-2.git + +# accelerate +# wget +# setuptools==69.5.1 +# loguru +# 
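The get_frame_pose helper above is essentially pinocchio's frame-placement API composed into a relative transform. A minimal standalone sketch follows; the URDF path is a placeholder and the two frame names are assumed to exist in it. Note that model.getFrameId silently returns model.nframes for unknown names, which is likely why the code above looks frames up by name in model.frames instead.

import pinocchio as pin

model = pin.buildModelFromUrdf("stretch.urdf")  # placeholder path
data = model.createData()

q = pin.neutral(model)
pin.forwardKinematics(model, data, q)
pin.updateFramePlacements(model, data)

idx_a = model.getFrameId("base_link")
idx_b = model.getFrameId("link_grasp_center")

# SE3 transform of frame a expressed in frame b, mirroring
# placement_frame2.inverse() * placement_frame1 above
bMa = data.oMf[idx_b].inverse() * data.oMf[idx_a]
print(bMa.translation, bMa.rotation)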
sophuspy==1.2.0 +# sentencepiece +# protobuf +# packaging +# pyparsing +# cycler +# kiwisolver +# six +# importlib_metadata +# pytz +# psutil +# transforms3d +# lxml diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 6804c31f2..8c15f7087 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -219,6 +219,7 @@ def __init__( vectors=[[1, 0, 0], [0, 1, 0], [0, 0, 1]], colors=[[255, 0, 0], [0, 255, 0], [0, 0, 255]], ), + static=True, ) self.bbox_colors_memory = {} @@ -252,7 +253,8 @@ def log_head_camera(self, obs: Observations): obs (Observations): Observation dataclass """ rr.set_time_seconds("realtime", time.time()) - rr.log("world/head_camera/rgb", rr.Image(obs.rgb)) + rr.log("world/head_camera/rgb", rr.Image(obs.rgb), static=True) + # rr.log("world/head_camera/depth", rr.DepthImage(obs["depth"]), static=True) if self.show_camera_point_clouds: head_xyz = obs.get_xyz_in_world_frame().reshape(-1, 3) @@ -294,7 +296,12 @@ def log_robot_xyt(self, obs: Observations): labels="robot", colors=[255, 0, 0, 255], ) - rr.log("world/robot/arrow", rb_arrow) + rr.log("world/robot/arrow", rb_arrow, static=True) + rr.log( + "world/robot/blob", + rr.Points3D([0, 0, 0], colors=[255, 0, 0, 255], radii=0.13), + static=True, + ) rr.log( "world/robot", rr.Transform3D( @@ -302,6 +309,7 @@ def log_robot_xyt(self, obs: Observations): rotation=rr.RotationAxisAngle(axis=[0, 0, 1], radians=theta), axis_length=0.7, ), + static=True, ) def log_ee_frame(self, obs): @@ -315,8 +323,14 @@ def log_ee_frame(self, obs): ee_arrow = rr.Arrows3D( origins=[0, 0, 0], vectors=[0.2, 0, 0], radii=0.02, labels="ee", colors=[0, 255, 0, 255] ) + # rr.log("world/ee/arrow", ee_arrow, static=True) + # rr.log( + # "world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static=True + # ) # rr.log("world/ee/arrow", ee_arrow) - rr.log("world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3)) + rr.log( + "world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3, static=True) + ) def log_ee_camera(self, servo): """Log end effector camera pose and images @@ -325,7 +339,7 @@ def log_ee_camera(self, servo): """ rr.set_time_seconds("realtime", time.time()) # EE Camera - rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb)) + rr.log("world/ee_camera/rgb", rr.Image(servo.ee_rgb), static=True) if self.show_camera_point_clouds: ee_xyz = servo.get_ee_xyz_in_world_frame().reshape(-1, 3) @@ -339,7 +353,7 @@ def log_ee_camera(self, servo): ), ) else: - rr.log("world/ee_camera/depth", rr.depthimage(servo.ee_depth)) + rr.log("world/ee_camera/depth", rr.depthimage(servo.ee_depth), static=True) if self.show_cameras_in_3d_view: rot, trans = decompose_homogeneous_matrix(servo.ee_camera_pose) @@ -360,7 +374,11 @@ def log_robot_state(self, obs): rr.set_time_seconds("realtime", time.time()) state = obs["joint"] for k in HelloStretchIdx.name_to_idx: - rr.log(f"robot_state/joint_pose/{k}", rr.Scalar(state[HelloStretchIdx.name_to_idx[k]])) + rr.log( + f"robot_state/joint_pose/{k}", + rr.Scalar(state[HelloStretchIdx.name_to_idx[k]]), + static=True, + ) def log_robot_transforms(self, obs): """ @@ -390,7 +408,10 @@ def update_voxel_map( rr.log( "world/point_cloud", rr.Points3D( - positions=points, radii=np.ones(rgb.shape[0]) * world_radius, colors=np.int64(rgb) + positions=points, + radii=np.ones(rgb.shape[0]) * world_radius, + colors=np.int64(rgb), + static=True, ), ) @@ -419,6 +440,7 @@ def update_voxel_map( radii=np.ones(points.shape[0]) * 
obstacle_radius, colors=[255, 0, 0], ), + static=True, ) rr.log( "world/explored", @@ -427,6 +449,7 @@ def update_voxel_map( radii=np.ones(points.shape[0]) * explored_radius, colors=[255, 255, 255], ), + static=True, ) t6 = timeit.default_timer() @@ -465,6 +488,7 @@ def update_scene_graph( rr.log( f"world/{instance.id}_{name}", rr.Points3D(positions=point_cloud_rgb, colors=np.int64(pcd_rgb)), + static=True, ) half_sizes = [(b[0] - b[1]) / 2 for b in bbox_bounds] bounds.append(half_sizes) @@ -482,6 +506,7 @@ def update_scene_graph( radii=0.01, colors=colors, ), + static=True, ) t1 = timeit.default_timer() print("Time to log scene graph objects: ", t1 - t0) diff --git a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/modules/nav.py b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/modules/nav.py index 6bdc709d6..eb0cd57a3 100644 --- a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/modules/nav.py +++ b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/modules/nav.py @@ -155,8 +155,8 @@ def set_velocity(self, v, w): Directly sets the linear and angular velocity of robot base. """ msg = Twist() - msg.linear.x = v - msg.angular.z = w + msg.linear.x = float(v) + msg.angular.z = float(w) self._ros_client.goto_off_service.call(Trigger.Request()) self._ros_client.velocity_pub.publish(msg) From 354bf15c1dcade003815e987f337c65ea79d54c8 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 26 Sep 2024 11:13:41 -0400 Subject: [PATCH 239/410] update --- src/stretch/dynav/run_ok_nav.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 28260ce9b..b8e16e003 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -41,6 +41,9 @@ def compute_tilt(camera_xyz, target_xyz): @click.option("--method", default="dynamem", type=str) @click.option("--env", default=1, type=int) @click.option("--test", default=1, type=int) +@click.option( + "--robot_ip", type=str, default="", help="Robot IP address (leave empty for saved default)" +) @click.option( "--input-path", type=click.Path(), @@ -57,6 +60,7 @@ def main( env: int = 1, test: int = 1, input_path: str = None, + robot_ip: str = "", **kwargs, ): """ @@ -66,7 +70,7 @@ def main( random_goals(bool): randomly sample frontier goals instead of looking for closest """ click.echo("Will connect to a Stretch robot and collect a short trajectory.") - robot = RobotClient(robot_ip="100.79.44.11") + robot = RobotClient(robot_ip=robot_ip) print("- Load parameters") parameters = get_parameters("dynav_config.yaml") From ae00d760dbeec7fae74927a11ef85a408546d964 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 26 Sep 2024 11:23:03 -0400 Subject: [PATCH 240/410] add a script to copy the URDF over --- scripts/copy_calibrated_stretch_urdf.sh | 49 +++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 scripts/copy_calibrated_stretch_urdf.sh diff --git a/scripts/copy_calibrated_stretch_urdf.sh b/scripts/copy_calibrated_stretch_urdf.sh new file mode 100644 index 000000000..e528df730 --- /dev/null +++ b/scripts/copy_calibrated_stretch_urdf.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Default user +USER="hello-robot" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --robot_ip) + ROBOT_IP="$2" + shift 2 + ;; + --user) + USER="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# If ROBOT_IP is not set, try to load it from the file +if [ -z "$ROBOT_IP" ]; then + if [ -f 
~/.stretch/robot_ip.txt ]; then + ROBOT_IP=$(cat ~/.stretch/robot_ip.txt) + else + echo "Error: Robot IP not provided and ~/.stretch/robot_ip.txt does not exist." + exit 1 + fi +fi + +# Check if ROBOT_IP is set +if [ -z "$ROBOT_IP" ]; then + echo "Error: Robot IP is not set." + exit 1 +fi + +# Run the scp command +scp "$USER@$ROBOT_IP:~/ament_ws/src/stretch_ros2/stretch_description/urdf/stretch.urdf" $HOME/.stretch/stretch.urdf + +# Check if the scp command was successful +if [ $? -eq 0 ]; then + echo "File successfully copied." +else + echo "Error: Failed to copy the file." + exit 1 +fi From 9e109f9fe696f382b478805ca416056114367c6f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 26 Sep 2024 11:23:40 -0400 Subject: [PATCH 241/410] updates to urdf --- src/stretch/config/urdf/stretch.urdf | 1211 ++++++++++++++++++++++++++ 1 file changed, 1211 insertions(+) create mode 100644 src/stretch/config/urdf/stretch.urdf diff --git a/src/stretch/config/urdf/stretch.urdf b/src/stretch/config/urdf/stretch.urdf new file mode 100644 index 000000000..7049aa725 --- /dev/null +++ b/src/stretch/config/urdf/stretch.urdf @@ -0,0 +1,1211 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + From e051a39cf9742343faa4510f0be5b559f4611c7f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 26 Sep 2024 11:26:47 -0400 Subject: [PATCH 242/410] remove static flags --- src/setup.py | 1 + src/stretch/visualization/rerun.py | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/setup.py b/src/setup.py index c6829fe65..efa03384a 100644 --- a/src/setup.py +++ b/src/setup.py @@ -49,6 +49,7 @@ "pyusb", "schema", "overrides", + "wget", # From openai "openai", "openai-clip", diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 8c15f7087..5759ef180 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -166,7 +166,6 @@ def log_transforms(self, obs, debug: bool = False): mat3x3=self.link_poses[idx][:3, :3], axis_length=0.0, ), - static=False, ) t2 = timeit.default_timer() if debug: @@ -309,7 +308,6 @@ def log_robot_xyt(self, obs: Observations): rotation=rr.RotationAxisAngle(axis=[0, 0, 1], radians=theta), axis_length=0.7, ), - static=True, ) def log_ee_frame(self, obs): @@ -328,9 +326,7 @@ def log_ee_frame(self, obs): # "world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3), static=True # ) # rr.log("world/ee/arrow", ee_arrow) - rr.log( - "world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3, static=True) - ) + rr.log("world/ee", rr.Transform3D(translation=trans, mat3x3=rot, axis_length=0.3)) def log_ee_camera(self, servo): """Log end effector camera pose and images From 36bbb4ea215808b184acad13cbfb22749d00b37b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 26 Sep 2024 11:34:54 -0400 Subject: [PATCH 243/410] remove "static" --- src/stretch/visualization/rerun.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 5759ef180..0fe4b3a49 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -299,7 +299,6 @@ def log_robot_xyt(self, obs: Observations): rr.log( "world/robot/blob", rr.Points3D([0, 0, 0], colors=[255, 0, 0, 255], radii=0.13), - static=True, ) rr.log( "world/robot", @@ -407,7 +406,6 @@ def update_voxel_map( positions=points, radii=np.ones(rgb.shape[0]) * world_radius, colors=np.int64(rgb), - static=True, ), ) @@ -436,7 +434,6 @@ def update_voxel_map( radii=np.ones(points.shape[0]) * obstacle_radius, colors=[255, 0, 0], ), - static=True, ) rr.log( "world/explored", @@ -445,7 +442,6 @@ def update_voxel_map( radii=np.ones(points.shape[0]) * explored_radius, colors=[255, 255, 255], ), - static=True, ) t6 = timeit.default_timer() From 7820459026fc7bba2252ee9c494f99bb2833ce3e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 26 Sep 2024 11:46:00 -0400 Subject: [PATCH 244/410] Update code to make sure rerun works etc --- src/stretch/agent/zmq_client.py | 4 ++-- src/stretch/dynav/run_ok_nav.py | 2 +- src/stretch/dynav/voxel_map_server.py | 25 +++++++++++++++---------- src/stretch/visualization/rerun.py | 2 +- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 05967d45c..f36fa7fd9 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -33,7 +33,7 @@ from stretch.utils.image import Camera from stretch.utils.memory import lookup_address from stretch.utils.point_cloud import show_point_cloud -from stretch.visualization.rerun import RerunVsualizer +from stretch.visualization.rerun import RerunVisualizer # TODO: 
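These commits add and then remove rerun's static flag on various log calls; the distinction is that a static entity is logged once outside the timeline, while everything else is keyed to the current time and replays when scrubbing. A minimal standalone sketch of the two cases (entity paths are arbitrary):

import numpy as np
import rerun as rr

rr.init("static_flag_demo", spawn=True)

# Logged once, visible at every point on the timeline
rr.log(
    "world/xyz",
    rr.Arrows3D(
        origins=np.zeros((3, 3)),
        vectors=np.eye(3),
        colors=[[255, 0, 0], [0, 255, 0], [0, 0, 255]],
    ),
    static=True,
)

# Logged per frame; scrubbing the timeline replays the motion
for t in range(10):
    rr.set_time_seconds("realtime", float(t))
    rr.log("world/robot", rr.Points3D([[0.1 * t, 0.0, 0.0]], radii=0.05))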
debug code - remove later if necessary # import faulthandler @@ -181,7 +181,7 @@ def __init__( self._servo_lock = Lock() if enable_rerun_server: - self._rerun = RerunVsualizer() + self._rerun = RerunVisualizer() else: self._rerun = None self._rerun_thread = None diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index b8e16e003..c1ec0c10d 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -37,7 +37,7 @@ def compute_tilt(camera_xyz, target_xyz): @click.option("--manual-wait", default=False, is_flag=True) @click.option("--random-goals", default=False, is_flag=True) @click.option("--explore-iter", default=-1) -@click.option("--re", default=1, type=int) +@click.option("--re", default=3, type=int, help="Choose between stretch RE1, RE2, RE3") @click.option("--method", default="dynamem", type=str) @click.option("--env", default=1, type=int) @click.option("--test", default=1, type=int) diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index 433e4b899..1e6a2b8d2 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -38,6 +38,7 @@ # from transformers import AutoProcessor, OwlViTForObjectDetection from ultralytics import YOLOWorld +import stretch.utils.logger as logger from stretch.core import get_parameters from stretch.dynav.communication_util import load_socket, recv_everything from stretch.dynav.mapping_utils.a_star import AStar @@ -113,17 +114,18 @@ class ImageProcessor: def __init__( self, vision_method="mask*lip", - siglip=True, - device="cuda", - min_depth=0.25, - max_depth=2.5, - img_port=5558, - text_port=5556, - open_communication=True, - rerun=True, - static=True, + siglip: bool = True, + device: str = "cuda", + min_depth: float = 0.25, + max_depth: float = 2.5, + img_port: int = 5558, + text_port: int = 5556, + open_communication: bool = True, + rerun: bool = True, + static: bool = True, log=None, image_shape=(400, 300), + rerun_server_memory_limit: str = "4GB", ): self.static = static self.siglip = siglip @@ -138,7 +140,10 @@ def __init__( rr.init(self.log, spawn=False) rr.connect("100.108.67.79:9876") else: - rr.init(self.log, spawn=True) + # rr.init(self.log, spawn=False) + # rr.connect("100.108.67.79:9876") + logger.info("Attempting to connect to existing rerun server.") + self.min_depth = min_depth self.max_depth = max_depth self.obs_count = 0 diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 0fe4b3a49..7a3dea6af 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -174,7 +174,7 @@ def log_transforms(self, obs, debug: bool = False): print("Total time to log robot transforms (ms): ", 1000 * (t2 - t0)) -class RerunVsualizer: +class RerunVisualizer: camera_point_radius = 0.01 From a8a19af23958bed4e4d5fc2b1546ead51693e2f3 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 26 Sep 2024 14:16:17 -0400 Subject: [PATCH 245/410] adding some things --- .../dynav/mapping_utils/voxelized_pcd.py | 56 +++++++++++++-- src/stretch/utils/voxel.py | 70 +++++++++++++++++++ 2 files changed, 120 insertions(+), 6 deletions(-) diff --git a/src/stretch/dynav/mapping_utils/voxelized_pcd.py b/src/stretch/dynav/mapping_utils/voxelized_pcd.py index e2898434b..c2f948103 100644 --- a/src/stretch/dynav/mapping_utils/voxelized_pcd.py +++ b/src/stretch/dynav/mapping_utils/voxelized_pcd.py @@ -17,12 +17,18 @@ import torch from torch import Tensor -USE_TORCH_GEOMETRIC = True +import 
stretch.utils.logger as logger + +USE_TORCH_GEOMETRIC = False if USE_TORCH_GEOMETRIC: - from torch_geometric.nn.pool.consecutive import consecutive_cluster - from torch_geometric.nn.pool.voxel_grid import voxel_grid - from torch_geometric.utils import scatter -else: + try: + from torch_geometric.nn.pool.consecutive import consecutive_cluster + from torch_geometric.nn.pool.voxel_grid import voxel_grid + from torch_geometric.utils import scatter + except: + logger.warning("torch_geometric not found, falling back to custom implementation") + USE_TORCH_GEOMETRIC = False +if not USE_TORCH_GEOMETRIC: from stretch.utils.torch_geometric import consecutive_cluster, voxel_grid from stretch.utils.torch_scatter import scatter @@ -564,7 +570,7 @@ def reduce_pointcloud( ) -def scatter3d( +def old_scatter3d( voxel_indices: Tensor, weights: Tensor, grid_dimensions: List[int], @@ -602,6 +608,44 @@ def scatter3d( return voxel_weights.reshape(*grid_dimensions) +def scatter3d( + voxel_indices: Tensor, weights: Tensor, grid_dimensions: List[int], method: Optional[str] = None +) -> Tensor: + """Scatter weights into a 3d voxel grid of the appropriate size. + + Args: + voxel_indices (LongTensor): [N, 3] indices to scatter values to. + weights (FloatTensor): [N] values of equal size to scatter through voxel map. + grid_dimenstions (List[int]): sizes of the resulting voxel map, should be 3d. + + Returns: + voxels (FloatTensor): [grid_dimensions] voxel map containing combined weights.""" + + assert voxel_indices.shape[0] == weights.shape[0], "weights and indices must match" + assert len(grid_dimensions) == 3, "this is designed to work only in 3d" + assert voxel_indices.shape[-1] == 3, "3d points expected for indices" + + N, F = weights.shape + X, Y, Z = grid_dimensions + + # Compute voxel indices for each point + # voxel_indices = (points / voxel_size).long().clamp(min=0, max=torch.tensor(grid_size) - 1) + voxel_indices = voxel_indices.clamp( + min=torch.zeros(3), max=torch.tensor(grid_dimensions) - 1 + ).long() + + # Reduce according to min/max/mean or none + logger.warning(f"Scattering {N} points into {X}x{Y}x{Z} grid, method={method}") + + # Create empty voxel grid + voxel_grid = torch.zeros(*grid_dimensions, F, device=weights.device) + + # Scatter features into voxel grid + voxel_grid[voxel_indices[:, 0], voxel_indices[:, 1], voxel_indices[:, 2]] = weights + voxel_grid.squeeze_(-1) + return voxel_grid + + def drop_smallest_weight_points( points: Tensor, voxel_size: float = 0.01, diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index 82cbca90e..2c2f6f940 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -31,6 +31,76 @@ from stretch.utils.torch_geometric import consecutive_cluster, voxel_grid from stretch.utils.torch_scatter import scatter +from typing import Literal + +import torch +from torch import Tensor + + +def merge_features( + idx: Tensor, features: Tensor, method: Literal["sum", "min", "max", "mean"] = "sum" +) -> Tensor: + """ + Merge features based on the given indices using the specified method. + + This function takes a tensor of indices and a tensor of features, and merges + the features for duplicate indices according to the specified method. + + Args: + idx (Tensor): A 1D integer tensor containing indices, possibly with duplicates. + features (Tensor): A 2D float tensor of shape (len(idx), feature_dim) containing + feature vectors corresponding to each index. + method (Literal['sum', 'min', 'max', 'mean']): The method to use for merging + features. 
Default is 'sum'. + + Returns: + Tensor: A 2D tensor of shape (num_unique_idx, feature_dim) containing the + merged features. + + Raises: + ValueError: If an invalid merge method is specified. + + Example: + >>> idx = torch.tensor([0, 1, 0, 2, 1]) + >>> features = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [9.0, 10.0]]) + >>> merge_features(idx, features, method='sum') + tensor([[ 6.0, 8.0], + [12.0, 14.0], + [ 7.0, 8.0]]) + """ + if idx.dim() != 1: + raise ValueError("idx must be a 1D tensor") + if features.dim() != 2 or features.size(0) != idx.size(0): + raise ValueError("features must be a 2D tensor with shape (len(idx), feature_dim)") + + unique_idx, inverse_idx = torch.unique(idx, return_inverse=True) + num_unique = unique_idx.size(0) + feature_dim = features.size(1) + + if method == "sum": + merged = torch.zeros(num_unique, feature_dim, dtype=features.dtype, device=features.device) + merged.index_add_(0, inverse_idx, features) + elif method == "min": + merged = torch.full( + (num_unique, feature_dim), float("inf"), dtype=features.dtype, device=features.device + ) + merged = torch.min(merged.index_copy(0, inverse_idx, features), merged) + elif method == "max": + merged = torch.full( + (num_unique, feature_dim), float("-inf"), dtype=features.dtype, device=features.device + ) + merged = torch.max(merged.index_copy(0, inverse_idx, features), merged) + elif method == "mean": + merged = torch.zeros(num_unique, feature_dim, dtype=features.dtype, device=features.device) + merged.index_add_(0, inverse_idx, features) + count = torch.zeros(num_unique, dtype=torch.int, device=features.device) + count.index_add_(0, inverse_idx, torch.ones_like(inverse_idx)) + merged /= count.unsqueeze(1) + else: + raise ValueError("Invalid merge method. 
Choose from 'sum', 'min', 'max', or 'mean'.") + + return merged + class VoxelizedPointcloud: _INTERNAL_TENSORS = [ From dc47258a32c71cb528256550e3fd1341525709b2 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 26 Sep 2024 14:57:53 -0400 Subject: [PATCH 246/410] Fixing typing issues --- .../dynav/mapping_utils/voxelized_pcd.py | 7 ++++++ src/stretch/utils/voxel.py | 25 +++++++++++++------ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/stretch/dynav/mapping_utils/voxelized_pcd.py b/src/stretch/dynav/mapping_utils/voxelized_pcd.py index c2f948103..a74f2fddf 100644 --- a/src/stretch/dynav/mapping_utils/voxelized_pcd.py +++ b/src/stretch/dynav/mapping_utils/voxelized_pcd.py @@ -18,6 +18,7 @@ from torch import Tensor import stretch.utils.logger as logger +from stretch.utils.voxel import merge_features USE_TORCH_GEOMETRIC = False if USE_TORCH_GEOMETRIC: @@ -625,6 +626,9 @@ def scatter3d( assert len(grid_dimensions) == 3, "this is designed to work only in 3d" assert voxel_indices.shape[-1] == 3, "3d points expected for indices" + if len(voxel_indices) == 0: + return torch.zeros(*grid_dimensions, device=weights.device) + N, F = weights.shape X, Y, Z = grid_dimensions @@ -637,6 +641,9 @@ def scatter3d( # Reduce according to min/max/mean or none logger.warning(f"Scattering {N} points into {X}x{Y}x{Z} grid, method={method}") + if method is not None and method != "any": + merge_features(voxel_indices, weights, method=method) + # Create empty voxel grid voxel_grid = torch.zeros(*grid_dimensions, F, device=weights.device) diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index 2c2f6f940..ec5008f5b 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -38,8 +38,8 @@ def merge_features( - idx: Tensor, features: Tensor, method: Literal["sum", "min", "max", "mean"] = "sum" -) -> Tensor: + idx: Tensor, features: Tensor, method: str | Literal["sum", "min", "max", "mean"] = "sum" +) -> Tuple[Tensor, Tensor]: """ Merge features based on the given indices using the specified method. @@ -54,20 +54,28 @@ def merge_features( features. Default is 'sum'. Returns: - Tensor: A 2D tensor of shape (num_unique_idx, feature_dim) containing the - merged features. + Tuple[Tensor, Tensor]: A tuple containing: + - A 2D tensor of shape (num_unique_idx, feature_dim) containing the + merged features. + - A 1D tensor of unique indices corresponding to the merged features. Raises: - ValueError: If an invalid merge method is specified. + ValueError: If an invalid merge method is specified or if input tensors + have incorrect dimensions. Example: >>> idx = torch.tensor([0, 1, 0, 2, 1]) >>> features = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [9.0, 10.0]]) - >>> merge_features(idx, features, method='sum') + >>> unique_idx, merged_features = merge_features(idx, features, method='sum') + >>> print(merged_features) tensor([[ 6.0, 8.0], [12.0, 14.0], [ 7.0, 8.0]]) + >>> print(unique_idx) + tensor([0, 1, 2]) """ + print(idx.dim(), idx.shape) + breakpoint() if idx.dim() != 1: raise ValueError("idx must be a 1D tensor") if features.dim() != 2 or features.size(0) != idx.size(0): @@ -99,7 +107,7 @@ def merge_features( else: raise ValueError("Invalid merge method. 
Choose from 'sum', 'min', 'max', or 'mean'.") - return merged + return unique_idx, merged class VoxelizedPointcloud: @@ -547,6 +555,9 @@ def scatter3d(voxel_indices: Tensor, weights: Tensor, grid_dimensions: List[int] assert len(grid_dimensions) == 3, "this is designed to work only in 3d" assert voxel_indices.shape[-1] == 3, "3d points expected for indices" + if len(voxel_indices) == 0: + return torch.zeros(*grid_dimensions, device=weights.device) + N, F = weights.shape X, Y, Z = grid_dimensions From 71f21d605a7d9be1971834f93e77d173fbff11e4 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 26 Sep 2024 16:24:58 -0400 Subject: [PATCH 247/410] update --- .../dynav/mapping_utils/voxelized_pcd.py | 7 +-- .../torch_scatter/torch_scatter_helpers.py | 15 ++++- src/stretch/utils/voxel.py | 57 +++++++++++++++++-- 3 files changed, 68 insertions(+), 11 deletions(-) diff --git a/src/stretch/dynav/mapping_utils/voxelized_pcd.py b/src/stretch/dynav/mapping_utils/voxelized_pcd.py index a74f2fddf..1b1ff783b 100644 --- a/src/stretch/dynav/mapping_utils/voxelized_pcd.py +++ b/src/stretch/dynav/mapping_utils/voxelized_pcd.py @@ -639,16 +639,15 @@ def scatter3d( ).long() # Reduce according to min/max/mean or none - logger.warning(f"Scattering {N} points into {X}x{Y}x{Z} grid, method={method}") - if method is not None and method != "any": - merge_features(voxel_indices, weights, method=method) + logger.warning(f"Scattering {N} points into {X}x{Y}x{Z} grid, method={method}") + merge_features(voxel_indices, weights, grid_dimensions=grid_dimensions, method=method) # Create empty voxel grid voxel_grid = torch.zeros(*grid_dimensions, F, device=weights.device) # Scatter features into voxel grid - voxel_grid[voxel_indices[:, 0], voxel_indices[:, 1], voxel_indices[:, 2]] = weights + voxel_grid[voxel_indices[:, 0], voxel_indices[:, 1], voxel_indices[:, 2]] = weights.float() voxel_grid.squeeze_(-1) return voxel_grid diff --git a/src/stretch/utils/torch_scatter/torch_scatter_helpers.py b/src/stretch/utils/torch_scatter/torch_scatter_helpers.py index b1cfcbcd8..db977d8ae 100644 --- a/src/stretch/utils/torch_scatter/torch_scatter_helpers.py +++ b/src/stretch/utils/torch_scatter/torch_scatter_helpers.py @@ -109,6 +109,18 @@ def scatter_min( return torch.ops.torch_scatter.scatter_min(src, index, dim, out, dim_size) +def manual_scatter_max(src, index, dim=-1, out=None, dim_size=None): + if out is None: + size = list(src.size()) + if dim_size is not None: + size[dim] = dim_size + else: + size[dim] = int(index.max()) + 1 + out = src.new_zeros(size) + + return out.scatter_reduce(dim, index, src, reduce="amax") + + def scatter_max( src: torch.Tensor, index: torch.Tensor, @@ -209,7 +221,8 @@ def _scatter( elif reduce == "min": return scatter_min(src, index, dim, out, dim_size)[0] elif reduce == "max": - return scatter_max(src, index, dim, out, dim_size)[0] + # return scatter_max(src, index, dim, out, dim_size)[0] + return manual_scatter_max(src, index, dim, out, dim_size) else: raise ValueError diff --git a/src/stretch/utils/voxel.py b/src/stretch/utils/voxel.py index ec5008f5b..98e859b60 100644 --- a/src/stretch/utils/voxel.py +++ b/src/stretch/utils/voxel.py @@ -37,8 +37,49 @@ from torch import Tensor +def xyz_to_flat_index(xyz, grid_size): + """ + Convert N x 3 tensor of XYZ coordinates to flat indices. 
+ + Args: + xyz (torch.Tensor): N x 3 tensor of XYZ coordinates + grid_size (torch.Tensor or list): Size of the grid in each dimension [X, Y, Z] + + Returns: + torch.Tensor: N tensor of flat indices + """ + if isinstance(grid_size, list): + grid_size = torch.tensor(grid_size) + + return xyz[:, 0] + xyz[:, 1] * grid_size[0] + xyz[:, 2] * grid_size[0] * grid_size[1] + + +def flat_index_to_xyz(flat_index, grid_size): + """ + Convert flat indices to N x 3 tensor of XYZ coordinates. + + Args: + flat_index (torch.Tensor): N tensor of flat indices + grid_size (torch.Tensor or list): Size of the grid in each dimension [X, Y, Z] + + Returns: + torch.Tensor: N x 3 tensor of XYZ coordinates + """ + if isinstance(grid_size, list): + grid_size = torch.tensor(grid_size) + + z = flat_index // (grid_size[0] * grid_size[1]) + y = (flat_index % (grid_size[0] * grid_size[1])) // grid_size[0] + x = flat_index % grid_size[0] + + return torch.stack([x, y, z], dim=1) + + def merge_features( - idx: Tensor, features: Tensor, method: str | Literal["sum", "min", "max", "mean"] = "sum" + idx: Tensor, + features: Tensor, + method: str | Literal["sum", "min", "max", "mean"] = "sum", + grid_dimensions: Optional[List[int]] = None, ) -> Tuple[Tensor, Tensor]: """ Merge features based on the given indices using the specified method. @@ -74,10 +115,11 @@ def merge_features( >>> print(unique_idx) tensor([0, 1, 2]) """ - print(idx.dim(), idx.shape) - breakpoint() - if idx.dim() != 1: - raise ValueError("idx must be a 1D tensor") + if idx.dim() == 2 and idx.shape[-1] == 3: + # Convert from voxel indices + idx = xyz_to_flat_index(idx, grid_size=grid_dimensions) + elif idx.dim() != 1: + raise ValueError("idx must be a 1D tensor or a N x 3 tensor; was {}".format(idx.shape)) if features.dim() != 2 or features.size(0) != idx.size(0): raise ValueError("features must be a 2D tensor with shape (len(idx), feature_dim)") @@ -95,7 +137,7 @@ def merge_features( merged = torch.min(merged.index_copy(0, inverse_idx, features), merged) elif method == "max": merged = torch.full( - (num_unique, feature_dim), float("-inf"), dtype=features.dtype, device=features.device + (num_unique, feature_dim), -1, dtype=features.dtype, device=features.device ) merged = torch.max(merged.index_copy(0, inverse_idx, features), merged) elif method == "mean": @@ -107,6 +149,9 @@ def merge_features( else: raise ValueError("Invalid merge method. 
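manual_scatter_max above leans on torch.Tensor.scatter_reduce; a small standalone check of that reduction with made-up values is below. With the default include_self=True, the zero-initialized output also participates in the max, so strictly negative sources would be clamped at zero by this fallback.

import torch

src = torch.tensor([1.0, 5.0, 2.0, 7.0, 3.0])
index = torch.tensor([0, 1, 0, 2, 1])

out = torch.zeros(3)
out = out.scatter_reduce(0, index, src, reduce="amax")
print(out)  # tensor([2., 5., 7.])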
Choose from 'sum', 'min', 'max', or 'mean'.") + if grid_dimensions is not None: + unique_idx = flat_index_to_xyz(unique_idx, grid_size=grid_dimensions) + return unique_idx, merged From bbe7d6f2c3ed254cbafb4976d215b09c4be420ea Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 27 Sep 2024 12:27:39 -0400 Subject: [PATCH 248/410] fixing type checker issue --- .../utils/data_tools/read_dobbe_format.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py index a2e9c8cc8..ad8655930 100644 --- a/src/stretch/utils/data_tools/read_dobbe_format.py +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -167,7 +167,7 @@ def load_from_raw( raw_dir: str | Path, out_dir: Optional[str | Path], fps: Optional[int] = 15, - save_video: bool = False, + video: bool = False, debug: bool = False, max_episodes: Optional[int] = None, ): @@ -200,7 +200,7 @@ def load_from_raw( for ep_idx, ep_path in tqdm.tqdm(enumerate(episode_dirs), total=len(episode_dirs)): # Dictionary for episode data - ep_dict = {} + ep_dict: Dict[str, Any] = {} num_frames = 0 # Parse observation state and action @@ -228,7 +228,7 @@ def load_from_raw( img_key = f"observation.images.{camera}" depth_key = f"observation.images.{camera}_depth" - if save_video: + if video: video_path = ep_path / f"{camera}_compressed_video_h264.mp4" fname = f"{camera}_episode_{ep_idx:06d}.mp4" @@ -340,7 +340,7 @@ def from_raw_to_lerobot_format( raw_dir: Path | str, out_dir: Optional[Path | str] = None, fps: Optional[int] = None, - save_video: bool = False, + video: bool = False, debug: bool = False, ): """ @@ -350,7 +350,7 @@ def from_raw_to_lerobot_format( raw_dir (Path | str): Path to raw dobb-e format. out_dir (Optional[Path | str], optional): Output directory. Defaults to None. fps (Optional[int], optional): Frames per second. Defaults to None. - save_video (bool, optional): Save video. Defaults to False. + video (bool, optional): Use video. Defaults to False. debug (bool, optional): Debug mode. Defaults to False. 
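A hypothetical call into the loader after the save_video -> video rename above; the dataset path is a placeholder laid out the way the module's own test path expects (episodes under a user/env directory).

from pathlib import Path

from stretch.utils.data_tools.read_dobbe_format import from_raw_to_lerobot_format

raw_dir = Path("data/pickup_pink_cup/default_user/default_env/")  # placeholder
hf_dataset, episode_data_index, info = from_raw_to_lerobot_format(
    raw_dir, out_dir=None, fps=15, video=False, debug=False
)
print(info["fps"], info["video"])  # 15 False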
""" @@ -366,11 +366,11 @@ def from_raw_to_lerobot_format( if fps is None: fps = 15 - data_dir, episode_data_index, info = load_from_raw(raw_dir, out_dir, fps, save_video, debug) - hf_dataset = to_hf_dataset(data_dir, save_video) + data_dir, episode_data_index, info = load_from_raw(raw_dir, out_dir, fps, video, debug) + hf_dataset = to_hf_dataset(data_dir, video) info["fps"] = fps - info["video"] = save_video + info["video"] = video return hf_dataset, episode_data_index, info @@ -380,7 +380,7 @@ def from_raw_to_lerobot_format( test_path = Path("data/pickup_pink_cup/default_user/default_env/") data_dir, episode_data_index, info = load_from_raw( - test_path, out_dir=None, fps=None, save_video=False, debug=False, max_episodes=1 + test_path, out_dir=None, fps=None, video=False, debug=False, max_episodes=1 ) # Pull out the first episode from episode_data_index @@ -389,7 +389,7 @@ def from_raw_to_lerobot_format( # Pull out the first image from data_dir # This is a PIL image - pil_gripper_image = data_dir["observation.images.gripper"] + pil_gripper_image = data_dir["observation.images.gripper"][0] gripper_image = np.array(pil_gripper_image) # Pull out the head image from data_dir From 85f49926a9ba7a93e9af29ffe176f6cd1c194fde Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 27 Sep 2024 15:09:31 -0400 Subject: [PATCH 249/410] Update base --- src/stretch/llms/base.py | 6 ++- src/stretch/llms/molmo_client.py | 74 ++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 src/stretch/llms/molmo_client.py diff --git a/src/stretch/llms/base.py b/src/stretch/llms/base.py index 5c93c2da4..9d77faf91 100644 --- a/src/stretch/llms/base.py +++ b/src/stretch/llms/base.py @@ -10,6 +10,8 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple, Union +import pil + class AbstractPromptBuilder(ABC): """Abstract base class for a prompt generator.""" @@ -95,7 +97,9 @@ def get_history_as_str(self) -> str: return history_str @abstractmethod - def __call__(self, command: str, verbose: bool = False): + def __call__( + self, command: str, image: Optional[pil.Image.Image] = None, verbose: bool = False + ): """Interact with the language model to generate a plan.""" def parse(self, content: str) -> List[Tuple[str, str]]: diff --git a/src/stretch/llms/molmo_client.py b/src/stretch/llms/molmo_client.py new file mode 100644 index 000000000..b7c4fdf8d --- /dev/null +++ b/src/stretch/llms/molmo_client.py @@ -0,0 +1,74 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
+ +from typing import Any, Dict, Optional, Union + +from PIL import Image +from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig + +from stretch.llms.base import AbstractLLMClient, AbstractPromptBuilder + + +class MolmoClient(AbstractLLMClient): + def __init__( + self, + prompt: Union[str, AbstractPromptBuilder], + prompt_kwargs: Optional[Dict[str, Any]] = None, + model: Optional[str] = None, + ): + + if model is None: + model = "allenai/MolmoE-1B-0924" + super().__init__(prompt, prompt_kwargs) + # load the processor + self.processor = AutoProcessor.from_pretrained( + model, trust_remote_code=True, torch_dtype="auto", device_map="auto" + ) + + # load the model + self.model = AutoModelForCausalLM.from_pretrained( + model, trust_remote_code=True, torch_dtype="auto", device_map="auto" + ) + + def __call__(self, command: str, image: Optional[Image.Image] = None, verbose: bool = False): + # process the image and text + inputs = self.processor.process(images=[image] if image is not None else [], text=command) + + # move inputs to the correct device and make a batch of size 1 + inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()} + + # generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated + output = self.model.generate_from_batch( + inputs, + GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"), + tokenizer=self.processor.tokenizer, + ) + + # only get generated tokens; decode them to text + generated_tokens = output[0, inputs["input_ids"].size(1) :] + generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True) + + return generated_text + + +if __name__ == "__main__": + import requests + + image = Image.open(requests.get("https://picsum.photos/id/237/536/354", stream=True).raw) + + # import matplotlib.pyplot as plt + + client = MolmoClient(prompt=None) + + generated_text = client(command="Describe this image.", image=image) + + # print the generated text + # >>> This photograph captures a small black puppy, likely a Labrador or a similar breed, + # sitting attentively on a weathered wooden deck. The deck, composed of three... + print(generated_text) From 955815d24c3dc21f7ba9e33ae7f678d3e0bdd5ab Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 27 Sep 2024 15:17:05 -0400 Subject: [PATCH 250/410] Update molmo documentation --- src/stretch/llms/molmo_client.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/stretch/llms/molmo_client.py b/src/stretch/llms/molmo_client.py index b7c4fdf8d..c937e898e 100644 --- a/src/stretch/llms/molmo_client.py +++ b/src/stretch/llms/molmo_client.py @@ -37,6 +37,14 @@ def __init__( ) def __call__(self, command: str, image: Optional[Image.Image] = None, verbose: bool = False): + """Run the model on the given command and image. + + Args: + command (str): The command to run the model on. + image (Optional[Image.Image], optional): The image to run the model on. Defaults to None. + verbose (bool, optional): Whether to print the generated text. Defaults to False. 
+ """ + # process the image and text inputs = self.processor.process(images=[image] if image is not None else [], text=command) From 9a0205498bcf0388c45c494e010f245d0e54c39f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 27 Sep 2024 15:17:37 -0400 Subject: [PATCH 251/410] Update molmo docs --- src/stretch/llms/molmo_client.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/stretch/llms/molmo_client.py b/src/stretch/llms/molmo_client.py index c937e898e..24935e380 100644 --- a/src/stretch/llms/molmo_client.py +++ b/src/stretch/llms/molmo_client.py @@ -18,10 +18,16 @@ class MolmoClient(AbstractLLMClient): def __init__( self, - prompt: Union[str, AbstractPromptBuilder], + prompt: Optional[Union[str, AbstractPromptBuilder]] = None, prompt_kwargs: Optional[Dict[str, Any]] = None, model: Optional[str] = None, ): + """ + Args: + prompt (Optional[Union[str, AbstractPromptBuilder]], optional): The prompt to use for the model. Defaults to None. + prompt_kwargs (Optional[Dict[str, Any]], optional): The keyword arguments for the prompt. Defaults to None. + model (Optional[str], optional): The model to use. Defaults to None. + """ if model is None: model = "allenai/MolmoE-1B-0924" From 81c92230355f8e51def8d07f7592fa31c1774a48 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 27 Sep 2024 15:18:51 -0400 Subject: [PATCH 252/410] update base --- src/stretch/llms/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/stretch/llms/base.py b/src/stretch/llms/base.py index 9d77faf91..ed9c720d4 100644 --- a/src/stretch/llms/base.py +++ b/src/stretch/llms/base.py @@ -47,11 +47,13 @@ def __init__( prompt: Union[str, AbstractPromptBuilder], prompt_kwargs: Optional[Dict[str, Any]] = None, ): - self.prompt_kwargs = prompt + self.prompt_kwargs = prompt_kwargs self.reset() # If the prompt is a string, use it as the prompt. Otherwise, generate the prompt string. 
- if isinstance(prompt, str): + if prompt is None: + self._prompt = "" + elif isinstance(prompt, str): self._prompt = prompt else: if prompt_kwargs is None: From a1de434bd861fdbb84bc5b8ae46efb52d5a203fe Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 27 Sep 2024 15:42:02 -0400 Subject: [PATCH 253/410] Update one for base --- src/stretch/llms/base.py | 6 ++---- src/stretch/llms/molmo_client.py | 12 ++++++++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/stretch/llms/base.py b/src/stretch/llms/base.py index ed9c720d4..a147eff16 100644 --- a/src/stretch/llms/base.py +++ b/src/stretch/llms/base.py @@ -10,7 +10,7 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple, Union -import pil +from PIL import Image class AbstractPromptBuilder(ABC): @@ -99,9 +99,7 @@ def get_history_as_str(self) -> str: return history_str @abstractmethod - def __call__( - self, command: str, image: Optional[pil.Image.Image] = None, verbose: bool = False - ): + def __call__(self, command: str, image: Optional[Image.Image] = None, verbose: bool = False): """Interact with the language model to generate a plan.""" def parse(self, content: str) -> List[Tuple[str, str]]: diff --git a/src/stretch/llms/molmo_client.py b/src/stretch/llms/molmo_client.py index 24935e380..fba9e5a3e 100644 --- a/src/stretch/llms/molmo_client.py +++ b/src/stretch/llms/molmo_client.py @@ -34,12 +34,20 @@ def __init__( super().__init__(prompt, prompt_kwargs) # load the processor self.processor = AutoProcessor.from_pretrained( - model, trust_remote_code=True, torch_dtype="auto", device_map="auto" + model, + trust_remote_code=True, + torch_dtype="auto", + device_map="auto", + offload_folder="./data", ) # load the model self.model = AutoModelForCausalLM.from_pretrained( - model, trust_remote_code=True, torch_dtype="auto", device_map="auto" + model, + trust_remote_code=True, + torch_dtype="auto", + device_map="auto", + offload_folder="./data", ) def __call__(self, command: str, image: Optional[Image.Image] = None, verbose: bool = False): From 45fe321588b80f81f0c6cab4ed343f34cecf2d33 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 27 Sep 2024 15:46:19 -0400 Subject: [PATCH 254/410] Update molmo client --- src/stretch/llms/molmo_client.py | 6 ++++-- src/stretch/utils/config.py | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/stretch/llms/molmo_client.py b/src/stretch/llms/molmo_client.py index fba9e5a3e..a703ac604 100644 --- a/src/stretch/llms/molmo_client.py +++ b/src/stretch/llms/molmo_client.py @@ -13,6 +13,7 @@ from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig from stretch.llms.base import AbstractLLMClient, AbstractPromptBuilder +from stretch.utils.config import get_offload_path class MolmoClient(AbstractLLMClient): @@ -32,13 +33,14 @@ def __init__( if model is None: model = "allenai/MolmoE-1B-0924" super().__init__(prompt, prompt_kwargs) + save_dir = get_offload_path("molmo") # load the processor self.processor = AutoProcessor.from_pretrained( model, trust_remote_code=True, torch_dtype="auto", device_map="auto", - offload_folder="./data", + offload_folder=save_dir, ) # load the model @@ -47,7 +49,7 @@ def __init__( trust_remote_code=True, torch_dtype="auto", device_map="auto", - offload_folder="./data", + offload_folder=save_dir, ) def __call__(self, command: str, image: Optional[Image.Image] = None, verbose: bool = False): diff --git a/src/stretch/utils/config.py b/src/stretch/utils/config.py index 
7c766d3e2..7ece07379 100644 --- a/src/stretch/utils/config.py +++ b/src/stretch/utils/config.py @@ -35,6 +35,14 @@ def get_data_path(ext: str) -> str: return os.path.join(DATA_ROOT, ext) +def get_offload_path(ext: str) -> str: + """Returns full path to a particular file in the offload directory""" + folder = os.path.join(DATA_ROOT, "offload", ext) + if not os.path.exists(folder): + os.makedirs(folder) + return folder + + def get_scene_path(ext: str) -> str: """Returns full path to a particular file in the scene directory""" return os.path.join(DATA_ROOT, "scenes", ext) From 371059008ddb56001f42243450076ce7b0208d50 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 27 Sep 2024 16:18:30 -0400 Subject: [PATCH 255/410] Update --- src/stretch/dynav/run_ok_nav.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index c1ec0c10d..9315b1ba1 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -104,7 +104,9 @@ def main( # img_thread.start() while True: - mode = input("select mode? E/N/S") + print("Select mode: E for exploration, N for open-vocabulary navigation, S for save.") + mode = input("select mode? E/N/S: ") + mode = mode.upper() if mode == "S": demo.image_processor.write_to_pickle() break From 147d552f66b353afd83558d798241d8f8b97b2c3 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 27 Sep 2024 16:19:10 -0400 Subject: [PATCH 256/410] update --- mypy.ini | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 mypy.ini diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..3dc875f9d --- /dev/null +++ b/mypy.ini @@ -0,0 +1,2 @@ +[mypy] +plugins = numpy.typing.mypy_plugin From 71c94c8cee93cbdb9d6519b6c36dfcb9326ccc4a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 30 Sep 2024 15:25:12 -0400 Subject: [PATCH 257/410] various fixes --- src/stretch/simulation/mujoco_server.py | 42 ++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index 2e58d2234..be3c4ab10 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -14,6 +14,7 @@ from typing import Any, Dict, Optional import click +import mujoco import numpy as np from overrides import override from stretch_mujoco import StretchMujocoSimulator @@ -76,14 +77,49 @@ class MujocoZmqServer(BaseZmqServer): - Stretch_mujoco installation: https://github.com/hello-robot/stretch_mujoco/ """ + # Do we use a navigation controller command to move the robot back at the start of a Robocasa task? + _move_back_at_start: bool = False + hz = CONTROL_HZ # How long should the controller report done before we're actually confident that we're done? 
done_t = 0.1 + robocasa_start_offset = np.array([0.5, 0, -np.pi / 2]) + # Print debug messages for control loop debug_control_loop = False debug_set_goal_pose = True + def set_robot_position(self, xyt: np.ndarray, relative: bool = False) -> None: + """Set the robot position in the simulation.""" + + # Get mjdata and mjmodel from simulator + mjdata = self.robot_sim.mjdata + mjmodel = self.robot_sim.mjmodel + + # Compute absolute goal + if relative: + xyt_base = self.get_base_pose() + xyt_goal = xyt_base_to_global(xyt, xyt_base) + else: + xyt_goal = xyt + + # Get current position from mjdata + # TODO: remove debug print + # body_names = [mjmodel.body(i).name for i in range(mjmodel.nbody)] + # print(body_names) + base_pos = mjdata.body("base_link").xpos + mjdata.body["base_link"].body_xpos[:2] = xyt_goal[:2] + + # Force the simulator to update + mujoco.mj_forward(mjmodel, mjdata) + + # Convert the theta into a quaternion + + def reset(self): + """Reset the robot to the initial state.""" + self.robot_sim.reset_state() + def __init__( self, *args, @@ -379,7 +415,11 @@ def start(self, show_viewer_ui: bool = False, robocasa: bool = False): # When you start, move the agent back a bit # This is a hack! time.sleep(1.0) - self.set_goal_pose(np.array([-0.5, 0, 0]), relative=True) + + if self._move_back_at_start: + self.set_goal_pose(self.robocasa_start_offset, relative=True) + else: + self.set_robot_position(self.robocasa_start_offset, relative=True) while self.is_running(): self._camera_data = self.robot_sim.pull_camera_data() From 06d860f2dffc734e3b95529a8ece09156b1e5639 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 30 Sep 2024 15:34:59 -0400 Subject: [PATCH 258/410] Trying to add some utilities to mujoco server --- src/stretch/simulation/mujoco_server.py | 51 ++++++++++++++++++++----- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index be3c4ab10..b3afa72b7 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -14,7 +14,6 @@ from typing import Any, Dict, Optional import click -import mujoco import numpy as np from overrides import override from stretch_mujoco import StretchMujocoSimulator @@ -90,8 +89,35 @@ class MujocoZmqServer(BaseZmqServer): debug_control_loop = False debug_set_goal_pose = True + def get_body_xyt(self, body_name: str) -> np.ndarray: + """Get the se(2) base pose: x, y, and theta""" + + # Get mjdata and mjmodel from simulator + mjdata = self.robot_sim.mjdata + mjmodel = self.robot_sim.mjmodel + + xyz = mjdata.body(body_name).xpos + rotation = mjdata.body("base_link").xmat.reshape(3, 3) + theta = np.arctan2(rotation[1, 0], rotation[0, 0]) + return np.array([xyz[0], xyz[1], theta]) + def set_robot_position(self, xyt: np.ndarray, relative: bool = False) -> None: - """Set the robot position in the simulation.""" + """Set the robot position in the simulation + + Args: + xyt (np.ndarray): The desired pose of the robot in world coordinates (x, y, theta). + relative (bool): If True, the pose is relative to the current pose of the robot. + """ + return self.set_body_position(xyt, "base_link", relative=relative) + + def set_body_position(self, xyt: np.ndarray, body_name: str, relative: bool = False) -> None: + """Set the robot position in the simulation. + + Args: + xyt (np.ndarray): The desired pose of the robot in world coordinates (x, y, theta). + body_name (str): The name of the body to set the position of. 
+ relative (bool): If True, the pose is relative to the current pose of the robot. + """ # Get mjdata and mjmodel from simulator mjdata = self.robot_sim.mjdata @@ -99,7 +125,7 @@ def set_robot_position(self, xyt: np.ndarray, relative: bool = False) -> None: # Compute absolute goal if relative: - xyt_base = self.get_base_pose() + xyt_base = self.get_body_xyt(body_name) xyt_goal = xyt_base_to_global(xyt, xyt_base) else: xyt_goal = xyt @@ -109,12 +135,19 @@ def set_robot_position(self, xyt: np.ndarray, relative: bool = False) -> None: # body_names = [mjmodel.body(i).name for i in range(mjmodel.nbody)] # print(body_names) base_pos = mjdata.body("base_link").xpos - mjdata.body["base_link"].body_xpos[:2] = xyt_goal[:2] - - # Force the simulator to update - mujoco.mj_forward(mjmodel, mjdata) - - # Convert the theta into a quaternion + print(f"Current {body_name} position: {base_pos}") + base_pos[0] = xyt_goal[0] + base_pos[1] = xyt_goal[1] + print(f"Setting {body_name} to {base_pos}") + mjdata.body("base_link").xpos = base_pos + + # Convert the theta into a rotation matrix + rotation = mjdata.body("base_link").xmat.reshape(3, 3) + rotation[0, 0] = np.cos(xyt_goal[2]) + rotation[0, 1] = -np.sin(xyt_goal[2]) + rotation[1, 0] = np.sin(xyt_goal[2]) + rotation[1, 1] = np.cos(xyt_goal[2]) + mjdata.body("base_link").xmat = rotation.flatten() def reset(self): """Reset the robot to the initial state.""" From dabc9dbf5de3f69af3c242ec1a644f8049cea50d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 30 Sep 2024 16:04:37 -0400 Subject: [PATCH 259/410] Update robocasa gen --- src/stretch/simulation/mujoco_server.py | 25 +- src/stretch/simulation/robocasa_gen.py | 305 ++++++++++++++++++++++++ 2 files changed, 326 insertions(+), 4 deletions(-) create mode 100644 src/stretch/simulation/robocasa_gen.py diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index b3afa72b7..fc96478a9 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -19,7 +19,7 @@ from stretch_mujoco import StretchMujocoSimulator try: - from stretch_mujoco.robocasa_gen import model_generation_wizard + from stretch.simulation.robocasa_gen import model_generation_wizard except ImportError as e: from stretch.utils.logger import error @@ -101,6 +101,21 @@ def get_body_xyt(self, body_name: str) -> np.ndarray: theta = np.arctan2(rotation[1, 0], rotation[0, 0]) return np.array([xyz[0], xyz[1], theta]) + def list_all_bodies(self): + """Debug function to list all bodies in the simulation.""" + model = self.robot_sim.mjmodel + data = self.robot_sim.mjdata + + # Loop over all bodies + for i in range(model.nbody): + body_name = model.body(i).name + body_pos = data.body(i).xpos + body_quat = data.body(i).xquat + + print(f"Body {i}: {body_name}") + print(f" Position: {body_pos}") + print(f" Orientation (quaternion): {body_quat}") + def set_robot_position(self, xyt: np.ndarray, relative: bool = False) -> None: """Set the robot position in the simulation @@ -160,6 +175,7 @@ def __init__( scene_model: Optional[str] = None, simulation_rate: int = 200, config_name: str = "noplan_velocity_sim", + objects_info: Optional[Dict[str, Any]] = None, **kwargs, ): super(MujocoZmqServer, self).__init__(*args, **kwargs) @@ -176,6 +192,7 @@ def __init__( else: self.robot_sim = StretchMujocoSimulator(scene_path) self.simulation_rate = simulation_rate + self.objects_info = objects_info # Hard coded printout rates self.report_steps = 1000 @@ -451,8 +468,6 @@ def start(self, 
show_viewer_ui: bool = False, robocasa: bool = False): if self._move_back_at_start: self.set_goal_pose(self.robocasa_start_offset, relative=True) - else: - self.set_robot_position(self.robocasa_start_offset, relative=True) while self.is_running(): self._camera_data = self.robot_sim.pull_camera_data() @@ -677,12 +692,13 @@ def main( ): scene_model = None + objects_info = None if use_robocasa: scene_model, scene_xml, objects_info = model_generation_wizard( task=robocasa_task, style=robocasa_style, layout=robocasa_layout, - write_to_file=robocasa_write_to_xml, + write_to_file=robocasa_write_to_xml, # type: ignore ) server = MujocoZmqServer( @@ -697,6 +713,7 @@ def main( depth_scaling, scene_path=scene_path, scene_model=scene_model, + objects_info=objects_info, ) try: server.start(show_viewer_ui=show_viewer_ui, robocasa=use_robocasa) diff --git a/src/stretch/simulation/robocasa_gen.py b/src/stretch/simulation/robocasa_gen.py new file mode 100644 index 000000000..fce997ea5 --- /dev/null +++ b/src/stretch/simulation/robocasa_gen.py @@ -0,0 +1,305 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +from collections import OrderedDict +from typing import Tuple + +import click +import mujoco +import mujoco.viewer +import numpy as np +import robosuite +from robocasa.models.scenes.scene_registry import StyleType +from robosuite import load_controller_config +from stretch_mujoco import StretchMujocoSimulator +from stretch_mujoco.utils import ( + get_absolute_path_stretch_xml, + insert_line_after_mujoco_tag, + replace_xml_tag_value, + xml_modify_body_pos, + xml_remove_subelement, + xml_remove_tag_by_name, +) +from termcolor import colored + +from stretch.utils.geometry import xyt_base_to_global + + +def get_styles() -> OrderedDict: + raw_styles = dict(map(lambda item: (item.value, item.name.lower().capitalize()), StyleType)) + styles = OrderedDict() + for k in sorted(raw_styles.keys()): + if k < 0: + continue + styles[k] = raw_styles[k] + return styles + + +""" +Modified version of robocasa's kitchen scene generation script +https://github.com/robocasa/robocasa/blob/main/robocasa/demos/demo_kitchen_scenes.py +""" + + +def choose_option(options, option_name, show_keys=False, default=None, default_message=None): + """ + Prints out environment options, and returns the selected env_name choice + + Returns: + str: Chosen environment name + """ + # get the list of all tasks + + if default is None: + default = options[0] + + if default_message is None: + default_message = default + + # Select environment to run + print("{}s:".format(option_name.capitalize())) + + for i, (k, v) in enumerate(options.items()): + if show_keys: + print("[{}] {}: {}".format(i, k, v)) + else: + print("[{}] {}".format(i, v)) + print() + try: + s = input( + "Choose an option 0 to {}, or any other key for default ({}): ".format( + len(options) - 1, + default_message, + ) + ) + # parse input into a number within range + k = min(max(int(s), 0), len(options) - 1) + choice = list(options.keys())[k] + except Exception: + if default is None: + choice = options[0] + else: + choice = default + print("Use {} by default.\n".format(choice)) + + # Return the chosen environment name + return choice + + +def model_generation_wizard( + task: str = "PnPCounterToCab", + 
layout: int = None, + style: int = None, + write_to_file: str = None, + robot_spawn_pose: dict = None, +) -> Tuple[mujoco.MjModel, str, dict]: + """ + Wizard/API to generate a kitchen model for a given task, layout, and style. + If layout and style are not provided, it will take you through a wizard to choose them in the terminal. + If robot_spawn_pose is not provided, it will spawn the robot to the default pose from robocasa fixtures. + You can also write the generated xml model with absolutepaths to a file. + The Object placements are made based on the robocasa defined Kitchen task and uses the default randomized + placement distribution + Args: + task (str): task name + layout (int): layout id + style (int): style id + write_to_file (str): write to file + robot_spawn_pose (dict): robot spawn pose {pose: [x, y, z], quat: [x, y, z, w]} + Returns: + Tuple[mujoco.MjModel, str, Dict]: model, xml string and Object placements info + """ + layouts = OrderedDict( + [ + (0, "One wall"), + (1, "One wall w/ island"), + (2, "L-shaped"), + (3, "L-shaped w/ island"), + (4, "Galley"), + (5, "U-shaped"), + (6, "U-shaped w/ island"), + (7, "G-shaped"), + (8, "G-shaped (large)"), + (9, "Wraparound"), + ] + ) + + styles = get_styles() + if layout is None: + layout = choose_option( + layouts, "kitchen layout", default=-1, default_message="random layouts" + ) + else: + layout = layout + + if style is None: + style = choose_option(styles, "kitchen style", default=-1, default_message="random styles") + else: + style = style + + if layout == -1: + layout = np.random.choice(range(10)) + print(colored(f"Randomly choosing layout... id: {layout}", "yellow")) + if style == -1: + style = np.random.choice(range(11)) + print(colored(f"Randomly choosing style... id: {style}", "yellow")) + + # Create argument configuration + # TODO: Figure how to get an env without robot arg + config = { + "env_name": task, + "robots": "PandaMobile", + "controller_configs": load_controller_config(default_controller="OSC_POSE"), + "translucent_robot": False, + "layout_and_style_ids": [[layout, style]], + } + + print(colored("Initializing environment...", "yellow")) + + env = robosuite.make( + **config, + has_offscreen_renderer=False, + render_camera=None, + ignore_done=True, + use_camera_obs=False, + control_freq=20, + ) + print( + colored( + f"Showing configuration:\n Layout: {layouts[layout]}\n Style: {styles[style]}", + "green", + ) + ) + print() + print( + colored( + "Spawning environment...\n", + "yellow", + ) + ) + model = env.sim.model._model + xml = env.sim.model.get_xml() + + # Add the object placements to the xml + click.secho(f"\nMaking Object Placements for task [{task}]...\n", fg="yellow") + object_placements_info = {} + for i in range(len(env.object_cfgs)): + obj_name = env.object_cfgs[i]["name"] + category = env.object_cfgs[i]["info"]["cat"] + object_placements = env.object_placements + print( + f"Placing [Object {i}] (category: {category}, body_name: {obj_name}_main) at " + f"pos: {np.round(object_placements[obj_name][0],2)} quat: {np.round(object_placements[obj_name][1],2)}" + ) + xml = xml_modify_body_pos( + xml, + "body", + obj_name + "_main", # Object name ref in the xml + pos=object_placements[obj_name][0], + quat=object_placements[obj_name][1], + ) + object_placements_info[obj_name + "_main"] = { + "cat": category, + "pos": object_placements[obj_name][0], + "quat": object_placements[obj_name][1], + } + + xml, robot_base_fixture_pose = custom_cleanups(xml) + + print("Placing stretch at", robot_base_fixture_pose) + # 
Quaternion to theta + pos = [float(p) for p in robot_base_fixture_pose["pos"].split(" ")] + quat = [float(q) for q in robot_base_fixture_pose["quat"].split(" ")] + theta = 2 * np.arccos(quat[3]) + xyt_base = [pos[0], pos[1], theta] + xyt_new = xyt_base_to_global([-0.5, 0, -np.pi / 2], xyt_base) + + # Convert back to quaternion + quat_new = [0, 0, np.sin(xyt_new[2] / 2), np.cos(xyt_new[2] / 2)] + pos_new = [xyt_new[0], xyt_new[1], pos[2]] + quat_new_str = " ".join([str(q) for q in quat_new]) + pos_new_str = " ".join([str(p) for p in pos_new]) + robot_base_fixture_pose = {"pos": pos_new_str, "quat": quat_new_str} + print("Placing stretch at", robot_base_fixture_pose) + + if robot_spawn_pose is not None: + robot_base_fixture_pose = robot_spawn_pose + + # add stretch to kitchen + click.secho("\nMaking Robot Placement...\n", fg="yellow") + xml = add_stretch_to_kitchen(xml, robot_base_fixture_pose) + model = mujoco.MjModel.from_xml_string(xml) + + if write_to_file is not None: + with open(write_to_file, "w") as f: + f.write(xml) + print(colored(f"Model saved to {write_to_file}", "green")) + + return model, xml, object_placements_info + + +def custom_cleanups(xml: str) -> Tuple[str, dict]: + """ + Custom cleanups to models from robocasa envs to support + use with stretch_mujoco package. + """ + + # make invisible the red/blue boxes around geom/sites of interests found + xml = replace_xml_tag_value(xml, "geom", "rgba", "0.5 0 0 0.5", "0.5 0 0 0") + xml = replace_xml_tag_value(xml, "geom", "rgba", "0.5 0 0 1", "0.5 0 0 0") + xml = replace_xml_tag_value(xml, "site", "rgba", "0.5 0 0 1", "0.5 0 0 0") + xml = replace_xml_tag_value(xml, "site", "actuator", "0.3 0.4 1 0.5", "0.3 0.4 1 0") + # remove subelements + xml = xml_remove_subelement(xml, "actuator") + xml = xml_remove_subelement(xml, "sensor") + + # remove option tag element + xml = xml_remove_subelement(xml, "option") + # xml = xml_remove_subelement(xml, "size") + + # remove robot + xml, remove_robot_attrib = xml_remove_tag_by_name(xml, "body", "robot0_base") + + return xml, remove_robot_attrib + + +def add_stretch_to_kitchen(xml: str, robot_pose_attrib: dict) -> str: + """ + Add stretch robot to kitchen xml + """ + print( + f"Adding stretch to kitchen at pos: {robot_pose_attrib['pos']} quat: {robot_pose_attrib['quat']}" + ) + stretch_xml_absolute = get_absolute_path_stretch_xml(robot_pose_attrib) + # add Stretch xml + xml = insert_line_after_mujoco_tag( + xml, + f' ', + ) + return xml + + +@click.command() +@click.option("--task", type=str, default="PnPCounterToCab", help="task") +@click.option("--layout", type=int, default=None, help="kitchen layout (choose number 0-9)") +@click.option("--style", type=int, default=None, help="kitchen style (choose number 0-11)") +@click.option("--write-to-file", type=str, default=None, help="write to file") +def main(task: str, layout: int, style: int, write_to_file: str): + model, xml, objects_info = model_generation_wizard( + task=task, + layout=layout, + style=style, + write_to_file=write_to_file, + ) + robot_sim = StretchMujocoSimulator(model=model) + robot_sim.start() + + +if __name__ == "__main__": + main() From 2ff9fc77a28e5e10527e079e02a1c21dad94effb Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 30 Sep 2024 16:25:46 -0400 Subject: [PATCH 260/410] Fixing geometry so the robot starts at (0,0,0) --- src/stretch/simulation/mujoco_server.py | 52 ++++++++++++++++++------- src/stretch/utils/geometry/__init__.py | 3 ++ src/stretch/utils/geometry/_base.py | 41 +++++++++++++++++++ 3 files changed, 83 
insertions(+), 13 deletions(-) diff --git a/src/stretch/simulation/mujoco_server.py b/src/stretch/simulation/mujoco_server.py index fc96478a9..612b1fed2 100644 --- a/src/stretch/simulation/mujoco_server.py +++ b/src/stretch/simulation/mujoco_server.py @@ -34,7 +34,7 @@ from stretch.motion import HelloStretchIdx from stretch.motion.control.goto_controller import GotoVelocityController from stretch.utils.config import get_control_config -from stretch.utils.geometry import xyt_base_to_global +from stretch.utils.geometry import pose_global_to_base, xyt_base_to_global, xyt_global_to_base from stretch.utils.image import scale_camera_matrix # Maps HelloStretchIdx to actuators @@ -201,6 +201,7 @@ def __init__( self._camera_data = None self._status = None + self._initial_xyt = None # Controller stuff # Is the velocity controller active? @@ -285,7 +286,11 @@ def control_loop_callback(self): if self.debug_control_loop: print("Control loop callback: ", self.active, self.xyt_goal, vel_odom) - self.controller.update_pose_feedback(self.get_base_pose()) + base_xyt = self.get_base_pose() + if base_xyt is None: + return + + self.controller.update_pose_feedback(base_xyt) if self.active and self.xyt_goal is not None: # Compute control @@ -376,19 +381,31 @@ def get_joint_state(self): def get_base_pose(self) -> np.ndarray: """Base pose is the SE(2) pose of the base in world coords (x, y, theta)""" - return self.robot_sim.get_base_pose() + if self._initial_xyt is None: + return None + xyt = self.robot_sim.get_base_pose() + return xyt_global_to_base(xyt, self._initial_xyt) def get_ee_pose(self) -> np.ndarray: """EE pose is the 4x4 matrix of the end effector location in world coords""" - return self.robot_sim.get_ee_pose() + if self._initial_xyt is None: + return None + pose = self.robot_sim.get_ee_pose() + return pose_global_to_base(pose, self._initial_xyt) def get_head_camera_pose(self) -> np.ndarray: """Get the camera pose in world coords""" - return self.robot_sim.get_link_pose("camera_color_optical_frame") + if self._initial_xyt is None: + return None + pose = self.robot_sim.get_link_pose("camera_color_optical_frame") + return pose_global_to_base(pose, self._initial_xyt) def get_ee_camera_pose(self) -> np.ndarray: """Get the end effector camera pose in world coords""" - return self.robot_sim.get_link_pose("gripper_camera_color_optical_frame") + if self._initial_xyt is None: + return None + pose = self.robot_sim.get_link_pose("gripper_camera_color_optical_frame") + return pose_global_to_base(pose, self._initial_xyt) def set_posture(self, posture: str) -> bool: """Set the posture of the robot.""" @@ -451,9 +468,11 @@ def get_control_mode(self) -> str: return self.control_mode @override - def start(self, show_viewer_ui: bool = False, robocasa: bool = False): + def start( + self, show_viewer_ui: bool = False, robocasa: bool = False, headless: bool = False + ) -> None: self.robot_sim.start( - show_viewer_ui + show_viewer_ui=show_viewer_ui, headless=headless ) # This will start the simulation and open Mujoco-Viewer window super().start() @@ -461,12 +480,13 @@ def start(self, show_viewer_ui: bool = False, robocasa: bool = False): self._control_thread = threading.Thread(target=self._control_loop_thread) self._control_thread.start() - if robocasa: - # When you start, move the agent back a bit - # This is a hack! - time.sleep(1.0) + self._initial_xyt = self.robot_sim.get_base_pose() + if robocasa: if self._move_back_at_start: + # When you start, move the agent back a bit + # This is a hack! 
+ time.sleep(1.0) self.set_goal_pose(self.robocasa_start_offset, relative=True) while self.is_running(): @@ -666,6 +686,7 @@ def is_running(self) -> bool: @click.option("--robocasa-style", type=int, default=1, help="Robocasa style to generate") @click.option("--robocasa-layout", type=int, default=1, help="Robocasa layout to generate") @click.option("--show-viewer-ui", default=False, help="Show the Mujoco viewer UI", is_flag=True) +@click.option("--headless", default=False, help="Run the simulation headless", is_flag=True) @click.option( "--robocasa-write-to-xml", default=False, @@ -689,6 +710,7 @@ def main( robocasa_layout: int, robocasa_write_to_xml: bool, show_viewer_ui: bool, + headless: bool = False, ): scene_model = None @@ -716,7 +738,11 @@ def main( objects_info=objects_info, ) try: - server.start(show_viewer_ui=show_viewer_ui, robocasa=use_robocasa) + server.start( + show_viewer_ui=show_viewer_ui, + robocasa=use_robocasa, + headless=headless, + ) except KeyboardInterrupt: server.robot_sim.stop() diff --git a/src/stretch/utils/geometry/__init__.py b/src/stretch/utils/geometry/__init__.py index 7be0fdbf8..24ccf88e6 100644 --- a/src/stretch/utils/geometry/__init__.py +++ b/src/stretch/utils/geometry/__init__.py @@ -15,8 +15,11 @@ from ._base import ( normalize_ang_error, point_global_to_base, + pose2sophus, + pose_global_to_base, pose_global_to_base_xyt, posquat2sophus, + sophus2pose, sophus2posquat, sophus2xyt, xyt2sophus, diff --git a/src/stretch/utils/geometry/_base.py b/src/stretch/utils/geometry/_base.py index 5188a3478..6bb8df0b7 100644 --- a/src/stretch/utils/geometry/_base.py +++ b/src/stretch/utils/geometry/_base.py @@ -54,6 +54,47 @@ def xyt_global_to_base(XYT, current_pose): return sophus2xyt(pose_base2target) +def pose_global_to_base(pose, current_pose): + """ + Transforms the point cloud into geocentric frame to account for + camera position + Input: + XYZ : ...x3 + current_pose : base position (x, y, theta (radians)) + Output: + XYZ : ...x3 + """ + pose_world2target = pose2sophus(pose) + pose_world2base = xyt2sophus(current_pose) + pose_base2target = pose_world2base.inverse() * pose_world2target + return sophus2pose(pose_base2target) + + +def pose2sophus(pose: np.ndarray) -> sp.SE3: + """Convert a 4x4 matrix in se(3) into a Sophus SE3 object. + + Args: + pose: np.ndarray of shape (4, 4) + + Returns: + sp.SE3 + """ + return sp.SE3(pose[:3, :3], pose[:3, 3]) + + +def sophus2pose(pose: sp.SE3) -> np.ndarray: + """Convert a Sophus SE3 object into a 4x4 matrix in se(3). + + Args: + pose: sp.SE3 + + Returns: + np.ndarray of shape (4, 4) + """ + pose_mat = pose.matrix() + return pose_mat + + def pose_global_to_base_xyt(pose_world2target: Pose, pose_world2base: Pose) -> np.ndarray: """Transform a pose from global to base frame, getting the relative xyt. 
Input: From 628431f32fefa322e734673d15dc9af55010c82c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 30 Sep 2024 16:50:18 -0400 Subject: [PATCH 261/410] Update --- src/stretch/agent/robot_agent.py | 1 + src/stretch/app/map_saver.py | 116 ----------------------------- src/stretch/visualization/rerun.py | 2 +- 3 files changed, 2 insertions(+), 117 deletions(-) delete mode 100644 src/stretch/app/map_saver.py diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 9c7e2724b..cc8e2c847 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -367,6 +367,7 @@ def rotate_in_place( print(f"==== ROTATE IN PLACE at {x}, {y} ====") while i < steps: t0 = timeit.default_timer() + print(i, x, y, theta) self.robot.navigate_to( [x, y, theta + (i * step_size)], relative=False, diff --git a/src/stretch/app/map_saver.py b/src/stretch/app/map_saver.py deleted file mode 100644 index d515039b4..000000000 --- a/src/stretch/app/map_saver.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) Hello Robot, Inc. -# All rights reserved. -# -# This source code is licensed under the license found in the LICENSE file in the root directory -# of this source tree. -# -# Some code may be adapted from other open-source works with their respective licenses. Original -# license information maybe found below, if so. - -# (c) 2024 Hello Robot by Atharva Pusalkar -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -import datetime - -import click - -# Mapping and perception -from stretch.agent.robot_agent import RobotAgent -from stretch.agent.zmq_client import HomeRobotZmqClient -from stretch.core import get_parameters -from stretch.llms.openai_client import OpenaiClient -from stretch.llms.prompts import ObjectManipNavPromptBuilder -from stretch.perception import create_semantic_sensor - - -@click.command() -@click.option("--local", is_flag=True, help="Run code locally on the robot.") -@click.option("--recv_port", default=4401, help="Port to receive observations on") -@click.option("--send_port", default=4402, help="Port to send actions to on the robot") -@click.option("--robot_ip", default="") -@click.option("--output-filename", default="stretch_output", type=str) -@click.option("--explore-iter", default=0) -@click.option("--spin", default=False, is_flag=True) -@click.option("--reset", is_flag=True) -@click.option( - "--input_file", default="", type=str, help="Path to input file used instead of robot data" -) -@click.option( - "--write-instance-images", - default=False, - is_flag=True, - help="write out images of every object we found", -) -@click.option("--parameter-file", default="default_planner.yaml") -@click.option("--reset", is_flag=True, help="Reset the robot to origin before starting") -@click.option("--frame", default=-1, help="Final frame to read from input file") -@click.option("--text", default="", help="Text to encode") -@click.option("-y", "--yes", is_flag=True, help="Skip confirmation") -@click.option( - "--all-matches", - is_flag=True, - help="Find all objects with a similarity to the query above some threshold", -) -# This threshold seems to work ok for Siglip - will not work for e.g. 
CLIP -@click.option("--threshold", default=0.05, help="Threshold for similarity when using --all-matches") -@click.option( - "--stationary", - is_flag=True, - help="Don't move the robot to the instance, if using real robot instead of offline data", -) -def main( - device_id: int = 0, - verbose: bool = True, - parameter_file: str = "config/default_planner.yaml", - local: bool = True, - recv_port: int = 4401, - send_port: int = 4402, - robot_ip: str = "", - reset: bool = False, - explore_iter: int = 0, - output_filename: str = "stretch_output", - spin: bool = False, - write_instance_images: bool = False, - input_file: str = "", - frame: int = -1, - text: str = "", - yes: bool = False, - stationary: bool = False, - all_matches: bool = False, - threshold: float = 0.5, -): - - print("- Load parameters") - parameters = get_parameters(parameter_file) - semantic_sensor = create_semantic_sensor( - device_id=device_id, - verbose=verbose, - category_map_file=parameters["open_vocab_category_map_file"], - ) - - real_robot = True - current_datetime = datetime.datetime.now() - formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S") - output_pkl_filename = output_filename + "_" + formatted_datetime + ".pkl" - - robot = HomeRobotZmqClient( - robot_ip=robot_ip, - recv_port=recv_port, - send_port=send_port, - use_remote_computer=(not local), - parameters=parameters, - ) - robot.move_to_nav_posture() - agent = RobotAgent(robot, parameters, semantic_sensor) - # agent.voxel_map.read_from_pickle(input_file) - - prompt = ObjectManipNavPromptBuilder() - client = OpenaiClient(prompt) - - print("Saving map") - robot.save_map("demo_map") - - -if __name__ == "__main__": - main() diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 7a3dea6af..29d4cc501 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -182,7 +182,7 @@ def __init__( self, display_robot_mesh: bool = True, open_browser: bool = True, - server_memory_limit: str = "4GB", + server_memory_limit: str = "1GB", collapse_panels: bool = True, show_cameras_in_3d_view: bool = False, show_camera_point_clouds: bool = True, From 43336edc60b5ff420b95855d3a86de6d036cd4c9 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 30 Sep 2024 16:55:02 -0400 Subject: [PATCH 262/410] updates to codebase --- src/stretch/agent/robot_agent.py | 1 - src/stretch/utils/data_tools/read_dobbe_format.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index cc8e2c847..9c7e2724b 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -367,7 +367,6 @@ def rotate_in_place( print(f"==== ROTATE IN PLACE at {x}, {y} ====") while i < steps: t0 = timeit.default_timer() - print(i, x, y, theta) self.robot.navigate_to( [x, y, theta + (i * step_size)], relative=False, diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py index ad8655930..6de977c75 100644 --- a/src/stretch/utils/data_tools/read_dobbe_format.py +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -38,11 +38,11 @@ import numpy as np import torch import tqdm -from datasets import Dataset, Features, Image, Sequence, Value import stretch.utils.logger as logger try: + from lerobot.common.datasets import Dataset, Features, Image, Sequence, Value from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes from lerobot.common.datasets.utils import 
hf_transform_to_torch from lerobot.common.datasets.video_utils import VideoFrame From 2aea00716d0a7244d7131d83ab71a132f2f39efe Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 30 Sep 2024 17:01:08 -0400 Subject: [PATCH 263/410] update here --- .../utils/data_tools/read_dobbe_format.py | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py index 6de977c75..26ce892fb 100644 --- a/src/stretch/utils/data_tools/read_dobbe_format.py +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -32,7 +32,7 @@ import json import shutil from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import liblzfse import numpy as np @@ -43,7 +43,6 @@ try: from lerobot.common.datasets import Dataset, Features, Image, Sequence, Value - from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes from lerobot.common.datasets.utils import hf_transform_to_torch from lerobot.common.datasets.video_utils import VideoFrame @@ -110,6 +109,35 @@ # "joint_wrist_yaw", # "stretch_gripper", # ] + + +def concatenate_episodes(ep_dicts: list[Dict[str, Any]]) -> Dict[str, Any]: + """Concatenate episodes into a single dictionary. + + Args: + ep_dicts (List[Dict[str, Any]]): List of episode dictionaries. + + Returns: + Dict[str, Any]: Concatenated episode dictionary. + """ + data_dict: Dict[str, torch.Tensor | List[torch.Tensor]] = {} + + keys = ep_dicts[0].keys() + for key in keys: + if torch.is_tensor(ep_dicts[0][key][0]): + data_dict[key] = torch.cat([ep_dict[key] for ep_dict in ep_dicts]) + else: + if key not in data_dict: + data_dict[key] = [] + for ep_dict in ep_dicts: + for x in ep_dict[key]: + data_dict[key].append(x) # type: ignore + + total_frames = data_dict["frame_index"].shape[0] # type: ignore + data_dict["index"] = torch.arange(0, total_frames, 1) + return data_dict + + def check_format(raw_dir): print("Image sizes set as: ", IMAGE_SIZE) @@ -308,7 +336,7 @@ def load_from_raw( return data_dict, episode_data_index, info -def to_hf_dataset(data_dict, video=False) -> Dataset: +def to_hf_dataset(data_dict, video=False) -> "Dataset": features = {} keys = [key for key in data_dict if "observation.images." in key] From b4a4d3102b71edb55a399d824ba7de6cb33fc67e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 30 Sep 2024 17:01:40 -0400 Subject: [PATCH 264/410] Update dobbe format code --- src/stretch/utils/data_tools/read_dobbe_format.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py index 26ce892fb..13cc55047 100644 --- a/src/stretch/utils/data_tools/read_dobbe_format.py +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -138,7 +138,12 @@ def concatenate_episodes(ep_dicts: list[Dict[str, Any]]) -> Dict[str, Any]: return data_dict -def check_format(raw_dir): +def check_format(raw_dir: Path) -> None: + """Check the format of the raw directory. + + Args: + raw_dir (Path): Path to raw directory. 
+ """ print("Image sizes set as: ", IMAGE_SIZE) From 949672ea62ec596ca37cee2b95880bf80f3b5242 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 30 Sep 2024 17:03:14 -0400 Subject: [PATCH 265/410] update format errors --- src/stretch/utils/data_tools/read_dobbe_format.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py index 13cc55047..ec5524285 100644 --- a/src/stretch/utils/data_tools/read_dobbe_format.py +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -48,7 +48,6 @@ lerobot_found = True except ImportError: - logger.warning("lerobot not found. cannot export.") lerobot_found = False from PIL import Image as PILImage @@ -229,6 +228,7 @@ def load_from_raw( ep_metadata = [] episode_data_index: Dict[str, Any] = {"from": [], "to": []} + # Go through all the episodes id_from = 0 for ep_idx, ep_path in tqdm.tqdm(enumerate(episode_dirs), total=len(episode_dirs)): @@ -344,6 +344,9 @@ def load_from_raw( def to_hf_dataset(data_dict, video=False) -> "Dataset": features = {} + if not lerobot_found: + raise ImportError("lerobot not found. Cannot export.") + keys = [key for key in data_dict if "observation.images." in key] for key in keys: if video: @@ -387,6 +390,9 @@ def from_raw_to_lerobot_format( debug (bool, optional): Debug mode. Defaults to False. """ + if not lerobot_found: + raise ImportError("lerobot not found. Cannot export.") + if isinstance(raw_dir, str): raw_dir = Path(raw_dir) From 833b22e5e8d804249df8d586d34c9ee705e9aeaf Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 30 Sep 2024 17:11:07 -0400 Subject: [PATCH 266/410] update format --- src/stretch/utils/data_tools/read_dobbe_format.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/stretch/utils/data_tools/read_dobbe_format.py b/src/stretch/utils/data_tools/read_dobbe_format.py index ec5524285..59ccaa584 100644 --- a/src/stretch/utils/data_tools/read_dobbe_format.py +++ b/src/stretch/utils/data_tools/read_dobbe_format.py @@ -215,7 +215,7 @@ def load_from_raw( # Set default fps if fps is None: - logger.warning("FPS not set. Defaulting to 15.") + logger.warning("[DATASET] FPS not set. 
Defaulting to 15.") fps = 15 if max_episodes is not None and (max_episodes < 0 or max_episodes > len(episode_dirs)): @@ -284,8 +284,7 @@ def load_from_raw( images = [] for file in rgb_png: - with PILImage.open(file) as f: - images.append(f) + images.append(PILImage.open(file)) ep_dict[img_key] = images @@ -435,8 +434,6 @@ def from_raw_to_lerobot_format( pil_head_image = data_dir["observation.images.head"][0] head_image = np.array(pil_head_image) - breakpoint() - import matplotlib.pyplot as plt plt.subplot(1, 2, 1) @@ -447,4 +444,5 @@ def from_raw_to_lerobot_format( plt.imshow(head_image) plt.title("Head Image") plt.axis("off") + plt.suptitle("Images") plt.show() From 373b5b5e5faa6b01ef794c775e530bf52bc25946 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 10:12:48 -0400 Subject: [PATCH 267/410] update --- docs/simulation.md | 27 +++++++++++++ scripts/install_robocasa.sh | 77 +++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 docs/simulation.md create mode 100755 scripts/install_robocasa.sh diff --git a/docs/simulation.md b/docs/simulation.md new file mode 100644 index 000000000..283c19fea --- /dev/null +++ b/docs/simulation.md @@ -0,0 +1,27 @@ +# Robocasa Installation + +You can install Robocasa by following the instructions below, or you can try the [install script](scripts/install_robocasa.sh). + +## Install Robosuite + +```bash +git clone https://github.com/ARISE-Initiative/robosuite -b robocasa_v0.1 +cd robosuite +pip install -e . +``` + +## Install Robocasa + +```bash +cd .. +git clone https://github.com/robocasa/robocasa +cd robocasa +pip install -e . +``` + +## Download assets + +```bash +python robocasa/scripts/download_kitchen_assets.py # Caution: Assets to be downloaded are around 5GB. +python robocasa/scripts/setup_macros.py # Set up system variables. +``` diff --git a/scripts/install_robocasa.sh b/scripts/install_robocasa.sh new file mode 100755 index 000000000..efbb569e5 --- /dev/null +++ b/scripts/install_robocasa.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +set -e + +# Initialize variables +yes_flag=false +conda_env="" + +# Parse command-line options +while getopts ":y" opt; do + case ${opt} in + y ) + yes_flag=true + ;; + \? ) + echo "Invalid option: $OPTARG" 1>&2 + exit 1 + ;; + esac +done + +# Check if running in a conda/mamba environment +if [[ -n $CONDA_DEFAULT_ENV ]]; then + conda_env=$CONDA_DEFAULT_ENV +elif [[ -n $MAMBA_ROOT_PREFIX ]]; then + conda_env=$(basename "$MAMBA_ROOT_PREFIX") +else + echo "Error: No conda/mamba environment detected. Please activate an environment before running this script." + exit 1 +fi + +echo "Using conda/mamba environment: $conda_env" + +# Function to ask for confirmation +confirm() { + if [ "$yes_flag" = true ]; then + return 0 + fi + read -r -p "${1:-Are you sure?} [y/N] " response + case "$response" in + [yY][eE][sS]|[yY]) + true + ;; + *) + false + ;; + esac +} + +# Install Robosuite +if confirm "Do you want to install Robosuite?"; then + git clone https://github.com/ARISE-Initiative/robosuite -b robocasa_v0.1 + cd robosuite + pip install -e . + cd .. +fi + +# Install Robocasa +if confirm "Do you want to install Robocasa?"; then + git clone https://github.com/robocasa/robocasa + cd robocasa + pip install -e . + cd .. 
+fi + +# Download assets +if confirm "Do you want to download assets (around 5GB)?"; then + python robocasa/scripts/download_kitchen_assets.py +fi + +# Setup macros +if confirm "Do you want to set up system variables?"; then + python robocasa/scripts/setup_macros.py +fi + +echo "Installation complete!" + From 1521651110bd020f7ffb5fec6f66fdb93e66168d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 10:22:21 -0400 Subject: [PATCH 268/410] installing scripts for robocasa --- docs/simulation.md | 22 ++++++++++++++++++++++ scripts/install_robocasa.sh | 35 +++++++++++++++++++++++++---------- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/docs/simulation.md b/docs/simulation.md index 283c19fea..426445783 100644 --- a/docs/simulation.md +++ b/docs/simulation.md @@ -19,9 +19,31 @@ cd robocasa pip install -e . ``` +## Install Stretch Mujoco + +```bash +cd .. +git clone git@github.com:hello-robot/stretch_mujoco.git +cd stretch_mujoco +pip install -e . +``` + ## Download assets ```bash python robocasa/scripts/download_kitchen_assets.py # Caution: Assets to be downloaded are around 5GB. python robocasa/scripts/setup_macros.py # Set up system variables. ``` + +## Run Robocasa + +In one terminal start the server: + +```bash +python -m stretch.simulation.mujoco_server --use-robocasa +``` + +In another run an app, like mapping: + +```bash +``` diff --git a/scripts/install_robocasa.sh b/scripts/install_robocasa.sh index efbb569e5..68bf2dcaa 100755 --- a/scripts/install_robocasa.sh +++ b/scripts/install_robocasa.sh @@ -47,31 +47,46 @@ confirm() { esac } -# Install Robosuite -if confirm "Do you want to install Robosuite?"; then +if confirm "Do you want to install Robosuite and Robocasa?"; then + # Install Robosuite git clone https://github.com/ARISE-Initiative/robosuite -b robocasa_v0.1 cd robosuite pip install -e . cd .. -fi -# Install Robocasa -if confirm "Do you want to install Robocasa?"; then + # Install Robocasa git clone https://github.com/robocasa/robocasa cd robocasa pip install -e . cd .. +else + echo "Skipping Robosuite and Robocasa installation" + # Quit + exit 1 fi # Download assets -if confirm "Do you want to download assets (around 5GB)?"; then - python robocasa/scripts/download_kitchen_assets.py -fi +python robocasa/scripts/download_kitchen_assets.py # Setup macros -if confirm "Do you want to set up system variables?"; then +if confirm "Do you want to set up system variables and macros for Robocasa?"; then python robocasa/scripts/setup_macros.py fi -echo "Installation complete!" +# Clone, cd, and install stretch_mujoco repository +if git clone git@github.com:hello-robot/stretch_mujoco.git; then + echo "Successfully cloned stretch_mujoco repository" + cd stretch_mujoco + if pip install -e .; then + echo "Successfully installed stretch_mujoco" + else + echo "Failed to install stretch_mujoco" + exit 1 + fi + cd .. +else + echo "Failed to clone stretch_mujoco repository" + exit 1 +fi +echo "Installation complete!" 
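For reference, the kitchen model generator introduced above can also be exercised on its own, without starting the ZMQ server. This is a minimal sketch based only on the click options defined in `robocasa_gen.py` (`--task`, `--layout`, `--style`, `--write-to-file`); the particular task, layout, and style values are illustrative, and omitting `--layout`/`--style` drops you into the interactive wizard instead.

```bash
# Generate a kitchen for the PnPCounterToCab task with an L-shaped layout (2) and
# style id 3, save the generated kitchen XML to kitchen.xml, and then preview it in
# StretchMujocoSimulator (this is what robocasa_gen's __main__ entry point does).
python -m stretch.simulation.robocasa_gen \
    --task PnPCounterToCab \
    --layout 2 \
    --style 3 \
    --write-to-file kitchen.xml
```
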
From 1b1b6306e538419bfc99f930f87396f026d8b173 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 11:34:47 -0400 Subject: [PATCH 269/410] update error --- src/stretch/agent/operations/search_for_object.py | 2 +- src/stretch/visualization/rerun.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/operations/search_for_object.py b/src/stretch/agent/operations/search_for_object.py index 311a618f7..809038139 100644 --- a/src/stretch/agent/operations/search_for_object.py +++ b/src/stretch/agent/operations/search_for_object.py @@ -148,7 +148,7 @@ def run(self) -> None: self.agent.reset_object_plans() else: self.error("Failed to find a reachable frontier.") - raise RuntimeError("Failed to find a reachable frontier.") + self.agent.go_home() else: self.cheer(f"Found a receptacle!") view = self.agent.current_receptacle.get_best_view() diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 29d4cc501..7a3dea6af 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -182,7 +182,7 @@ def __init__( self, display_robot_mesh: bool = True, open_browser: bool = True, - server_memory_limit: str = "1GB", + server_memory_limit: str = "4GB", collapse_panels: bool = True, show_cameras_in_3d_view: bool = False, show_camera_point_clouds: bool = True, From 15662a57091e7a4224bc92799f77ed7331014521 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 11:42:11 -0400 Subject: [PATCH 270/410] add some status messages --- .../agent/operations/search_for_object.py | 6 +++++ src/stretch/core/status.py | 24 +++++++++++++++++++ src/stretch/core/task.py | 16 +++++++++++++ 3 files changed, 46 insertions(+) create mode 100644 src/stretch/core/status.py diff --git a/src/stretch/agent/operations/search_for_object.py b/src/stretch/agent/operations/search_for_object.py index 809038139..8a55eaf26 100644 --- a/src/stretch/agent/operations/search_for_object.py +++ b/src/stretch/agent/operations/search_for_object.py @@ -13,6 +13,7 @@ import torch from PIL import Image +import stretch.core.status as status from stretch.agent.base import ManagedOperation from stretch.mapping.instance import Instance @@ -83,6 +84,8 @@ def run(self) -> None: # Update world map self.intro("Searching for a receptacle on the floor.") + self.set_status(status.RUNNING) + # Must move to nav before we can do anything self.robot.move_to_nav_posture() # Now update the world @@ -133,6 +136,7 @@ def run(self) -> None: # If no receptacle, pick a random point nearby and just wander around if self.agent.current_receptacle is None: + self.set_status(status.EXPLORING) print("None found. Try moving to frontier.") # Find a point on the frontier and move there res = self.agent.plan_to_frontier(start=start) @@ -148,9 +152,11 @@ def run(self) -> None: self.agent.reset_object_plans() else: self.error("Failed to find a reachable frontier.") + self.set_status(status.EXPLORATION_IMPOSSIBLE) self.agent.go_home() else: self.cheer(f"Found a receptacle!") + self.set_status(status.SUCCEEDED) view = self.agent.current_receptacle.get_best_view() image = Image.fromarray(view.get_image()) image.save("receptacle.png") diff --git a/src/stretch/core/status.py b/src/stretch/core/status.py new file mode 100644 index 000000000..745b660b9 --- /dev/null +++ b/src/stretch/core/status.py @@ -0,0 +1,24 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. 
+# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +# Description: This file contains the status of the nodes in a LLM task plan +IDLE = "idle" +RUNNING = "running" +EXPLORING = "exploring" +FAILED = "failed" +SUCCEEDED = "succeeded" +CANNOT_START = "cannot_start" +WAITING = "waiting" +EXPLORATION_IMPOSSIBLE = "exploration_impossible" + +# Some other ones +RETRYING = "retrying" +RETRYING_FAILED = "retrying_failed" +RETRYING_SUCCEEDED = "retrying_succeeded" +RETRYING_CANNOT_START = "retrying_cannot_start" diff --git a/src/stretch/core/task.py b/src/stretch/core/task.py index 45739793d..adef430cf 100644 --- a/src/stretch/core/task.py +++ b/src/stretch/core/task.py @@ -46,6 +46,22 @@ def __init__( ) self.on_failure = self + self.status = "idle" + + def __str__(self): + return self.name + + def __repr__(self): + return self.name + + def get_status(self) -> str: + """Return the status of the operation.""" + return self.status + + def set_status(self, status: str): + """Set the status of the operation.""" + self.status = status + @property def name(self) -> str: """Return the name of the operation.""" From 40b39e02a1087044ddbeb84ce6ffb0c55d3bf5f8 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 11:45:19 -0400 Subject: [PATCH 271/410] updates --- src/stretch/agent/operations/search_for_object.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/stretch/agent/operations/search_for_object.py b/src/stretch/agent/operations/search_for_object.py index 8a55eaf26..de3ef43d7 100644 --- a/src/stretch/agent/operations/search_for_object.py +++ b/src/stretch/agent/operations/search_for_object.py @@ -111,6 +111,9 @@ def run(self) -> None: self.error( "Robot is in an invalid configuration. It is probably too close to geometry, or localization has failed." ) + self.error( + "This means there was an issue with navigation. Please disable the robot and move it to a safe location." 
+ ) breakpoint() # Check to see if we have a receptacle in the map From 77ff6f9cb9b0d79f7147ac81b16a78ec070272ed Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 11:56:02 -0400 Subject: [PATCH 272/410] update constants and configurations --- src/stretch/motion/constants.py | 2 +- src/stretch/visualization/rerun.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/stretch/motion/constants.py b/src/stretch/motion/constants.py index cd057a882..507323f71 100644 --- a/src/stretch/motion/constants.py +++ b/src/stretch/motion/constants.py @@ -44,7 +44,7 @@ STRETCH_ARM_LIFT = 0.8 look_at_ee = np.array([-np.pi / 2, -np.pi / 4]) -look_front = np.array([0.0, math.radians(-30)]) +look_front = np.array([0.0, -np.pi / 4]) look_ahead = np.array([0.0, 0.0]) look_close = np.array([0.0, math.radians(-45)]) look_down = np.array([0.0, math.radians(-58)]) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 7a3dea6af..0f598df2a 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -176,7 +176,8 @@ def log_transforms(self, obs, debug: bool = False): class RerunVisualizer: - camera_point_radius = 0.01 + camera_point_radius: float = 0.01 + max_displayed_points_per_camera: int = 5000 def __init__( self, @@ -258,6 +259,11 @@ def log_head_camera(self, obs: Observations): if self.show_camera_point_clouds: head_xyz = obs.get_xyz_in_world_frame().reshape(-1, 3) head_rgb = obs.rgb.reshape(-1, 3) + if self.max_displayed_points_per_camera > 0: + idx = np.arange(head_xyz.shape[0]) + np.random.shuffle(idx) + head_xyz = head_xyz[idx[: self.max_displayed_points_per_camera]] + head_rgb = head_rgb[idx[: self.max_displayed_points_per_camera]] rr.log( "world/head_camera/points", rr.Points3D( @@ -339,6 +345,11 @@ def log_ee_camera(self, servo): if self.show_camera_point_clouds: ee_xyz = servo.get_ee_xyz_in_world_frame().reshape(-1, 3) ee_rgb = servo.ee_rgb.reshape(-1, 3) + if self.max_displayed_points_per_camera > 0: + idx = np.arange(ee_xyz.shape[0]) + np.random.shuffle(idx) + ee_xyz = ee_xyz[idx[: self.max_displayed_points_per_camera]] + ee_rgb = ee_rgb[idx[: self.max_displayed_points_per_camera]] rr.log( "world/ee_camera/points", rr.Points3D( From 16913dfa92ba537879b30649cbc478384dfb0d36 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 12:08:36 -0400 Subject: [PATCH 273/410] Update --- src/stretch/llms/prompts/pickup_prompt.py | 29 +++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index a24aa4fd0..4a9235c8f 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -18,11 +18,13 @@ - You cannot go up or down stairs When prompted, you will respond using the three actions: -- pickup(object_name) -- place(location_name) -- say(text) +- pickup(object_name) # object_name is the name of the object to pick up +- place(location_name) # location_name is the name of the receptacle to place object in +- say(text) # say something to the user +- wave() # wave at a person +- go_home() # navigate back to where you started -These are the only three things you will return, and they are your only way to interact with the world. For example: +These functions and their arguments are the only things you will return - no comments - and they are your only way to interact with the world. 
For example: input: "Put the red apple in the cardboard box" output: @@ -31,6 +33,11 @@ place(cardboard box) end() +input: "Hi!" +output: +wave() +say("Hello!") +end() You will never say anything other than pickup(), place(), and say(). Remember to be friendly, helpful, and concise. You will always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). @@ -59,6 +66,7 @@ input: "Thank you!" output: +wave() say("You're welcome!") end() @@ -104,6 +112,12 @@ def parse_response(self, response: str): parsed_commands.append(("pickup", command[7:-1])) elif command.startswith("place("): parsed_commands.append(("place", command[6:-1])) + elif command.startswith("wave()"): + parsed_commands.append(("wave", "")) + elif command.startswith("go_home()"): + parsed_commands.append(("go_home", "")) + elif command.startswith("end()"): + break return parsed_commands @@ -128,3 +142,10 @@ def get_say_this(self, response: List[Tuple[str, str]]) -> str: if command == "say": all_messages.append(args) return " ".join(all_messages) + + def get_wave(self, response: List[Tuple[str, str]]) -> bool: + """Return if the robot should wave.""" + for command, args in response: + if command == "wave": + return True + return False From 6188288631613f852ee66cb1fed1f44cb2d27812 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 12:11:45 -0400 Subject: [PATCH 274/410] Updates --- src/stretch/llms/prompts/pickup_prompt.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 4a9235c8f..ea3a50ae0 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -19,6 +19,7 @@ When prompted, you will respond using the three actions: - pickup(object_name) # object_name is the name of the object to pick up +- explore(5) # explore the environment for a certain number of steps - place(location_name) # location_name is the name of the receptacle to place object in - say(text) # say something to the user - wave() # wave at a person @@ -33,10 +34,12 @@ place(cardboard box) end() +You should be friendly. Wave if a person is being nice to you or greeting you. For example: + input: "Hi!" output: -wave() say("Hello!") +wave() end() You will never say anything other than pickup(), place(), and say(). Remember to be friendly, helpful, and concise. You will always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). @@ -66,8 +69,8 @@ input: "Thank you!" output: -wave() say("You're welcome!") +wave() end() input: "What is your name?" 
@@ -102,6 +105,15 @@ def parse_response(self, response: str): commands.append(line) elif line.startswith("say("): commands.append(line) + elif line.startswith("wave()"): + commands.append(line) + elif line.startswith("go_home()"): + commands.append(line) + elif line.startswith("explore()"): + commands.append(line) + elif line.startswith("end()"): + # Stop parsing if we see the end command + break # Now go through commands and parse into a tuple (command, args) parsed_commands = [] @@ -116,7 +128,11 @@ def parse_response(self, response: str): parsed_commands.append(("wave", "")) elif command.startswith("go_home()"): parsed_commands.append(("go_home", "")) + elif command.startswith("explore()"): + parsed_commands.append(("explore", "")) elif command.startswith("end()"): + # Stop parsing if we see the end command + # This really shouldn't happen, but just in case break return parsed_commands From b55dd7f8b092a7a9c6d170706f66cafabae348ac Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 12:15:14 -0400 Subject: [PATCH 275/410] adding some processing for commands --- src/stretch/app/ai_pickup.py | 2 +- src/stretch/llms/prompts/pickup_prompt.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 1f6c57ff6..73b9ca41e 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -161,7 +161,7 @@ def main( if say_this is not None: chat_wrapper.say(say_this) - # agent.say(say_this) + agent.robot_say(say_this) if len(target_object) == 0 or len(receptacle) == 0: # logger.error("You need to enter a target object and receptacle") diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index ea3a50ae0..a7ed21241 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -23,6 +23,9 @@ - place(location_name) # location_name is the name of the receptacle to place object in - say(text) # say something to the user - wave() # wave at a person +- nod_head() # nod your head +- shake_head() # shake your head +- avert_gaze() # avert your gaze - go_home() # navigate back to where you started These functions and their arguments are the only things you will return - no comments - and they are your only way to interact with the world. 
For example: @@ -111,6 +114,12 @@ def parse_response(self, response: str): commands.append(line) elif line.startswith("explore()"): commands.append(line) + elif line.startswith("nod_head()"): + commands.append(line) + elif line.startswith("shake_head()"): + commands.append(line) + elif line.startswith("avert_gaze()"): + commands.append(line) elif line.startswith("end()"): # Stop parsing if we see the end command break @@ -130,6 +139,12 @@ def parse_response(self, response: str): parsed_commands.append(("go_home", "")) elif command.startswith("explore()"): parsed_commands.append(("explore", "")) + elif command.startswith("nod_head()"): + parsed_commands.append(("nod_head", "")) + elif command.startswith("shake_head()"): + parsed_commands.append(("shake_head", "")) + elif command.startswith("avert_gaze()"): + parsed_commands.append(("avert_gaze", "")) elif command.startswith("end()"): # Stop parsing if we see the end command # This really shouldn't happen, but just in case From 1a0976d819fe1334d9f0437362c5400f47cc4981 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 12:29:59 -0400 Subject: [PATCH 276/410] update --- src/stretch/agent/zmq_client.py | 5 +++++ src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py | 1 + 2 files changed, 6 insertions(+) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index f36fa7fd9..978595de2 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -1230,6 +1230,11 @@ def is_running(self) -> bool: """Is the client running""" return not self._finish + def say(self, text: str): + """Send a text message to the robot to say. Will be spoken by the robot's text-to-speech system asynchronously.""" + next_action = {"say": text} + self.send_action(next_action) + def blocking_spin_state(self, verbose: bool = False): """Listen for incoming observations and update internal state""" diff --git a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py index e8cf29741..3342ceabf 100755 --- a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py +++ b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py @@ -145,6 +145,7 @@ def handle_action(self, action: Dict[str, Any]): self.client.load_map(action["load_map"]) elif "say" in action: # Text to speech from the robot, not the client/agent device + print("Saying:", action["say"]) self.text_to_speech.say_async(action["say"]) elif "xyt" in action: if self.verbose: From 0fb9aa26971ca2274831895bc213fda615ae9d0c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 13:33:12 -0400 Subject: [PATCH 277/410] disable web browser start --- src/stretch/visualization/rerun.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 0f598df2a..40cbbd0a8 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -182,7 +182,7 @@ class RerunVisualizer: def __init__( self, display_robot_mesh: bool = True, - open_browser: bool = True, + open_browser: bool = False, server_memory_limit: str = "4GB", collapse_panels: bool = True, show_cameras_in_3d_view: bool = False, @@ -195,8 +195,9 @@ def __init__( server_memory_limit (str): Server memory limit E.g. 
2GB or 20% collapse_panels (bool): Set to false to have customizable rerun panels """ - rr.init("Stretch_robot", spawn=False) - rr.serve(open_browser=open_browser, server_memory_limit=server_memory_limit) + rr.init("Stretch_robot", spawn=(not open_browser)) + if open_browser: + rr.serve(open_browser=open_browser, server_memory_limit=server_memory_limit) self.display_robot_mesh = display_robot_mesh self.show_cameras_in_3d_view = show_cameras_in_3d_view From b9da19c8e6a7478ac8a0b6f49af296c6eaec98df Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 14:17:51 -0400 Subject: [PATCH 278/410] add executor script to process thigns --- .../agent/task/pickup/pickup_executor.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/stretch/agent/task/pickup/pickup_executor.py diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py new file mode 100644 index 000000000..3a644092e --- /dev/null +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -0,0 +1,50 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +from typing import List, Tuple + +import stretch.utils.logger as logger +from stretch.agent.robot_agent import RobotAgent +from stretch.core import AbstractRobotClient + + +class PickupExecutor: + """This class parses commands from the pickup llm bot and sends them to the robot.""" + + def __init__(self, robot: AbstractRobotClient, agent: RobotAgent): + self.robot = robot + self.agent = agent + + def __call__(self, response: List[Tuple[str, str]]) -> None: + """Execute the list of commands.""" + for i, (command, args) in enumerate(response): + if command == "say": + logger.info(f"{i} {command} {args}") + self.agent.robot_say(args) + elif command == "pickup": + self.agent.pickup(args) + elif command == "place": + self.agent.place(args) + elif command == "wave": + self.agent.wave() + elif command == "go_home": + self.agent.go_home() + elif command == "explore": + self.agent.explore() + elif command == "nod_head": + self.agent.nod_head() + elif command == "shake_head": + self.agent.shake_head() + elif command == "avert_gaze": + self.agent.avert_gaze() + elif command == "end": + break + else: + # logger.error(f"Unknown command: {command}") + pass From 6f8aff46c73c2080aff5a5be9ebf86ed2cb95073 Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Tue, 1 Oct 2024 14:48:28 -0400 Subject: [PATCH 279/410] rerun update --- src/stretch/dynav/robot_agent_manip_mdp.py | 4 +- src/stretch/dynav/run_ok_nav.py | 3 +- src/stretch/dynav/voxel_map_server.py | 188 ++++++++++++--------- src/stretch/visualization/rerun.py | 42 ++++- 4 files changed, 155 insertions(+), 82 deletions(-) diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 409859449..5b028bdd0 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -79,7 +79,9 @@ def __init__( from stretch.dynav.voxel_map_server import ImageProcessor as VoxelMapImageProcessor self.image_processor = VoxelMapImageProcessor( - rerun=True, static=False, log="env" + str(env_num) + "_" + str(test_num) + rerun=True, + rerun_visualizer=self.robot._rerun, + log="env" + str(env_num) + "_" + str(test_num), 
) # type: ignore elif method == "mllm": from stretch.dynav.llm_server import ImageProcessor as mLLMImageProcessor diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index 9315b1ba1..ae8b1d729 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -33,7 +33,8 @@ def compute_tilt(camera_xyz, target_xyz): @click.command() -@click.option("--ip", default="100.108.67.79", type=str) +# by default you are running these codes on your workstation, not on your robot. +@click.option("--ip", default="127.0.0.1", type=str) @click.option("--manual-wait", default=False, is_flag=True) @click.option("--random-goals", default=False, is_flag=True) @click.option("--explore-iter", default=-1) diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index cad55dc47..08571186f 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -14,7 +14,8 @@ # import wget import time -from io import BytesIO + +# from io import BytesIO from pathlib import Path import clip @@ -28,7 +29,8 @@ import torch.nn.functional as F import wget from matplotlib import pyplot as plt -from PIL import Image + +# from PIL import Image from sam2.build_sam import build_sam2 from sam2.sam2_image_predictor import SAM2ImagePredictor from torchvision import transforms @@ -122,27 +124,31 @@ def __init__( text_port: int = 5556, open_communication: bool = True, rerun: bool = True, - static: bool = True, + # static: bool = True, log=None, image_shape=(400, 300), rerun_server_memory_limit: str = "4GB", + rerun_visualizer=None, ): - self.static = static + # self.static = static self.siglip = siglip + self.rerun_visualizer = rerun_visualizer current_datetime = datetime.datetime.now() if log is None: self.log = "debug_" + current_datetime.strftime("%Y-%m-%d_%H-%M-%S") else: self.log = log self.rerun = rerun - if self.rerun: - if self.static: - rr.init(self.log, spawn=False) - rr.connect("100.108.67.79:9876") - else: - # rr.init(self.log, spawn=False) - # rr.connect("100.108.67.79:9876") - logger.info("Attempting to connect to existing rerun server.") + if self.rerun and self.rerun_visualizer is None: + rr.init(self.log) + logger.info("Starting a rerun server.") + # if self.static: + # rr.init(self.log, spawn=False) + # rr.connect("100.108.67.79:9876") + # else: + # # rr.init(self.log, spawn=False) + # # rr.connect("100.108.67.79:9876") + # logger.info("Attempting to connect to existing rerun server.") self.min_depth = min_depth self.max_depth = max_depth @@ -252,17 +258,18 @@ def process_text(self, text, start_pose): """ if self.rerun: - rr.log("/object", rr.Clear(recursive=True), static=self.static) - rr.log("/robot_start_pose", rr.Clear(recursive=True), static=self.static) - rr.log("/direction", rr.Clear(recursive=True), static=self.static) - rr.log("robot_monologue", rr.Clear(recursive=True), static=self.static) - rr.log( - "/Past_observation_most_similar_to_text", - rr.Clear(recursive=True), - static=self.static, - ) - if not self.static: - rr.connect("100.108.67.79:9876") + if self.rerun_visualizer is None: + rr.log("/object", rr.Clear(recursive=True)) + rr.log("/robot_start_pose", rr.Clear(recursive=True)) + rr.log("/direction", rr.Clear(recursive=True)) + rr.log("robot_monologue", rr.Clear(recursive=True)) + rr.log("/Past_observation_most_similar_to_text", rr.Clear(recursive=True)) + else: + self.rerun_visualizer.clear_identity("/object") + self.rerun_visualizer.clear_identity("/robot_start_pose") + 
self.rerun_visualizer.clear_identity("/direction") + self.rerun_visualizer.clear_identity("robot_monologue") + self.rerun_visualizer.clear_identity("/Past_observation_most_similar_to_text") debug_text = "" mode = "navigation" obs = None @@ -287,26 +294,27 @@ def process_text(self, text, start_pose): pointcloud, ) = self.voxel_map_localizer.localize_A(text, debug=True, return_debug=True) if localized_point is not None: - rr.log( - "/object", - rr.Points3D( + if self.rerun and self.rerun_visualizer is None: + rr.log( + "/object", + rr.Points3D( + [localized_point[0], localized_point[1], 1.5], + colors=torch.Tensor([1, 0, 0]), + radii=0.1, + ), + ) + elif self.rerun: + self.rerun_visualizer.log_custom_pointcloud( + "/object", [localized_point[0], localized_point[1], 1.5], - colors=torch.Tensor([1, 0, 0]), - radii=0.1, - ), - static=self.static, - ) + torch.Tensor([1, 0, 0]), + 0.1, + ) # Do Frontier based exploration if text is None or text == "" or localized_point is None: debug_text += "## Navigation fails, so robot starts exploring environments.\n" localized_point = self.sample_frontier(start_pose, text) mode = "exploration" - rr.log( - "/object", - rr.Points3D([0, 0, 0], colors=torch.Tensor([1, 0, 0]), radii=0), - static=self.static, - ) - print("\n", localized_point, "\n") if localized_point is None: return [] @@ -316,20 +324,20 @@ def process_text(self, text, start_pose): point = self.sample_navigation(start_pose, localized_point) - if self.rerun: - buf = BytesIO() - plt.savefig(buf, format="png") - buf.seek(0) - img = Image.open(buf) - img = np.array(img) - buf.close() - rr.log("2d_map", rr.Image(img), static=self.static) - else: - if text != "": - plt.savefig(self.log + "/debug_" + text + str(self.obs_count) + ".png") - else: - plt.savefig(self.log + "/debug_exploration" + str(self.obs_count) + ".png") - plt.clf() + # if self.rerun: + # buf = BytesIO() + # plt.savefig(buf, format="png") + # buf.seek(0) + # img = Image.open(buf) + # img = np.array(img) + # buf.close() + # rr.log("2d_map", rr.Image(img), static=self.static) + # else: + # if text != "": + # plt.savefig(self.log + "/debug_" + text + str(self.obs_count) + ".png") + # else: + # plt.savefig(self.log + "/debug_exploration" + str(self.obs_count) + ".png") + # plt.clf() if obs is not None and mode == "navigation": rgb = self.voxel_map.observations[obs - 1].rgb @@ -338,9 +346,13 @@ def process_text(self, text, start_pose): rgb = np.array(rgb) cv2.imwrite(self.log + "/debug_" + text + ".png", rgb[:, :, [2, 1, 0]]) else: - rr.log( - "/Past_observation_most_similar_to_text", rr.Image(rgb), static=self.static - ) + if self.rerun and self.rerun_visualizer is None: + rr.log("/Past_observation_most_similar_to_text", rr.Image(rgb)) + elif self.rerun: + self.rerun_visualizer.log_custom_2d_image( + "/Past_observation_most_similar_to_text", rgb + ) + waypoints = None if point is None: @@ -378,11 +390,13 @@ def process_text(self, text, start_pose): else: debug_text = "### I have not received any text query from human user.\n ### So, I plan to explore the environment with Frontier-based exploration.\n" debug_text = "# Robot's monologue: \n" + debug_text - rr.log( - "robot_monologue", - rr.TextDocument(debug_text, media_type=rr.MediaType.MARKDOWN), - static=self.static, - ) + if self.rerun and self.rerun_visualizer is None: + rr.log( + "robot_monologue", + rr.TextDocument(debug_text, media_type=rr.MediaType.MARKDOWN), + ) + elif self.rerun: + self.rerun_visualizer.log_text("robot_monologue", debug_text) if traj is not None: origins = [] @@ 
-393,20 +407,31 @@ def process_text(self, text, start_pose): vectors.append( [traj[idx + 1][0] - traj[idx][0], traj[idx + 1][1] - traj[idx][1], 0] ) - rr.log( - "/direction", - rr.Arrows3D( - origins=origins, vectors=vectors, colors=torch.Tensor([0, 1, 0]), radii=0.1 - ), - static=self.static, - ) - rr.log( - "/robot_start_pose", - rr.Points3D( - [start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), radii=0.1 - ), - static=self.static, - ) + if self.rerun and self.rerun_visualizer is None: + rr.log( + "/direction", + rr.Arrows3D( + origins=origins, vectors=vectors, colors=torch.Tensor([0, 1, 0]), radii=0.1 + ), + ) + rr.log( + "/robot_start_pose", + rr.Points3D( + [start_pose[0], start_pose[1], 1.5], + colors=torch.Tensor([0, 0, 1]), + radii=0.1, + ), + ) + elif self.rerun: + self.rerun_visualizer.log_arrow3D( + "/direction", origins, vectors, torch.Tensor([0, 1, 0]), 0.1 + ) + self.rerun_visualizer.log_custom_pointcloud( + "/robot_start_pose", + [start_pose[0], start_pose[1], 1.5], + torch.Tensor([0, 0, 1]), + 0.1, + ) # self.write_to_pickle() return traj @@ -773,7 +798,7 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): camera_K=torch.Tensor(intrinsics), ) obs, exp = self.voxel_map.get_2d_map() - if self.rerun: + if self.rerun and self.rerun_visualizer is None: # if not self.static: # rr.set_time_sequence("frame", self.obs_count) # rr.log('robot_pov', rr.Image(rgb.permute(1, 2, 0)), static = self.static) @@ -798,11 +823,16 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): static=self.static, ) # rr.log("Obstalce_map/2D_obs_map", rr.Image(obs.int() * 127 + exp.int() * 127)) - else: - cv2.imwrite( - self.log + "/debug_" + str(self.obs_count) + ".jpg", - np.asarray(obs.int() * 127 + exp.int() * 127), - ) + elif self.rerun: + if self.voxel_map.voxel_pcd._points is not None: + self.rerun_visualizer.update_voxel_map(space=self.space) + if self.voxel_map_localizer.voxel_pcd._points is not None: + self.rerun_visualizer.log_custom_pointcloud( + "Semantic_memory/pointcloud", + self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), + self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255.0, + 0.03, + ) # end_time = time.time() # print('image processing takes ' + str(end_time - start_time) + ' seconds.') diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 66898e69b..ad5a38537 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -11,7 +11,7 @@ import time import timeit -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy as np import rerun as rr @@ -245,6 +245,46 @@ def setup_blueprint(self, collapse_panels: bool): ) rr.send_blueprint(my_blueprint) + def clear_identity(self, identity_name: str): + rr.log(identity_name, rr.Clear(recursive=True)) + + def log_custom_2d_image( + self, identity_name: str, img: Union[List[list], np.ndarray, torch.Tensor] + ): + rr.log(identity_name, rr.Image(img)) + + def log_text(self, identity_name: str, text: str): + rr.log(identity_name, rr.TextDocument(text, media_type=rr.MediaType.MARKDOWN)) + + def log_arrow3D( + self, + identity_name: str, + origins: Union[list, List[list], np.ndarray, torch.Tensor], + vectors: Union[list, List[list], np.ndarray, torch.Tensor], + colors: Union[list, List[list], np.ndarray, torch.Tensor], + radii: float, + ): + rr.log( + identity_name, + rr.Arrows3D(origins=origins, vectors=vectors, colors=colors, radii=radii), + ) + + def log_custom_pointcloud( + self, 
+ identity_name: str, + points: Union[list, List[list], np.ndarray, torch.Tensor], + colors: Union[list, List[list], np.ndarray, torch.Tensor], + radii: float, + ): + rr.log( + identity_name, + rr.Points3D( + points, + colors=colors, + radii=radii, + ), + ) + def log_head_camera(self, obs: Observations): """Log head camera pose and images From df5e510b8a286ad57fb27314696493fc71f499f7 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 1 Oct 2024 15:21:31 -0400 Subject: [PATCH 280/410] add robot task executor --- .../agent/task/pickup/pickup_executor.py | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index 3a644092e..195f3f7d9 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -23,14 +23,28 @@ def __init__(self, robot: AbstractRobotClient, agent: RobotAgent): def __call__(self, response: List[Tuple[str, str]]) -> None: """Execute the list of commands.""" - for i, (command, args) in enumerate(response): + i = 0 + + # Loop over every command we have been given + # Pull out pickup and place as a single arg if they are in a row + # Else, execute things as they come + while i < len(response): + command, args = response[i] + logger.info(f"{i} {command} {args}") if command == "say": - logger.info(f"{i} {command} {args}") self.agent.robot_say(args) elif command == "pickup": - self.agent.pickup(args) + target_object = args + i += 1 + next_command, next_args = response[i] + if next_command != "place": + i -= 1 + logger.error("Pickup without place! Doing nothing.") + else: + logger.info(f"{i} {command} {args}") + breakpoint() elif command == "place": - self.agent.place(args) + logger.error("Place without pickup! Doing nothing.") elif command == "wave": self.agent.wave() elif command == "go_home": @@ -46,5 +60,4 @@ def __call__(self, response: List[Tuple[str, str]]) -> None: elif command == "end": break else: - # logger.error(f"Unknown command: {command}") - pass + logger.error(f"Skipping unknown command: {command}") From e137f8d1ed75080943b868f444b54b9a86ac501e Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Tue, 1 Oct 2024 17:45:49 -0400 Subject: [PATCH 281/410] fix rerun bug --- src/stretch/agent/zmq_client.py | 2 +- src/stretch/dynav/robot_agent_manip_mdp.py | 2 +- src/stretch/dynav/voxel_map_server.py | 69 ++++++++++++++-------- src/stretch/visualization/rerun.py | 16 ++--- 4 files changed, 53 insertions(+), 36 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index c2d96bf19..396289651 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -406,7 +406,7 @@ def head_to( whole_body_q[HelloStretchIdx.HEAD_TILT] = float(head_tilt) self._wait_for_head(whole_body_q, block_id=step) - # time.sleep(0.3) + time.sleep(0.1) def look_front(self, blocking: bool = True, timeout: float = 10.0): """Let robot look to its front.""" diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 5b028bdd0..33caf38ee 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -114,7 +114,7 @@ def rotate_in_place(self): def update(self): """Step the data collector. Get a single observation of the world. Remove bad points, such as those from too far or too near the camera. 
Update the 3d world representation.""" # Sleep some time so the robot rgbd observations are more likely to be updated - time.sleep(0.5) + time.sleep(0.3) obs = self.robot.get_observation() self.obs_count += 1 diff --git a/src/stretch/dynav/voxel_map_server.py b/src/stretch/dynav/voxel_map_server.py index 08571186f..70c390b7c 100644 --- a/src/stretch/dynav/voxel_map_server.py +++ b/src/stretch/dynav/voxel_map_server.py @@ -22,6 +22,7 @@ import cv2 import numpy as np import rerun as rr +import rerun.blueprint as rrb import scipy import torch @@ -112,6 +113,26 @@ def get_xyz(depth, pose, intrinsics): return xyz +def setup_custom_blueprint(rerun_visualizer): + main = rrb.Horizontal( + rrb.Spatial3DView(name="3D View", origin="world"), + rrb.Vertical( + rrb.TextDocumentView(name="text", origin="robot_monologue"), + rrb.Spatial2DView(name="image", origin="/observation_similar_to_text"), + ), + # rrb.Vertical( + # rrb.Spatial2DView(name="head_rgb", origin="/world/head_camera"), + # rrb.Spatial2DView(name="ee_rgb", origin="/world/ee_camera"), + # ), + column_shares=[2, 1], + ) + my_blueprint = rrb.Blueprint( + rrb.Vertical(main, rrb.TimePanel(state=True)), + collapse_panels=False, + ) + rr.send_blueprint(my_blueprint) + + class ImageProcessor: def __init__( self, @@ -142,13 +163,9 @@ def __init__( if self.rerun and self.rerun_visualizer is None: rr.init(self.log) logger.info("Starting a rerun server.") - # if self.static: - # rr.init(self.log, spawn=False) - # rr.connect("100.108.67.79:9876") - # else: - # # rr.init(self.log, spawn=False) - # # rr.connect("100.108.67.79:9876") - # logger.info("Attempting to connect to existing rerun server.") + else: + setup_custom_blueprint(self.rerun_visualizer) + logger.info("Attempting to connect to existing rerun server.") self.min_depth = min_depth self.max_depth = max_depth @@ -259,17 +276,17 @@ def process_text(self, text, start_pose): if self.rerun: if self.rerun_visualizer is None: - rr.log("/object", rr.Clear(recursive=True)) - rr.log("/robot_start_pose", rr.Clear(recursive=True)) - rr.log("/direction", rr.Clear(recursive=True)) + rr.log("world/object", rr.Clear(recursive=True)) + rr.log("world/robot_start_pose", rr.Clear(recursive=True)) + rr.log("world/direction", rr.Clear(recursive=True)) rr.log("robot_monologue", rr.Clear(recursive=True)) - rr.log("/Past_observation_most_similar_to_text", rr.Clear(recursive=True)) + rr.log("/observation_similar_to_text", rr.Clear(recursive=True)) else: - self.rerun_visualizer.clear_identity("/object") - self.rerun_visualizer.clear_identity("/robot_start_pose") - self.rerun_visualizer.clear_identity("/direction") + self.rerun_visualizer.clear_identity("world/object") + self.rerun_visualizer.clear_identity("world/robot_start_pose") + self.rerun_visualizer.clear_identity("world/direction") self.rerun_visualizer.clear_identity("robot_monologue") - self.rerun_visualizer.clear_identity("/Past_observation_most_similar_to_text") + self.rerun_visualizer.clear_identity("/observation_similar_to_text") debug_text = "" mode = "navigation" obs = None @@ -296,7 +313,7 @@ def process_text(self, text, start_pose): if localized_point is not None: if self.rerun and self.rerun_visualizer is None: rr.log( - "/object", + "world/object", rr.Points3D( [localized_point[0], localized_point[1], 1.5], colors=torch.Tensor([1, 0, 0]), @@ -305,7 +322,7 @@ def process_text(self, text, start_pose): ) elif self.rerun: self.rerun_visualizer.log_custom_pointcloud( - "/object", + "world/object", [localized_point[0], localized_point[1], 1.5], 
torch.Tensor([1, 0, 0]), 0.1, @@ -347,10 +364,10 @@ def process_text(self, text, start_pose): cv2.imwrite(self.log + "/debug_" + text + ".png", rgb[:, :, [2, 1, 0]]) else: if self.rerun and self.rerun_visualizer is None: - rr.log("/Past_observation_most_similar_to_text", rr.Image(rgb)) + rr.log("/observation_similar_to_text", rr.Image(rgb)) elif self.rerun: self.rerun_visualizer.log_custom_2d_image( - "/Past_observation_most_similar_to_text", rgb + "/observation_similar_to_text", rgb ) waypoints = None @@ -409,13 +426,13 @@ def process_text(self, text, start_pose): ) if self.rerun and self.rerun_visualizer is None: rr.log( - "/direction", + "world/direction", rr.Arrows3D( origins=origins, vectors=vectors, colors=torch.Tensor([0, 1, 0]), radii=0.1 ), ) rr.log( - "/robot_start_pose", + "world/robot_start_pose", rr.Points3D( [start_pose[0], start_pose[1], 1.5], colors=torch.Tensor([0, 0, 1]), @@ -424,10 +441,10 @@ def process_text(self, text, start_pose): ) elif self.rerun: self.rerun_visualizer.log_arrow3D( - "/direction", origins, vectors, torch.Tensor([0, 1, 0]), 0.1 + "world/direction", origins, vectors, torch.Tensor([0, 1, 0]), 0.1 ) self.rerun_visualizer.log_custom_pointcloud( - "/robot_start_pose", + "world/robot_start_pose", [start_pose[0], start_pose[1], 1.5], torch.Tensor([0, 0, 1]), 0.1, @@ -804,7 +821,7 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): # rr.log('robot_pov', rr.Image(rgb.permute(1, 2, 0)), static = self.static) if self.voxel_map.voxel_pcd._points is not None: rr.log( - "Obstalce_map/pointcloud", + "obstalce_map/pointcloud", rr.Points3D( self.voxel_map.voxel_pcd._points.detach().cpu(), colors=self.voxel_map.voxel_pcd._rgb.detach().cpu() / 255.0, @@ -814,7 +831,7 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): ) if self.voxel_map_localizer.voxel_pcd._points is not None: rr.log( - "Semantic_memory/pointcloud", + "semantic_memory/pointcloud", rr.Points3D( self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), colors=self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255.0, @@ -828,7 +845,7 @@ def process_rgbd_images(self, rgb, depth, intrinsics, pose): self.rerun_visualizer.update_voxel_map(space=self.space) if self.voxel_map_localizer.voxel_pcd._points is not None: self.rerun_visualizer.log_custom_pointcloud( - "Semantic_memory/pointcloud", + "world/semantic_memory/pointcloud", self.voxel_map_localizer.voxel_pcd._points.detach().cpu(), self.voxel_map_localizer.voxel_pcd._rgb.detach().cpu() / 255.0, 0.03, diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 6fde3853d..690d4e887 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -584,17 +584,17 @@ def step(self, obs, servo): rr.set_time_seconds("realtime", time.time()) try: t0 = timeit.default_timer() - self.log_robot_xyt(obs) - self.log_ee_frame(obs) + # self.log_robot_xyt(obs) + # self.log_ee_frame(obs) - # Cameras use the lower-res servo object - self.log_head_camera(servo) - self.log_ee_camera(servo) + # # Cameras use the lower-res servo object + # self.log_head_camera(servo) + # self.log_ee_camera(servo) - self.log_robot_state(obs) + # self.log_robot_state(obs) - if self.display_robot_mesh: - self.log_robot_transforms(obs) + # if self.display_robot_mesh: + # self.log_robot_transforms(obs) t1 = timeit.default_timer() sleep_time = self.step_delay_s - (t1 - t0) if sleep_time > 0: From 15440c79da8ffde7a60193244303e3c34b467019 Mon Sep 17 00:00:00 2001 From: Peiqi Liu Date: Wed, 2 Oct 2024 09:50:59 -0400 
Subject: [PATCH 282/410] add comments --- src/stretch/visualization/rerun.py | 55 ++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 690d4e887..078615a33 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -248,14 +248,30 @@ def setup_blueprint(self, collapse_panels: bool): rr.send_blueprint(my_blueprint) def clear_identity(self, identity_name: str): + """Clear existing rerun identity. + + This is useful if you want to clear a rerun identity and leave a blank there. + Args: + identity_name (str): rerun identity name + """ rr.log(identity_name, rr.Clear(recursive=True)) - def log_custom_2d_image( - self, identity_name: str, img: Union[List[list], np.ndarray, torch.Tensor] - ): + def log_custom_2d_image(self, identity_name: str, img: Union[np.ndarray, torch.Tensor]): + """Log custom 2d image + + Args: + identity_name (str): rerun identity name + img (2D or 3D array): the 2d image you want to log into rerun + """ rr.log(identity_name, rr.Image(img)) def log_text(self, identity_name: str, text: str): + """Log a custom markdown text + + Args: + identity_name (str): rerun identity name + text (str): Markdown codes you want to log in rerun + """ rr.log(identity_name, rr.TextDocument(text, media_type=rr.MediaType.MARKDOWN)) def log_arrow3D( @@ -266,6 +282,15 @@ def log_arrow3D( colors: Union[list, List[list], np.ndarray, torch.Tensor], radii: float, ): + """Log custom 3D arrows + + Args: + identity_name (str): rerun identity name + origins (a N x 3 array): origins of all 3D arrows + vectors (a N x 3 array): directions and lengths of all 3D arrows + colors (a N x 3 array): RGB colors of all 3D arrows + radii (float): size of the arrows + """ rr.log( identity_name, rr.Arrows3D(origins=origins, vectors=vectors, colors=colors, radii=radii), @@ -278,6 +303,14 @@ def log_custom_pointcloud( colors: Union[list, List[list], np.ndarray, torch.Tensor], radii: float, ): + """Log custom 3D pointcloud + + Args: + identity_name (str): rerun identity name + points (a N x 3 array): xyz coordinates of all 3D points + colors (a N x 3 array): RGB colors of all 3D points + radii (float): size of the arrows + """ rr.log( identity_name, rr.Points3D( @@ -584,17 +617,17 @@ def step(self, obs, servo): rr.set_time_seconds("realtime", time.time()) try: t0 = timeit.default_timer() - # self.log_robot_xyt(obs) - # self.log_ee_frame(obs) + self.log_robot_xyt(obs) + self.log_ee_frame(obs) - # # Cameras use the lower-res servo object - # self.log_head_camera(servo) - # self.log_ee_camera(servo) + # Cameras use the lower-res servo object + self.log_head_camera(servo) + self.log_ee_camera(servo) - # self.log_robot_state(obs) + self.log_robot_state(obs) - # if self.display_robot_mesh: - # self.log_robot_transforms(obs) + if self.display_robot_mesh: + self.log_robot_transforms(obs) t1 = timeit.default_timer() sleep_time = self.step_delay_s - (t1 - t0) if sleep_time > 0: From 1c9f712a0bd3e0e0c1d75bdee9f66dd1d19c94ba Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 2 Oct 2024 12:18:07 -0400 Subject: [PATCH 283/410] tweaking motion configuration --- src/stretch/config/default_planner.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index c2e695aeb..184ffad46 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -13,8 
+13,8 @@ obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles use_negative_obstacles: True # Use the negative height as an obstacle -obs_min_density: 50 # This many points makes it an obstacle -min_points_per_voxel: 50 # Drop things below this density per voxel +obs_min_density: 10 # This many points makes it an obstacle +min_points_per_voxel: 25 # Drop things below this density per voxel # Padding pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles @@ -35,10 +35,10 @@ detection: # Point cloud cleanup filters: - smooth_kernel_size: 2 + smooth_kernel_size: 3 # smooth_kernel_size: 0 use_median_filter: True - median_filter_size: 4 + median_filter_size: 5 median_filter_max_error: 0.01 use_derivative_filter: False derivative_filter_threshold: 0.1 From 6db5bc9c5bc592e055f60304e510d179ec248ef6 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 2 Oct 2024 12:27:40 -0400 Subject: [PATCH 284/410] add some extra information about invalid starts --- src/stretch/agent/robot_agent.py | 8 +++++--- src/stretch/config/default_planner.yaml | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 9c7e2724b..905ee9f75 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -757,19 +757,21 @@ def move_to_instance(self, instance: Instance, max_try_per_instance=10) -> bool: return True return False - def recover_from_invalid_start(self) -> bool: + def recover_from_invalid_start(self, verbose: bool = True) -> bool: """Try to recover from an invalid start state. Returns: bool: whether the robot recovered from the invalid start state """ + print("Recovering from invalid start state...") + # Get current invalid pose start = self.robot.get_base_pose() # Apply relative transformation to XYT - forward = np.array([-0.1, 0, 0]) - backward = np.array([0.1, 0, 0]) + forward = np.array([-0.05, 0, 0]) + backward = np.array([0.05, 0, 0]) xyt_goal_forward = xyt_base_to_global(forward, start) xyt_goal_backward = xyt_base_to_global(backward, start) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 184ffad46..676782f09 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -13,7 +13,7 @@ obs_min_height: 0.1 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles use_negative_obstacles: True # Use the negative height as an obstacle -obs_min_density: 10 # This many points makes it an obstacle +obs_min_density: 25 # This many points makes it an obstacle min_points_per_voxel: 25 # Drop things below this density per voxel # Padding From 2b2d3e4316d228365312b947356908c12b2a2161 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 2 Oct 2024 12:28:36 -0400 Subject: [PATCH 285/410] cleanup some bad code --- src/stretch/agent/robot_agent.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 905ee9f75..0c8f201a4 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -862,16 +862,8 @@ def move_to_any_instance( ) if self.robot.last_motion_failed(): - 
print("!!!!!!!!!!!!!!!!!!!!!!") - print("ROBOT IS STUCK! Move back!") - r = np.random.randint(3) - if r == 0: - self.robot.navigate_to([-0.1, 0, 0], relative=True, blocking=True) - elif r == 1: - self.robot.navigate_to([0, 0, np.pi / 4], relative=True, blocking=True) - elif r == 2: - self.robot.navigate_to([0, 0, -np.pi / 4], relative=True, blocking=True) - return False + # We have a problem! + self.recover_from_invalid_start() time.sleep(1.0) self.robot.navigate_to([0, 0, np.pi / 2], relative=True) From 3408da3f26804895fbe9fbb8d0c8158a4e3d6778 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 2 Oct 2024 17:05:52 -0400 Subject: [PATCH 286/410] setting up pickup executor for handling more complex queries and things like that --- src/stretch/agent/task/pickup/__init__.py | 1 + src/stretch/agent/task/pickup/pickup_executor.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/task/pickup/__init__.py b/src/stretch/agent/task/pickup/__init__.py index 723da4a7a..03236c76b 100644 --- a/src/stretch/agent/task/pickup/__init__.py +++ b/src/stretch/agent/task/pickup/__init__.py @@ -8,4 +8,5 @@ # license information maybe found below, if so. from .multi_pickup_task import MultiPickupTask +from .pickup_executor import PickupExecutor from .pickup_task import PickupTask diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index 195f3f7d9..a178d5db9 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -17,9 +17,19 @@ class PickupExecutor: """This class parses commands from the pickup llm bot and sends them to the robot.""" - def __init__(self, robot: AbstractRobotClient, agent: RobotAgent): + def __init__( + self, robot: AbstractRobotClient, agent: RobotAgent, dry_run: bool = False + ) -> None: + """Initialize the executor. + + Args: + robot: The robot client. + agent: The robot agent. + dry_run: If true, don't actually execute the commands. 
+ """ self.robot = robot self.agent = agent + self.dry_run = dry_run def __call__(self, response: List[Tuple[str, str]]) -> None: """Execute the list of commands.""" From 9b0fa0f0c9b12ad623b72bd350db873bc81d32cd Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 11:09:55 -0400 Subject: [PATCH 287/410] rerun bug fix --- src/stretch/agent/zmq_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 6462b9931..a8b6e85ee 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -180,9 +180,9 @@ def __init__( self._servo_lock = Lock() if enable_rerun_server: - from stretch.visualization.rerun import RerunVsualizer + from stretch.visualization.rerun import RerunVisualizer - self._rerun = RerunVsualizer() + self._rerun = RerunVisualizer() else: self._rerun = None self._rerun_thread = None From 3824c9557f980dd92b300550b4e072f9013db267 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 11:10:36 -0400 Subject: [PATCH 288/410] increase max number of displayed points --- src/stretch/visualization/rerun.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 078615a33..f94c68a57 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -177,7 +177,7 @@ def log_transforms(self, obs, debug: bool = False): class RerunVisualizer: camera_point_radius: float = 0.01 - max_displayed_points_per_camera: int = 5000 + max_displayed_points_per_camera: int = 10000 def __init__( self, From 70d3acd890e68be62365151207f0775b01116aa5 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 11:16:27 -0400 Subject: [PATCH 289/410] clean up UI elements --- src/stretch/config/default_planner.yaml | 4 ++-- src/stretch/visualization/rerun.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index f3e10d71c..aa30a640b 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -9,7 +9,7 @@ tts_engine: "gTTS" # Sparse Voxel Map parameters voxel_size: 0.04 # Size of a voxel in meters -obs_min_height: 0.1 # Ignore things less than this high when planning motions +obs_min_height: 0.15 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles use_negative_obstacles: True # Use the negative height as an obstacle @@ -24,7 +24,7 @@ local_radius: 0.8 # Area around the robot to mark as explored (kind of a hack) add_local_every_step: False remove_visited_from_obstacles: False min_depth: 0.5 -max_depth: 2.0 +max_depth: 2.5 # Object detection parameters detection: diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index f94c68a57..496aec044 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -420,6 +420,10 @@ def log_ee_camera(self, servo): if self.show_camera_point_clouds: ee_xyz = servo.get_ee_xyz_in_world_frame().reshape(-1, 3) ee_rgb = servo.ee_rgb.reshape(-1, 3) + # Remove points below z = 0 + idx = np.where(ee_xyz[:, 2] > 0) + ee_xyz = ee_xyz[idx] + ee_rgb = ee_rgb[idx] if self.max_displayed_points_per_camera > 0: idx = np.arange(ee_xyz.shape[0]) np.random.shuffle(idx) From 
8678f9d8495b953899ea52981e5306706b573ac0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 11:18:21 -0400 Subject: [PATCH 290/410] UI improvements --- src/stretch/visualization/rerun.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 496aec044..a3ba322f7 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -508,12 +508,16 @@ def update_voxel_map( # Get obstacles and explored points grid_resolution = space.voxel_map.grid_resolution obs_points = np.array(occupancy_map_to_3d_points(obstacles, grid_origin, grid_resolution)) + # Move obs_points z up slightly to avoid z-fighting + obs_points[:, 2] += 0.01 t4 = timeit.default_timer() # Get explored points explored_points = np.array( occupancy_map_to_3d_points(explored, grid_origin, grid_resolution) ) + # Move explored z points down slightly to avoid z-fighting + explored_points[:, 2] -= 0.01 t5 = timeit.default_timer() # Log points From 778b2108aa6ddb564cd8d08fb6a9eec08ef06b14 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 11:24:04 -0400 Subject: [PATCH 291/410] pad more around the obstacles when exploring and moving --- src/stretch/config/default_planner.yaml | 6 +++--- src/stretch/visualization/rerun.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index aa30a640b..576ba53cc 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -17,14 +17,14 @@ obs_min_density: 25 # This many points makes it an obstacle min_points_per_voxel: 25 # Drop things below this density per voxel # Padding -pad_obstacles: 2 # Add this many units (voxel_size) to the area around obstacles -min_pad_obstacles: 0 # Do not pad LESS than this amount, for safety. +pad_obstacles: 3 # Add this many units (voxel_size) to the area around obstacles +min_pad_obstacles: 1 # Do not pad LESS than this amount, for safety. 
local_radius: 0.8 # Area around the robot to mark as explored (kind of a hack) add_local_every_step: False remove_visited_from_obstacles: False min_depth: 0.5 -max_depth: 2.5 +max_depth: 2.0 # Object detection parameters detection: diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index a3ba322f7..8b51c833b 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -508,6 +508,7 @@ def update_voxel_map( # Get obstacles and explored points grid_resolution = space.voxel_map.grid_resolution obs_points = np.array(occupancy_map_to_3d_points(obstacles, grid_origin, grid_resolution)) + # Move obs_points z up slightly to avoid z-fighting obs_points[:, 2] += 0.01 t4 = timeit.default_timer() From 5ff56da1e620193ee9076492e399f4916ebdb4d2 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 11:27:28 -0400 Subject: [PATCH 292/410] Updating some configuration variables; exploring through gaps at hoem --- src/stretch/config/default_planner.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 576ba53cc..305c9d040 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -13,8 +13,8 @@ obs_min_height: 0.15 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles use_negative_obstacles: True # Use the negative height as an obstacle -obs_min_density: 25 # This many points makes it an obstacle -min_points_per_voxel: 25 # Drop things below this density per voxel +obs_min_density: 10 # This many points makes it an obstacle +min_points_per_voxel: 15 # Drop things below this density per voxel # Padding pad_obstacles: 3 # Add this many units (voxel_size) to the area around obstacles @@ -115,7 +115,7 @@ motion_planner: # Parameters for frontier exploration using the motion planner. 
frontier: dilate_frontier_size: 5 # Used to shrink the frontier back from the edges of the world - dilate_obstacle_size: 6 # Used when selecting goals and computing what the "frontier" is + dilate_obstacle_size: 7 # Used when selecting goals and computing what the "frontier" is default_expand_frontier_size: 10 # margin along the frontier where final robot position can be # Distance away you search for frontier points min_dist: 0.1 From 8429e6e8ee7aa59dedff9b714473a962f0de4842 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 11:29:38 -0400 Subject: [PATCH 293/410] use history to go home at end --- src/stretch/agent/robot_agent.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index b2aa03ce5..563e73a1c 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1291,6 +1291,7 @@ def run_exploration( print("Go back to (0, 0, 0) to finish...") start = self.robot.get_base_pose() goal = np.array([0, 0, 0]) + self.planner.space.push_locations_to_stack(self.get_history(reversed=True)) res = self.planner.plan(start, goal) # if it fails, skip; else, execute a trajectory to this position if res.success: From fcba85a848bd887f3f72fd8b7847c308d1f7ee04 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 11:51:38 -0400 Subject: [PATCH 294/410] mprove tuning a little bit --- src/stretch/app/ai_pickup.py | 6 ++++++ src/stretch/config/default_planner.yaml | 6 ++++-- src/stretch/mapping/voxel/voxel.py | 4 ++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 73b9ca41e..1ae766534 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -88,6 +88,12 @@ is_flag=True, help="Set to use voice input", ) +@click.option( + "--radius", + default=3.0, + type=float, + help="Radius of the circle around initial position where the robot is allowed to go.", +) @click.option("--open_loop", "--open-loop", is_flag=True, help="Use open loop grasping") def main( robot_ip: str = "192.168.1.15", diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 305c9d040..3c3fabbce 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -35,7 +35,9 @@ detection: # Point cloud cleanup filters: - smooth_kernel_size: 3 + # Use a simple convolutional filter + # smooth_kernel_size: 3 + smooth_kernel_size: 4 # smooth_kernel_size: 0 use_median_filter: True median_filter_size: 5 @@ -115,7 +117,7 @@ motion_planner: # Parameters for frontier exploration using the motion planner. 
frontier: dilate_frontier_size: 5 # Used to shrink the frontier back from the edges of the world - dilate_obstacle_size: 7 # Used when selecting goals and computing what the "frontier" is + dilate_obstacle_size: 6 # Used when selecting goals and computing what the "frontier" is default_expand_frontier_size: 10 # margin along the frontier where final robot position can be # Distance away you search for frontier points min_dist: 0.1 diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 446f3c62e..54d563ce0 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -902,8 +902,8 @@ def get_2d_map(self, force_update=False, debug: bool = False) -> Tuple[np.ndarra )[0, 0].bool() # Obstacles just get dilated and eroded - obstacles = binary_erosion( - binary_dilation(obstacles.float().unsqueeze(0).unsqueeze(0), self.smooth_kernel), + obstacles = binary_dilation( + binary_erosion(obstacles.float().unsqueeze(0).unsqueeze(0), self.smooth_kernel), self.smooth_kernel, )[0, 0].bool() From 2321f658131d91dc99045a419221aa57bd34d00d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 15:53:22 -0400 Subject: [PATCH 295/410] update code and change server to stop --- .../agent/task/pickup/pickup_executor.py | 45 +++++++++++++++++-- src/stretch/app/ai_pickup.py | 45 ++----------------- src/stretch/config/default_planner.yaml | 4 +- .../stretch_ros2_bridge/remote/server.py | 2 + 4 files changed, 50 insertions(+), 46 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index a178d5db9..127d80247 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -11,14 +11,22 @@ import stretch.utils.logger as logger from stretch.agent.robot_agent import RobotAgent +from stretch.agent.task.pickup import PickupTask from stretch.core import AbstractRobotClient class PickupExecutor: """This class parses commands from the pickup llm bot and sends them to the robot.""" + _pickup_task_mode = "one_shot" + def __init__( - self, robot: AbstractRobotClient, agent: RobotAgent, dry_run: bool = False + self, + robot: AbstractRobotClient, + agent: RobotAgent, + match_method: str = "feature", + open_loop: bool = False, + dry_run: bool = False, ) -> None: """Initialize the executor. @@ -31,8 +39,32 @@ def __init__( self.agent = agent self.dry_run = dry_run + # Configuration + self._match_method = match_method + self._open_loop = open_loop + + def _pickup(self, target_object: str, target_receptacle: str) -> None: + + # After the robot has started... 
+ try: + pickup_task = PickupTask( + self.agent, + target_object=target_object, + target_receptacle=target_receptacle, + matching=self._match_method, + use_visual_servoing_for_grasp=not self._open_loop, + ) + task = pickup_task.get_task(add_rotate=True, mode=self._pickup_task_mode) + except Exception as e: + print(f"Error creating task: {e}") + self.robot.stop() + raise e + + # Execute the task + task.run() + def __call__(self, response: List[Tuple[str, str]]) -> None: - """Execute the list of commands.""" + """Execute the list of commands given by the LLM bot.""" i = 0 # Loop over every command we have been given @@ -42,8 +74,11 @@ def __call__(self, response: List[Tuple[str, str]]) -> None: command, args = response[i] logger.info(f"{i} {command} {args}") if command == "say": + # Use TTS to say the text + logger.info(f"[Pickup task] Saying: {args}") self.agent.robot_say(args) elif command == "pickup": + logger.info(f"[Pickup task] Pickup: {args}") target_object = args i += 1 next_command, next_args = response[i] @@ -51,7 +86,10 @@ def __call__(self, response: List[Tuple[str, str]]) -> None: i -= 1 logger.error("Pickup without place! Doing nothing.") else: - logger.info(f"{i} {command} {args}") + logger.info(f"{i} {next_command} {next_args}") + logger.info(f"[Pickup task] Place: {next_args}") + target_receptacle = next_args + self._pickup(target_object, target_receptacle) breakpoint() elif command == "place": logger.error("Place without pickup! Doing nothing.") @@ -68,6 +106,7 @@ def __call__(self, response: List[Tuple[str, str]]) -> None: elif command == "avert_gaze": self.agent.avert_gaze() elif command == "end": + logger.info("[Pickup task] Ending.") break else: logger.error(f"Skipping unknown command: {command}") diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 1ae766534..053870bcb 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -13,7 +13,7 @@ # import stretch.utils.logger as logger from stretch.agent.robot_agent import RobotAgent -from stretch.agent.task.pickup import PickupTask +from stretch.agent.task.pickup import PickupExecutor from stretch.agent.zmq_client import HomeRobotZmqClient from stretch.core import get_parameters from stretch.llms import LLMChatWrapper, PickupPromptBuilder, get_llm_choices, get_llm_client @@ -37,12 +37,6 @@ help="Method to match objects to pick up. 
Options: class, feature.", show_default=True, ) -@click.option( - "--mode", - default="one_shot", - help="Mode of operation for the robot.", - type=click.Choice(["one_shot", "all"]), -) @click.option( "--llm", default="gemma2b", @@ -105,7 +99,6 @@ def main( reset: bool = False, target_object: str = "", receptacle: str = "", - mode: str = "one_shot", match_method: str = "feature", llm: str = "gemma", use_llm: bool = False, @@ -136,6 +129,7 @@ def main( agent.move_closed_loop([0, 0, 0], max_time=60.0) prompt = PickupPromptBuilder() + executor = PickupExecutor(agent, robot, dry_run=False) # Get the LLM client llm_client = None @@ -157,39 +151,8 @@ def main( else: # Call the LLM client and parse llm_response = chat_wrapper.query() - target_object = prompt.get_object(llm_response) - receptacle = prompt.get_receptacle(llm_response) - say_this = prompt.get_say_this(llm_response) - # print("LLM response:", llm_response) - # print("Target object:", target_object) - # print("Receptacle:", receptacle) - # print("Say this:", say_this) - - if say_this is not None: - chat_wrapper.say(say_this) - agent.robot_say(say_this) - - if len(target_object) == 0 or len(receptacle) == 0: - # logger.error("You need to enter a target object and receptacle") - continue - - # After the robot has started... - try: - pickup_task = PickupTask( - agent, - target_object=target_object, - target_receptacle=receptacle, - matching=match_method, - use_visual_servoing_for_grasp=not open_loop, - ) - task = pickup_task.get_task(add_rotate=True, mode=mode) - except Exception as e: - print(f"Error creating task: {e}") - robot.stop() - raise e - - # Execute the task - task.run() + + executor(llm_response) if reset: # Send the robot home at the end! diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index e702e6f44..0e3b261f4 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -36,8 +36,8 @@ detection: # Point cloud cleanup filters: # Use a simple convolutional filter - # smooth_kernel_size: 3 - smooth_kernel_size: 4 + smooth_kernel_size: 3 + # smooth_kernel_size: 4 # smooth_kernel_size: 0 use_median_filter: True # median_filter_size: 4 diff --git a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py index 3342ceabf..f5629a0dd 100755 --- a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py +++ b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py @@ -113,10 +113,12 @@ def handle_action(self, action: Dict[str, Any]): """Handle an action from the client.""" if "posture" in action: if action["posture"] == "manipulation": + self.client.stop() self.client.switch_to_busy_mode() self.client.move_to_manip_posture() self.client.switch_to_manipulation_mode() elif action["posture"] == "navigation": + self.client.stop() self.client.switch_to_busy_mode() self.client.move_to_nav_posture() self.client.switch_to_navigation_mode() From 0f00aaa43000c6744cc5a349b00c98d81e729688 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 16:00:11 -0400 Subject: [PATCH 296/410] temporarily increase the sleeps since it is taking a long time to do mapping --- src/stretch/agent/robot_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 21d1767d4..3ea970094 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -440,7 +440,7 @@ def update( if 
move_head: self.robot.move_to_nav_posture() # Pause a bit first to make sure the robot is in the right posture - time.sleep(0.1) + time.sleep(0.5) num_steps = 5 else: num_steps = 1 @@ -458,7 +458,7 @@ def update( pan = -1 * i * np.pi / 4 print(f"[UPDATE] Head sweep {i} at {pan}, {tilt}") self.robot.head_to(pan, tilt, blocking=True) - time.sleep(0.1) + time.sleep(0.5) obs = self.robot.get_observation() t1 = timeit.default_timer() From 8241262435153a972f62f43e311c61a85968f4d2 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 16:02:09 -0400 Subject: [PATCH 297/410] commentary --- src/stretch_ros2_bridge/stretch_ros2_bridge/remote/api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/api.py b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/api.py index d6ffeee46..51c79cdd9 100644 --- a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/api.py +++ b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/api.py @@ -154,6 +154,7 @@ def reset(self): self.stop() def stop(self): + """Stop the robot""" self.nav.disable() self.manip.disable() self._base_control_mode = ControlMode.IDLE From 5ebcaa8ff307c4c77836845d9c8cff647efd926b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 16:06:28 -0400 Subject: [PATCH 298/410] updates to controller config --- src/stretch/config/control/noplan_velocity_hw.yaml | 3 ++- src/stretch/config/default_planner.yaml | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/stretch/config/control/noplan_velocity_hw.yaml b/src/stretch/config/control/noplan_velocity_hw.yaml index 8d8265662..4461dea16 100644 --- a/src/stretch/config/control/noplan_velocity_hw.yaml +++ b/src/stretch/config/control/noplan_velocity_hw.yaml @@ -28,7 +28,8 @@ min_ang_error_tol: 0.05 max_heading_ang: 0.7854 # 45 degrees # Maximum distance to goal at which the controller will opt for moving backwards over turning around -max_rev_dist: 0.15 +# This should be slightly larger than the distance it overshoots targets! +max_rev_dist: 0.25 # Time taken before we stop trying timeout: 30.0 diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 0e3b261f4..740e6a9f3 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -110,8 +110,8 @@ scene_graph: # Navigation space - used for motion planning and computing goals. 
motion_planner: - step_size: 0.04 - rotation_step_size: 0.05 + step_size: 0.05 + rotation_step_size: 0.1 simplify_plans: True shortcut_plans: True shortcut_iter: 100 From 3699fa7a5f8f4be287b2d21364c0aba94e82bd48 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 16:10:50 -0400 Subject: [PATCH 299/410] updates --- src/stretch/config/control/noplan_velocity_hw.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/stretch/config/control/noplan_velocity_hw.yaml b/src/stretch/config/control/noplan_velocity_hw.yaml index 4461dea16..419e3a97e 100644 --- a/src/stretch/config/control/noplan_velocity_hw.yaml +++ b/src/stretch/config/control/noplan_velocity_hw.yaml @@ -1,12 +1,12 @@ # Parameters of the trapezoidal velocity profile (acceleration & max speed) -#v_max: 0.2 # base.params["motion"]["default"]["vel_m"] -#w_max: 0.45 # (vel_m_max - vel_m_default) / wheel_separation_m +v_max: 0.2 # base.params["motion"]["default"]["vel_m"] +w_max: 0.45 # (vel_m_max - vel_m_default) / wheel_separation_m #v_max: 0.25 # base.params["motion"]["default"]["vel_m"] #w_max: 0.5 # (vel_m_max - vel_m_default) / wheel_separation_m #acc_lin: 0.2 # 0.5 * base.params["motion"]["max"]["accel_m"] #acc_ang: 0.4 # 0.5 * (accel_m_max - accel_m_default) / wheel_separation_m -v_max: 10.0 # base.params["motion"]["default"]["vel_m"] -w_max: 5.0 # (vel_m_max - vel_m_default) / wheel_separation_m +#v_max: 10.0 # base.params["motion"]["default"]["vel_m"] +#w_max: 5.0 # (vel_m_max - vel_m_default) / wheel_separation_m #w_max: 0.6 # (vel_m_max - vel_m_default) / wheel_separation_m acc_lin: 0.1 # 0.5 * base.params["motion"]["max"]["accel_m"] acc_ang: 0.2 # 0.5 * (accel_m_max - accel_m_default) / wheel_separation_m From b7d0c5927ff0f989d82aa0f4c6ae072b878d7756 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 16:40:03 -0400 Subject: [PATCH 300/410] update --- src/stretch/config/control/noplan_velocity_hw.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/stretch/config/control/noplan_velocity_hw.yaml b/src/stretch/config/control/noplan_velocity_hw.yaml index 419e3a97e..a041b3c8c 100644 --- a/src/stretch/config/control/noplan_velocity_hw.yaml +++ b/src/stretch/config/control/noplan_velocity_hw.yaml @@ -29,7 +29,8 @@ max_heading_ang: 0.7854 # 45 degrees # Maximum distance to goal at which the controller will opt for moving backwards over turning around # This should be slightly larger than the distance it overshoots targets! -max_rev_dist: 0.25 +# max_rev_dist: 0.25 +max_rev_dist: 0.5 # Time taken before we stop trying timeout: 30.0 From 3b50e5b441103021c2b51e7caf7e2c937bc6924d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 16:45:42 -0400 Subject: [PATCH 301/410] update --- src/stretch/agent/robot_agent.py | 9 +++++++++ src/stretch/agent/task/pickup/pickup_executor.py | 2 +- src/stretch/app/ai_pickup.py | 4 ++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 3ea970094..9866cac22 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1120,6 +1120,15 @@ def choose_best_goal_instance(self, goal: str, debug: bool = False) -> Instance: best_score = goal_score return best_instance + def set_allowed_radius(self, radius: float): + """Set the allowed radius for the robot to move to. This is used to limit the robot's movement, particularly when exploring. 
+ + Args: + radius(float): the radius in meters + """ + logger.info("[Agent] Setting allowed radius to", radius, "meters.") + logger.error("Setting allowed radius is not yet supported.") + def plan_to_frontier( self, start: np.ndarray, diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index 127d80247..73d76ef1d 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -11,7 +11,7 @@ import stretch.utils.logger as logger from stretch.agent.robot_agent import RobotAgent -from stretch.agent.task.pickup import PickupTask +from stretch.agent.task.pickup.pickup_task import PickupTask from stretch.core import AbstractRobotClient diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 053870bcb..cc7accd2c 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -104,6 +104,7 @@ def main( use_llm: bool = False, use_voice: bool = False, open_loop: bool = False, + radius: float = 3.0, ): """Set up the robot, create a task plan, and execute it.""" # Create robot @@ -128,6 +129,9 @@ def main( if reset: agent.move_closed_loop([0, 0, 0], max_time=60.0) + if radius is not None and radius > 0: + robot.set_allowed_radius(radius) + prompt = PickupPromptBuilder() executor = PickupExecutor(agent, robot, dry_run=False) From cc028d902c861423af5c3f5ab734c86719cac980 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 16:46:15 -0400 Subject: [PATCH 302/410] add radius code as a placeholder --- src/stretch/app/ai_pickup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index cc7accd2c..22fa836d3 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -130,7 +130,7 @@ def main( agent.move_closed_loop([0, 0, 0], max_time=60.0) if radius is not None and radius > 0: - robot.set_allowed_radius(radius) + agent.set_allowed_radius(radius) prompt = PickupPromptBuilder() executor = PickupExecutor(agent, robot, dry_run=False) From dd0ffa4273cfee34c93f2257811586efaea19906 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 16:48:44 -0400 Subject: [PATCH 303/410] Updates --- src/stretch/agent/task/pickup/pickup_executor.py | 9 +++++++++ src/stretch/app/ai_pickup.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index 73d76ef1d..f476ffc85 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -37,6 +37,13 @@ def __init__( """ self.robot = robot self.agent = agent + + # Do type checks + if not isinstance(self.robot, AbstractRobotClient): + raise TypeError(f"Expected AbstractRobotClient, got {type(self.robot)}") + if not isinstance(self.agent, RobotAgent): + raise TypeError(f"Expected RobotAgent, got {type(self.agent)}") + self.dry_run = dry_run # Configuration @@ -110,3 +117,5 @@ def __call__(self, response: List[Tuple[str, str]]) -> None: break else: logger.error(f"Skipping unknown command: {command}") + + i += 1 diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 22fa836d3..2ace0f74a 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -133,7 +133,7 @@ def main( agent.set_allowed_radius(radius) prompt = PickupPromptBuilder() - executor = PickupExecutor(agent, robot, dry_run=False) + executor = 
PickupExecutor(robot, agent, dry_run=False) # Get the LLM client llm_client = None From 16cff1b753be25eabcad8e8645d041422cce0254 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 16:56:12 -0400 Subject: [PATCH 304/410] update logger --- src/stretch/utils/logger.py | 55 +++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/src/stretch/utils/logger.py b/src/stretch/utils/logger.py index 703909d6b..7426a4225 100644 --- a/src/stretch/utils/logger.py +++ b/src/stretch/utils/logger.py @@ -10,29 +10,60 @@ from termcolor import colored -def _flatten(args: tuple) -> str: - """Flatten a tuple of arguments into a string joined by spaces. +class Logger: + def __init__(self, name: str, hide_info: bool = False) -> None: + self.name = name + self._hide_info = hide_info - Args: - args (tuple): Tuple of arguments to flatten. + def hide_info(self) -> None: + self._hide_info = True - Returns: - str: Flattened string. - """ - return " ".join([str(arg) for arg in args]) + def _flatten(self, args: tuple) -> str: + """Flatten a tuple of arguments into a string joined by spaces. + + Args: + args (tuple): Tuple of arguments to flatten. + + Returns: + str: Flattened string. + """ + text = " ".join([str(arg) for arg in args]) + if self.name is not None: + text = f"[{self.name}] {text}" + return text + + def error(self, *args) -> None: + text = self._flatten(args) + print(colored(text, "red")) + + def info(self, *args) -> None: + if not self._hide_info: + text = self._flatten(args) + print(colored(text, "white")) + + def warning(self, *args) -> None: + text = self._flatten(args) + print(colored(text, "yellow")) + + def alert(self, *args) -> None: + text = self._flatten(args) + print(colored(text, "green")) + + +_default_logger = Logger(None) def error(*args) -> None: - print(colored(_flatten(args), "red")) + _default_logger.error(*args) def info(*args) -> None: - print(*args) + _default_logger.info(*args) def warning(*args) -> None: - print(colored(_flatten(args), "yellow")) + _default_logger.warning(*args) def alert(*args) -> None: - print(colored(_flatten(args), "green")) + _default_logger.alert(*args) From a6d59699c5532abf4c50de122d0a293db0d2b59b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 17:11:03 -0400 Subject: [PATCH 305/410] update code to do emotes and stuff like that --- src/stretch/agent/robot_agent.py | 8 +++++ src/stretch/agent/task/emote/emote_task.py | 29 +++++++++++++++++-- .../agent/task/pickup/pickup_executor.py | 18 ++++++++---- src/stretch/app/emote_cycle.py | 20 ++++--------- src/stretch/llms/prompts/pickup_prompt.py | 15 +++++++++- 5 files changed, 65 insertions(+), 25 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 9866cac22..74964f973 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -881,6 +881,14 @@ def move_to_any_instance( return False + def move_to_manip_posture(self): + """Move the robot to manipulation posture.""" + self.robot.move_to_manip_posture() + + def move_to_nav_posture(self): + """Move the robot to navigation posture.""" + self.robot.move_to_nav_posture() + def print_found_classes(self, goal: Optional[str] = None): """Helper. 
print out what we have found according to detic.""" if self.semantic_sensor is None: diff --git a/src/stretch/agent/task/emote/emote_task.py b/src/stretch/agent/task/emote/emote_task.py index 0dfc110df..46672871a 100644 --- a/src/stretch/agent/task/emote/emote_task.py +++ b/src/stretch/agent/task/emote/emote_task.py @@ -7,6 +7,14 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. +from typing import Union + +from stretch.agent.operations import ( + AvertGazeOperation, + NodHeadOperation, + ShakeHeadOperation, + WaveOperation, +) from stretch.agent.robot_agent import RobotAgent from stretch.core.task import Operation, Task @@ -17,14 +25,29 @@ class EmoteTask: """ def __init__(self, agent: RobotAgent): - super().__init__(agent) + super().__init__() # random stuff that has to be synced... + self.agent = agent self.navigation_space = agent.space self.parameters = agent.parameters self.robot = agent.robot - def get_task(self, emote_operation: Operation) -> Task: + def get_task(self, emote_operation: Union[Operation, str]) -> Task: task = Task() - task.add_operation(emote_operation) + if isinstance(emote_operation, Operation): + task.add_operation(emote_operation) + elif isinstance(emote_operation, str): + if emote_operation == "nod" or emote_operation == "nod_head": + task.add_operation(NodHeadOperation("emote", self.agent)) + elif emote_operation == "shake" or emote_operation == "shake_head": + task.add_operation(ShakeHeadOperation("emote", self.agent)) + elif emote_operation == "wave": + task.add_operation(WaveOperation("emote", self.agent)) + elif emote_operation == "avert" or emote_operation == "avert_gaze": + task.add_operation(AvertGazeOperation("emote", self.agent)) + else: + raise ValueError(f"Invalid emote operation: {emote_operation}") + else: + raise TypeError(f"Expected Operation or str, got {type(emote_operation)}") return task diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index f476ffc85..33df2908f 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -9,10 +9,15 @@ from typing import List, Tuple -import stretch.utils.logger as logger from stretch.agent.robot_agent import RobotAgent +from stretch.agent.task.emote import EmoteTask from stretch.agent.task.pickup.pickup_task import PickupTask from stretch.core import AbstractRobotClient +from stretch.utils.logger import Logger + +logger = Logger(__name__) +# Default to hiding info messages +# logger.hide_info() class PickupExecutor: @@ -45,6 +50,7 @@ def __init__( raise TypeError(f"Expected RobotAgent, got {type(self.agent)}") self.dry_run = dry_run + self.emote_task = EmoteTask(self.agent) # Configuration self._match_method = match_method @@ -97,21 +103,21 @@ def __call__(self, response: List[Tuple[str, str]]) -> None: logger.info(f"[Pickup task] Place: {next_args}") target_receptacle = next_args self._pickup(target_object, target_receptacle) - breakpoint() elif command == "place": logger.error("Place without pickup! 
Doing nothing.") elif command == "wave": - self.agent.wave() + self.agent.move_to_manip_posture() + self.emote_task.get_task("wave").run() elif command == "go_home": self.agent.go_home() elif command == "explore": self.agent.explore() elif command == "nod_head": - self.agent.nod_head() + self.emote_task.get_task("nod_head").run() elif command == "shake_head": - self.agent.shake_head() + self.emote_task.get_task("shake_head").run() elif command == "avert_gaze": - self.agent.avert_gaze() + self.emote_task.get_task("avert_gaze").run() elif command == "end": logger.info("[Pickup task] Ending.") break diff --git a/src/stretch/app/emote_cycle.py b/src/stretch/app/emote_cycle.py index e09a141af..c646fabc0 100644 --- a/src/stretch/app/emote_cycle.py +++ b/src/stretch/app/emote_cycle.py @@ -9,12 +9,7 @@ # Some code may be adapted from other open-source works with their respective licenses. Original # license information maybe found below, if so. -from stretch.agent.operations import ( - AvertGazeOperation, - NodHeadOperation, - ShakeHeadOperation, - WaveOperation, -) + from stretch.agent.robot_agent import RobotAgent from stretch.agent.task.emote import EmoteTask from stretch.agent.zmq_client import HomeRobotZmqClient @@ -41,21 +36,16 @@ def main( # create emote task emote_task = EmoteTask(demo) - task = emote_task.get_task(NodHeadOperation("emote", demo)) - - # run task + task = emote_task.get_task("nod") task.run() - task = emote_task.get_task(ShakeHeadOperation("emote", demo)) - + task = emote_task.get_task("shake_head") task.run() - task = emote_task.get_task(WaveOperation("emote", demo)) - + task = emote_task.get_task("wave") task.run() - task = emote_task.get_task(AvertGazeOperation("emote", demo)) - + task = emote_task.get_task("avert_gaze") task.run() diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index a7ed21241..49dffa128 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -45,7 +45,20 @@ wave() end() -You will never say anything other than pickup(), place(), and say(). Remember to be friendly, helpful, and concise. You will always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). +You can answer questions: + +input: "What is your name?" +output: +say("My name is Stretch.") +end() + +input: "Is the sky blue?" +output: +say("Yes, the sky is blue.") +nod_head() +end() + +Remember to be friendly, helpful, and concise. You will always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). You will be polite when using the say() function. (e.g., "please", "thank you") and use complete sentences. You can answer simple commonsense questions or respond. If you do not understand how to do something using these three actions, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. 
From a0084834f89d8f3645f2bc93789e4c578b9e8336 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 3 Oct 2024 17:13:29 -0400 Subject: [PATCH 306/410] doing some simple tasks --- .../agent/task/pickup/pickup_executor.py | 1 + src/stretch/llms/prompts/pickup_prompt.py | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index 33df2908f..d075bfa72 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -108,6 +108,7 @@ def __call__(self, response: List[Tuple[str, str]]) -> None: elif command == "wave": self.agent.move_to_manip_posture() self.emote_task.get_task("wave").run() + self.agent.move_to_manip_posture() elif command == "go_home": self.agent.go_home() elif command == "explore": diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 49dffa128..7d9bb4d55 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -50,6 +50,7 @@ input: "What is your name?" output: say("My name is Stretch.") +wave() end() input: "Is the sky blue?" @@ -58,9 +59,25 @@ nod_head() end() +You can also say you don't know: + +input: "What is the meaning of life?" +output: +say("I don't know.") +shake_head() +end() + +You can answer simple questions: + +input: "What is 2 + 2?" +output: +say("2 + 2 is 4.") +end() + + Remember to be friendly, helpful, and concise. You will always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). -You will be polite when using the say() function. (e.g., "please", "thank you") and use complete sentences. You can answer simple commonsense questions or respond. If you do not understand how to do something using these three actions, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. +You will be polite when using the say() function. (e.g., "please", "thank you") and use complete sentences. You can answer simple commonsense questions or respond. If you do not understand how to do something, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. 
For example: From 4617a10e4fe26a850aa1a73513eb922d5240aeff Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 09:56:17 -0400 Subject: [PATCH 307/410] Update file to allow loading a previously-explored map --- src/stretch/app/ai_pickup.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 2ace0f74a..61403b157 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -70,6 +70,17 @@ default="", help="Name of the receptacle to place the object in", ) +@click.option( + "--input-path", + "-i", + "--input_file", + "--input-file", + "--input", + "--input_path", + type=click.Path(), + default="", + help="Path to a saved datafile from a previous exploration of the world.", +) @click.option( "--use_llm", "--use-llm", @@ -105,6 +116,7 @@ def main( use_voice: bool = False, open_loop: bool = False, radius: float = 3.0, + input_path: str = "", ): """Set up the robot, create a task plan, and execute it.""" # Create robot @@ -132,6 +144,13 @@ def main( if radius is not None and radius > 0: agent.set_allowed_radius(radius) + # Load a PKL file from a previous run and process it + # This will use ICP to match current observations to the previous ones + # ANd then update the map with the new observations + if input_path is not None and len(input_path) > 0: + agent.load_map(input_path) + + # Create the prompt we will use to control the robot prompt = PickupPromptBuilder() executor = PickupExecutor(robot, agent, dry_run=False) From 5695f3c97aeb0156ceefb3532d9bb7547b33d472 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 10:01:39 -0400 Subject: [PATCH 308/410] Updating AI agent code to parse through the dialogue tree --- .../agent/task/pickup/pickup_executor.py | 17 +++++++++++++++-- src/stretch/app/ai_pickup.py | 5 +++-- src/stretch/llms/prompts/pickup_prompt.py | 18 ++++++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index d075bfa72..f3c131f34 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -76,8 +76,15 @@ def _pickup(self, target_object: str, target_receptacle: str) -> None: # Execute the task task.run() - def __call__(self, response: List[Tuple[str, str]]) -> None: - """Execute the list of commands given by the LLM bot.""" + def __call__(self, response: List[Tuple[str, str]]) -> bool: + """Execute the list of commands given by the LLM bot. + + Args: + response: A list of tuples, where the first element is the command and the second is the argument. + + Returns: + True if we should keep going, False if we should stop. + """ i = 0 # Loop over every command we have been given @@ -119,6 +126,10 @@ def __call__(self, response: List[Tuple[str, str]]) -> None: self.emote_task.get_task("shake_head").run() elif command == "avert_gaze": self.emote_task.get_task("avert_gaze").run() + elif command == "quit": + logger.info("[Pickup task] Quitting.") + self.robot.stop() + return False elif command == "end": logger.info("[Pickup task] Ending.") break @@ -126,3 +137,5 @@ def __call__(self, response: List[Tuple[str, str]]) -> None: logger.error(f"Skipping unknown command: {command}") i += 1 + # If we did not explicitly receive a quit command, we are not yet done. 
+ return True diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 61403b157..22ba66884 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -161,7 +161,8 @@ def main( chat_wrapper = LLMChatWrapper(llm_client, prompt=prompt, voice=use_voice) # Parse things and listen to the user - while robot.running: + ok = True + while robot.running and ok: agent.reset() say_this = None @@ -175,7 +176,7 @@ def main( # Call the LLM client and parse llm_response = chat_wrapper.query() - executor(llm_response) + ok = executor(llm_response) if reset: # Send the robot home at the end! diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 7d9bb4d55..fcb940109 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -27,6 +27,7 @@ - shake_head() # shake your head - avert_gaze() # avert your gaze - go_home() # navigate back to where you started +- quit() # end the conversation These functions and their arguments are the only things you will return - no comments - and they are your only way to interact with the world. For example: @@ -45,6 +46,17 @@ wave() end() +input: "Goodbye!" +output: +say("Goodbye!") +wave() +quit() + +input: "Stop." +output: +say("I am stopping.") +quit() + You can answer questions: input: "What is your name?" @@ -150,6 +162,8 @@ def parse_response(self, response: str): commands.append(line) elif line.startswith("avert_gaze()"): commands.append(line) + elif line.startswith("quit()"): + commands.append(line) elif line.startswith("end()"): # Stop parsing if we see the end command break @@ -175,6 +189,10 @@ def parse_response(self, response: str): parsed_commands.append(("shake_head", "")) elif command.startswith("avert_gaze()"): parsed_commands.append(("avert_gaze", "")) + elif command.startswith("quit()"): + # Quit actually shuts down the robot. + parsed_commands.append(("quit", "")) + break elif command.startswith("end()"): # Stop parsing if we see the end command # This really shouldn't happen, but just in case From a756a6e9a2851780e62a33237a8d741ec4a82b6c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 10:22:14 -0400 Subject: [PATCH 309/410] updating prompt to make sure that we can handle some more actions --- src/stretch/llms/prompts/pickup_prompt.py | 55 ++++++++++++----------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index fcb940109..7a29ffc69 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -11,25 +11,22 @@ from stretch.llms.base import AbstractPromptBuilder -simple_stretch_prompt = """You are a friendly, helpful robot named Stretch. You are always helpful, and answer questions concisely. You will answer questions very concisely. +simple_stretch_prompt = """You are a friendly, helpful robot named Stretch. You are always helpful, and answer questions concisely. You will never harm a human or suggest harm. 
-Restrictions: - - You will never harm a person or suggest harm - - You cannot go up or down stairs - -When prompted, you will respond using the three actions: +When prompted, you will respond using these actions: - pickup(object_name) # object_name is the name of the object to pick up -- explore(5) # explore the environment for a certain number of steps +- explore(int) # explore the environment for a certain number of steps - place(location_name) # location_name is the name of the receptacle to place object in - say(text) # say something to the user - wave() # wave at a person - nod_head() # nod your head - shake_head() # shake your head - avert_gaze() # avert your gaze +- find(object_name) # find the object or location by exploring - go_home() # navigate back to where you started - quit() # end the conversation -These functions and their arguments are the only things you will return - no comments - and they are your only way to interact with the world. For example: +These functions and their arguments are the only things you will say, and they are your only way to interact with the world. Wave if a person is being nice to you or greeting you. You should always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). If you do not understand how to do something, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. input: "Put the red apple in the cardboard box" output: @@ -38,8 +35,6 @@ place(cardboard box) end() -You should be friendly. Wave if a person is being nice to you or greeting you. For example: - input: "Hi!" output: say("Hello!") @@ -52,13 +47,6 @@ wave() quit() -input: "Stop." -output: -say("I am stopping.") -quit() - -You can answer questions: - input: "What is your name?" output: say("My name is Stretch.") @@ -71,33 +59,39 @@ nod_head() end() -You can also say you don't know: - input: "What is the meaning of life?" output: say("I don't know.") shake_head() end() -You can answer simple questions: - input: "What is 2 + 2?" output: say("2 + 2 is 4.") end() +NEVER return pickup() without a corresponding place() command. If you cannot determine the location, say so instead of providing a place() command. If you cannot determine the object, say so instead of providing a pickup() command. If you cannot determine either, say so instead of providing either command. -Remember to be friendly, helpful, and concise. You will always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). +You may only use each action once. No duplicate actions. -You will be polite when using the say() function. (e.g., "please", "thank you") and use complete sentences. You can answer simple commonsense questions or respond. If you do not understand how to do something, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. - -For example: +for example: input: "can you put the shoe away?" output: say("Where should I put the shoe?") end() +input: "Find the remote." +output: +say("Can you describe the remote in more detail?") +end() + +input: "Find the black television remote control." +output: +say("I am looking for the black television remote control.") +find(black television remote control) +end() + input: "Can you put the shoe in the closet?" 
output: say("I am picking up the shoe and putting it in the closet.") @@ -105,6 +99,11 @@ place(closet) end() +input: "Get me a glass of water." +output: +say("Where would I put the glass of water?") +end() + input: "Put the pen in the pencil holder" output: say("I am picking up the pen and putting it in the pencil holder.") @@ -123,8 +122,6 @@ say("My name is Stretch.") end() -Only use each function once per input. Do not hallucinate. - input: """ @@ -164,6 +161,8 @@ def parse_response(self, response: str): commands.append(line) elif line.startswith("quit()"): commands.append(line) + elif line.startswith("find("): + commands.append(line) elif line.startswith("end()"): # Stop parsing if we see the end command break @@ -189,6 +188,8 @@ def parse_response(self, response: str): parsed_commands.append(("shake_head", "")) elif command.startswith("avert_gaze()"): parsed_commands.append(("avert_gaze", "")) + elif command.startswith("find("): + parsed_commands.append(("find", command[5:-1])) elif command.startswith("quit()"): # Quit actually shuts down the robot. parsed_commands.append(("quit", "")) From cb451bf29a4ef997facd2342879314fefdc0d369 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 10:29:12 -0400 Subject: [PATCH 310/410] update --- src/stretch/llms/prompts/pickup_prompt.py | 28 ++++++++--------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 7a29ffc69..8777218ac 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -47,6 +47,12 @@ wave() quit() +input: "Thank you!" +output: +say("You're welcome!") +wave() +end() + input: "What is your name?" output: say("My name is Stretch.") @@ -81,15 +87,12 @@ say("Where should I put the shoe?") end() -input: "Find the remote." -output: -say("Can you describe the remote in more detail?") -end() +If a human uses obvious shorthand for an object, unroll it when calling functions: -input: "Find the black television remote control." +input: "Find the remote." output: -say("I am looking for the black television remote control.") -find(black television remote control) +say("I am looking for the tv remote control.") +find(tv remote control) end() input: "Can you put the shoe in the closet?" @@ -111,17 +114,6 @@ place(pencil holder) end() -input: "Thank you!" -output: -say("You're welcome!") -wave() -end() - -input: "What is your name?" -output: -say("My name is Stretch.") -end() - input: """ From dfaefa9cbb2e9e2d70cb4f44b9a364c090c4c979 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 10:29:37 -0400 Subject: [PATCH 311/410] cleaning up prompting and trying to make the prompt a bit shorter. --- src/stretch/llms/prompts/pickup_prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 8777218ac..4c1906db8 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -26,7 +26,7 @@ - go_home() # navigate back to where you started - quit() # end the conversation -These functions and their arguments are the only things you will say, and they are your only way to interact with the world. Wave if a person is being nice to you or greeting you. You should always explain what you are going to do before you do it. 
If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). If you do not understand how to do something, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. +These functions and their arguments are the only things you will say. Wave if a person is being nice to you or greeting you. You should always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). If you do not understand how to do something, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. input: "Put the red apple in the cardboard box" output: From b28f4dfbff87c07a89c833fff6b5ac71481ac3d6 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 10:30:29 -0400 Subject: [PATCH 312/410] shorten text --- src/stretch/llms/prompts/pickup_prompt.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 4c1906db8..28f17ae81 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -30,7 +30,7 @@ input: "Put the red apple in the cardboard box" output: -say("I am picking up the red apple in the cardboard box") +say("Picking up the red apple in the cardboard box") pickup(red apple) place(cardboard box) end() @@ -91,13 +91,13 @@ input: "Find the remote." output: -say("I am looking for the tv remote control.") +say("Looking for the tv remote control.") find(tv remote control) end() input: "Can you put the shoe in the closet?" output: -say("I am picking up the shoe and putting it in the closet.") +say("Picking up the shoe and putting it in the closet.") pickup(shoe) place(closet) end() @@ -109,7 +109,7 @@ input: "Put the pen in the pencil holder" output: -say("I am picking up the pen and putting it in the pencil holder.") +say("Picking up the pen and putting it in the pencil holder.") pickup(pen) place(pencil holder) end() From ba0efd4b4d9b538baef9235245cc3afc2d6953b5 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 10:31:40 -0400 Subject: [PATCH 313/410] Update --- src/stretch/agent/task/pickup/pickup_executor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index f3c131f34..c26602f44 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -87,6 +87,11 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: """ i = 0 + if len(response) == 0: + logger.error("No commands to execute!") + self.agent.robot_say("I'm sorry, I didn't understand that.") + return True + # Loop over every command we have been given # Pull out pickup and place as a single arg if they are in a row # Else, execute things as they come From 5c27d42e20c3731b4747ab425944f02aee995865 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 10:38:16 -0400 Subject: [PATCH 314/410] Reverting prompt --- src/stretch/llms/prompts/pickup_prompt.py | 36 ++++++++++++++--------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 28f17ae81..7a29ffc69 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ 
b/src/stretch/llms/prompts/pickup_prompt.py @@ -26,11 +26,11 @@ - go_home() # navigate back to where you started - quit() # end the conversation -These functions and their arguments are the only things you will say. Wave if a person is being nice to you or greeting you. You should always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). If you do not understand how to do something, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. +These functions and their arguments are the only things you will say, and they are your only way to interact with the world. Wave if a person is being nice to you or greeting you. You should always explain what you are going to do before you do it. If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). If you do not understand how to do something, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. input: "Put the red apple in the cardboard box" output: -say("Picking up the red apple in the cardboard box") +say("I am picking up the red apple in the cardboard box") pickup(red apple) place(cardboard box) end() @@ -47,12 +47,6 @@ wave() quit() -input: "Thank you!" -output: -say("You're welcome!") -wave() -end() - input: "What is your name?" output: say("My name is Stretch.") @@ -87,17 +81,20 @@ say("Where should I put the shoe?") end() -If a human uses obvious shorthand for an object, unroll it when calling functions: - input: "Find the remote." output: -say("Looking for the tv remote control.") -find(tv remote control) +say("Can you describe the remote in more detail?") +end() + +input: "Find the black television remote control." +output: +say("I am looking for the black television remote control.") +find(black television remote control) end() input: "Can you put the shoe in the closet?" output: -say("Picking up the shoe and putting it in the closet.") +say("I am picking up the shoe and putting it in the closet.") pickup(shoe) place(closet) end() @@ -109,11 +106,22 @@ input: "Put the pen in the pencil holder" output: -say("Picking up the pen and putting it in the pencil holder.") +say("I am picking up the pen and putting it in the pencil holder.") pickup(pen) place(pencil holder) end() +input: "Thank you!" +output: +say("You're welcome!") +wave() +end() + +input: "What is your name?" +output: +say("My name is Stretch.") +end() + input: """ From 26f7e36d2795e2ec337d61f3c8d7c9a074a64fd0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 10:48:49 -0400 Subject: [PATCH 315/410] Prompting tweaks --- src/stretch/llms/prompts/pickup_prompt.py | 50 ++++++++++++----------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 7a29ffc69..b698c74b5 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -26,7 +26,7 @@ - go_home() # navigate back to where you started - quit() # end the conversation -These functions and their arguments are the only things you will say, and they are your only way to interact with the world. Wave if a person is being nice to you or greeting you. You should always explain what you are going to do before you do it. 
If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). If you do not understand how to do something, say you do not know. Do not hallucinate. You will always say something to acknowledge the user. +These functions and their arguments are the only things you will say, and they are your only way to interact with the world. Wave if a person is being nice to you or greeting you. You should always explain what you are going to do before you do it. input: "Put the red apple in the cardboard box" output: @@ -47,12 +47,6 @@ wave() quit() -input: "What is your name?" -output: -say("My name is Stretch.") -wave() -end() - input: "Is the sky blue?" output: say("Yes, the sky is blue.") @@ -70,35 +64,42 @@ say("2 + 2 is 4.") end() -NEVER return pickup() without a corresponding place() command. If you cannot determine the location, say so instead of providing a place() command. If you cannot determine the object, say so instead of providing a pickup() command. If you cannot determine either, say so instead of providing either command. - -You may only use each action once. No duplicate actions. - -for example: - -input: "can you put the shoe away?" +input: "Can you put the shoe away?" output: say("Where should I put the shoe?") end() -input: "Find the remote." +input: "Find the remote control." output: -say("Can you describe the remote in more detail?") +say("Looking for the remote control.") +find(remote control) end() -input: "Find the black television remote control." +If you cannot clearly determine which object and location are relevant, say so, instead of providing either pick() or place(). If you do not understand how to do something, say you do not know. Do not hallucinate. + +Example: + +input: "Can you put that away?" output: -say("I am looking for the black television remote control.") -find(black television remote control) +say("I'm not sure what you want me to put away, and where to put it.") end() -input: "Can you put the shoe in the closet?" +nput: "Can you put the shoe in the closet?" output: say("I am picking up the shoe and putting it in the closet.") pickup(shoe) place(closet) end() +Never call a function with an ambiguous argument, like "this", "that", "something", "somewhere", or "unknown." Instead, ask for clarification. + +Example: + +input: "Can you put the red bowl away?" +output: +say("Where should I put the red bowl?") +end() + input: "Get me a glass of water." output: say("Where would I put the glass of water?") @@ -117,10 +118,11 @@ wave() end() -input: "What is your name?" -output: -say("My name is Stretch.") -end() +Never return pickup() without a corresponding place() command. You may only use each action once. No duplicate actions. + +The arguments to pickup(), place(), and find() must be clear and specific. Do not use pronouns or ambiguous language. If somethng is unclear, ask for clarification. + +Starting dialogue now. 
input: """ From d3b37552cbe88a4bda3517bf56f1d3f0d59e7d43 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 11:24:39 -0400 Subject: [PATCH 316/410] add a new failure case for the dialogue agent --- src/stretch/llms/prompts/pickup_prompt.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index b698c74b5..80e890cc3 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -118,6 +118,18 @@ wave() end() +You will only ever use find(), pickup(), or place(), on real, reachable objects that might be in a home. If this is not true, say so. For example + +input: "Find Seattle." +output: +say("I cannot do that.") +end() + +input: "Pick up the moon." +output: +say("I cannot do that.") +end() + Never return pickup() without a corresponding place() command. You may only use each action once. No duplicate actions. The arguments to pickup(), place(), and find() must be clear and specific. Do not use pronouns or ambiguous language. If somethng is unclear, ask for clarification. From cefdc040e8dd07591044adf4c265e2c60ddfd79e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 11:29:07 -0400 Subject: [PATCH 317/410] done runnning robot agent --- src/stretch/agent/task/pickup/pickup_executor.py | 2 +- src/stretch/app/ai_pickup.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index c26602f44..3e1399786 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -87,7 +87,7 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: """ i = 0 - if len(response) == 0: + if response is None or len(response) == 0: logger.error("No commands to execute!") self.agent.robot_say("I'm sorry, I didn't understand that.") return True diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 22ba66884..13d6faa60 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -182,8 +182,6 @@ def main( # Send the robot home at the end! agent.go_home() - break - # At the end, disable everything robot.stop() From bbb217f9087a5a88ee61f417d5a7e7a40432bd41 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 11:54:29 -0400 Subject: [PATCH 318/410] disable double-send --- src/stretch/agent/zmq_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 166607ca0..033a1dd5c 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -1082,10 +1082,10 @@ def send_action( # TODO: fix all of this - why do we need to do this? 
# print("SENDING THIS ACTION:", next_action) self.send_socket.send_pyobj(next_action) - time.sleep(0.01) + # time.sleep(0.01) # print("SENDING THIS ACTION:", next_action) - self.send_socket.send_pyobj(next_action) + # self.send_socket.send_pyobj(next_action) # For tracking goal if "xyt" in next_action: From fe088bdebd8f681a20bff6a324b50a8cc7e920c7 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 11:55:26 -0400 Subject: [PATCH 319/410] cleaner handling for resending all the actions --- src/stretch/agent/zmq_client.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 033a1dd5c..39f2a7ba4 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -86,6 +86,7 @@ def __init__( manip_mode_controlled_joints: Optional[List[str]] = None, start_immediately: bool = True, enable_rerun_server: bool = True, + resend_all_actions: bool = False, ): """ Create a client to communicate with the robot over ZMQ. @@ -111,6 +112,9 @@ def __init__( self._seq_id = 0 # Number of messages we received self._started = False + # Resend all actions immediately - helps if we are losing packets or something? + self._resend_all_actions = resend_all_actions + if parameters is None: parameters = get_parameters("default_planner.yaml") self._parameters = parameters @@ -1082,10 +1086,12 @@ def send_action( # TODO: fix all of this - why do we need to do this? # print("SENDING THIS ACTION:", next_action) self.send_socket.send_pyobj(next_action) - # time.sleep(0.01) - # print("SENDING THIS ACTION:", next_action) - # self.send_socket.send_pyobj(next_action) + if self._resend_all_actions: + time.sleep(0.01) + + print("SENDING THIS ACTION:", next_action) + self.send_socket.send_pyobj(next_action) # For tracking goal if "xyt" in next_action: From a8a3608f28e1af56a288bb6dff823c6135f1aefe Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 12:01:27 -0400 Subject: [PATCH 320/410] Cleaning up the language processing, so that tts stops saying "quote" on the robot side --- src/stretch/agent/robot_agent.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 74964f973..860d471a3 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1454,7 +1454,8 @@ def say(self, msg: str): self.tts.say_async(msg) def robot_say(self, msg: str): - """Hae the robot say something out loud. This will send the text over to the robot from wherever the client is running.""" + """Have the robot say something out loud. 
This will send the text over to the robot from wherever the client is running.""" + msg = msg.strip('"' + "'") self.robot.say(msg) def ask(self, msg: str) -> str: From bb5f22e19a8b89b8b960e8e752086a8ac66f57de Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 12:19:38 -0400 Subject: [PATCH 321/410] update sayable --- src/stretch/agent/operations/search_for_object.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/stretch/agent/operations/search_for_object.py b/src/stretch/agent/operations/search_for_object.py index eb4add43e..a0087e68c 100644 --- a/src/stretch/agent/operations/search_for_object.py +++ b/src/stretch/agent/operations/search_for_object.py @@ -35,6 +35,10 @@ class ManagedSearchOperation(ManagedOperation): def object_class(self) -> str: return self._object_class + @property + def sayable_object_class(self) -> str: + return self._object_class.replace("_", " ") + def __init__(self, *args, match_method="feature", **kwargs): super().__init__(*args, **kwargs) self.match_method = match_method @@ -164,6 +168,7 @@ def run(self) -> None: self.agent.go_home() else: self.cheer(f"Found a receptacle!") + self.agent.robot_say(f"I found a {self.sayable_object_class} that I can reach!") self.set_status(status.SUCCEEDED) view = self.agent.current_receptacle.get_best_view() image = Image.fromarray(view.get_image()) @@ -294,6 +299,7 @@ def run(self) -> None: self.agent.reset_object_plans() else: self.cheer(f"Found object of {self.object_class}!") + self.agent.robot_say(f"I found a {self.sayable_object_class} that I can reach!") view = self.agent.current_object.get_best_view() image = Image.fromarray(view.get_image()) image.save("object.png") From e198c759309ed77316b103f354cce5bb25fa162c Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 12:21:02 -0400 Subject: [PATCH 322/410] have the robot talk more --- src/stretch/agent/operations/grasp_object.py | 1 + src/stretch/app/ai_pickup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index df56c4dfe..2b68a5f60 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -302,6 +302,7 @@ def get_target_mask( def _grasp(self) -> bool: """Helper function to close gripper around object.""" self.cheer("Grasping object!") + self.agent.robot_say("Grasping the object!") if not self.open_loop: joint_state = self.robot.get_joint_positions() diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 13d6faa60..0d99e023d 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -163,7 +163,7 @@ def main( # Parse things and listen to the user ok = True while robot.running and ok: - agent.reset() + # agent.reset() say_this = None if llm_client is None: From b2bbbeca3025476c05cf1d285fb8b4cc71d8db21 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 12:23:11 -0400 Subject: [PATCH 323/410] make some more little changes --- src/stretch/agent/operations/grasp_object.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 2b68a5f60..885805428 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -40,6 +40,7 @@ class GraspObjectOperation(ManagedOperation): lift_distance: float = 0.2 servo_to_grasp: bool = False _success: bool = False 
+ talk: bool = True # Task information match_method: str = "class" @@ -119,6 +120,7 @@ def configure( show_point_cloud: bool = False, reset_observation: bool = False, grasp_loose: bool = False, + talk: bool = True, ): """Configure the operation with the given keyword arguments. @@ -129,6 +131,7 @@ def configure( show_point_cloud (bool, optional): Show the point cloud. Defaults to False. reset_observation (bool, optional): Reset the observation. Defaults to False. grasp_loose (bool, optional): Grasp loosely. Useful for grasping some objects like cups. Defaults to False. + talk (bool, optional): Talk as the robot tries to grab stuff. Defaults to True. """ self.target_object = target_object self.show_object_to_grasp = show_object_to_grasp @@ -137,6 +140,7 @@ def configure( self.show_point_cloud = show_point_cloud self.reset_observation = reset_observation self.grasp_loose = grasp_loose + self.talk = talk def _debug_show_point_cloud(self, servo: Observations, current_xyz: np.ndarray) -> None: """Show the point cloud for debugging purposes. @@ -302,7 +306,8 @@ def get_target_mask( def _grasp(self) -> bool: """Helper function to close gripper around object.""" self.cheer("Grasping object!") - self.agent.robot_say("Grasping the object!") + if self.talk: + self.agent.robot_say("Grasping the object!") if not self.open_loop: joint_state = self.robot.get_joint_positions() @@ -675,6 +680,13 @@ def run(self) -> None: env_id=0, global_instance_id=self.agent.current_object.global_id ) + # Delete the object + self.agent.voxel_map.delete_instance( + self.agent.current_object.global_id, assume_explored=False + ) + if self.talk: + self.agent.robot_say("I think I grasped the object.") + def pregrasp_open_loop(self, object_xyz: np.ndarray, distance_from_object: float = 0.25): """Move to a pregrasp position in an open loop manner. 
From dd908b8a124c9732567a4be1d2626f428788886d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 12:45:17 -0400 Subject: [PATCH 324/410] update logger for zmq client --- src/stretch/agent/zmq_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 39f2a7ba4..282ac4168 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -23,7 +23,6 @@ import stretch.motion.constants as constants import stretch.motion.conversions as conversions import stretch.utils.compression as compression -import stretch.utils.logger as logger from stretch.core.interfaces import ContinuousNavigationAction, Observations from stretch.core.parameters import Parameters, get_parameters from stretch.core.robot import AbstractRobotClient @@ -31,9 +30,12 @@ from stretch.motion.kinematics import HelloStretchIdx, HelloStretchKinematics from stretch.utils.geometry import angle_difference, posquat2sophus, sophus2posquat from stretch.utils.image import Camera +from stretch.utils.logger import Logger from stretch.utils.memory import lookup_address from stretch.utils.point_cloud import show_point_cloud +logger = Logger(__name__) + # TODO: debug code - remove later if necessary # import faulthandler # faulthandler.enable() From 1f58c6caaf8344f99c2704598397c0988c368b15 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 12:47:00 -0400 Subject: [PATCH 325/410] head motions faster --- src/stretch/agent/robot_agent.py | 4 ++-- src/stretch/agent/zmq_client.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 860d471a3..114c91343 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -440,7 +440,7 @@ def update( if move_head: self.robot.move_to_nav_posture() # Pause a bit first to make sure the robot is in the right posture - time.sleep(0.5) + time.sleep(0.25) num_steps = 5 else: num_steps = 1 @@ -458,7 +458,7 @@ def update( pan = -1 * i * np.pi / 4 print(f"[UPDATE] Head sweep {i} at {pan}, {tilt}") self.robot.head_to(pan, tilt, blocking=True) - time.sleep(0.5) + time.sleep(0.1) obs = self.robot.get_observation() t1 = timeit.default_timer() diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 282ac4168..3ccd80e1d 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -542,9 +542,9 @@ def arm_to( ): return True elif t1 - t0 > min_time and np.linalg.norm(joint_velocities) < 0.01: - print("Arm not moving, we are done") - print("Arm joint velocities", joint_velocities) - print(t1 - t0) + logger.info("Arm not moving, we are done") + logger.info("Arm joint velocities", joint_velocities) + logger.info(t1 - t0) # Arm stopped moving but did not reach goal return False else: @@ -555,7 +555,7 @@ def arm_to( time.sleep(0.01) if t1 - t0 > timeout: - print("[ZMQ CLIENT] Timeout waiting for arm to move") + logger.error("Timeout waiting for arm to move") break steps += 1 return False From e868e843b92c5707ab1b6e48d9214043fd655f9a Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 12:53:02 -0400 Subject: [PATCH 326/410] some tweaks to the grasping code --- src/stretch/agent/operations/grasp_object.py | 7 +++---- src/stretch/agent/robot_agent.py | 7 +++++-- src/stretch/agent/zmq_client.py | 12 ++++++++++-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git 
a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 885805428..3111f7f99 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -62,7 +62,8 @@ class GraspObjectOperation(ManagedOperation): # align_x_threshold: int = 10 # align_y_threshold: int = 7 # These are the values used to decide when it's aligned enough to grasp - align_x_threshold: int = 15 + # align_x_threshold: int = 15 + align_x_threshold: int = 20 align_y_threshold: int = 15 # pregrasp_distance_from_object: float = 0.075 @@ -681,9 +682,7 @@ def run(self) -> None: ) # Delete the object - self.agent.voxel_map.delete_instance( - self.agent.current_object.global_id, assume_explored=False - ) + self.agent.voxel_map.delete_instance(self.agent.current_object, assume_explored=False) if self.talk: self.agent.robot_say("I think I grasped the object.") diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 114c91343..a2f23737e 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -47,6 +47,9 @@ class RobotAgent: update_rerun_every_time: bool = True normalize_embeddings: bool = False + _before_head_motion_sleep_t = 0.25 + _after_head_motion_sleep_t = 0.1 + def __init__( self, robot: AbstractRobotClient, @@ -440,7 +443,7 @@ def update( if move_head: self.robot.move_to_nav_posture() # Pause a bit first to make sure the robot is in the right posture - time.sleep(0.25) + time.sleep(self._before_head_motion_sleep_t) num_steps = 5 else: num_steps = 1 @@ -458,7 +461,7 @@ def update( pan = -1 * i * np.pi / 4 print(f"[UPDATE] Head sweep {i} at {pan}, {tilt}") self.robot.head_to(pan, tilt, blocking=True) - time.sleep(0.1) + time.sleep(self._after_head_motion_sleep_t) obs = self.robot.get_observation() t1 = timeit.default_timer() diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 3ccd80e1d..da6570f97 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -681,7 +681,7 @@ def move_to_nav_posture(self): next_action = {"posture": "navigation", "step": self._iter} self.send_action(next_action) self._wait_for_head(constants.STRETCH_NAVIGATION_Q, resend_action={"posture": "navigation"}) - self._wait_for_mode("navigation") + self._wait_for_mode("navigation", resend_action=next_action) assert self.in_navigation_mode() def move_to_manip_posture(self): @@ -753,7 +753,13 @@ def _wait_for_head( break time.sleep(0.01) - def _wait_for_mode(self, mode, verbose: bool = False, timeout: float = 20.0): + def _wait_for_mode( + self, + mode, + resend_action: Optional[Dict[str, Any]] = None, + verbose: bool = False, + timeout: float = 20.0, + ): t0 = timeit.default_timer() while True: with self._state_lock: @@ -761,6 +767,8 @@ def _wait_for_mode(self, mode, verbose: bool = False, timeout: float = 20.0): print(f"Waiting for mode {mode} current mode {self._control_mode}") if self._control_mode == mode: break + if resend_action is not None: + self.send_socket.send_pyobj(resend_action) time.sleep(0.1) t1 = timeit.default_timer() if t1 - t0 > timeout: From 0e0313b06beb0322162b122843e3baa3d5d545e5 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 13:43:34 -0400 Subject: [PATCH 327/410] waiting for arm --- src/stretch/agent/zmq_client.py | 61 +++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index da6570f97..a23d7bb83 
100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -682,6 +682,7 @@ def move_to_nav_posture(self): self.send_action(next_action) self._wait_for_head(constants.STRETCH_NAVIGATION_Q, resend_action={"posture": "navigation"}) self._wait_for_mode("navigation", resend_action=next_action) + self._wait_for_arm(constants.STRETCH_NAVIGATION_Q) assert self.in_navigation_mode() def move_to_manip_posture(self): @@ -690,6 +691,7 @@ def move_to_manip_posture(self): time.sleep(0.1) self._wait_for_head(constants.STRETCH_PREGRASP_Q, resend_action={"posture": "manipulation"}) self._wait_for_mode("manipulation") + self._wait_for_arm(constants.STRETCH_PREGRASP_Q) assert self.in_manipulation_mode() def _wait_for_head( @@ -753,13 +755,63 @@ def _wait_for_head( break time.sleep(0.01) + def _wait_for_arm( + self, q: np.ndarray, timeout: float = 10.0, resend_action: Optional[dict] = None + ) -> bool: + """Wait for the arm to move to a particular configuration. + + Args: + q(np.ndarray): The target joint angles + timeout(float): How long to wait for the arm to move + resend_action(dict): The action to resend if the arm is not moving. If none, do not resend. + + Returns: + bool: Whether the arm successfully moved to the target configuration + """ + t0 = timeit.default_timer() + while not self._finish: + joint_positions, joint_velocities, _ = self.get_joint_state() + if joint_positions is None: + continue + + arm_diff = np.abs(joint_positions[HelloStretchIdx.ARM] - q[HelloStretchIdx.ARM]) + lift_diff = np.abs(joint_positions[HelloStretchIdx.LIFT] - q[HelloStretchIdx.LIFT]) + + if arm_diff < self._arm_joint_tolerance and lift_diff < self._lift_joint_tolerance: + return True + + if resend_action is not None: + self.send_socket.send_pyobj(resend_action) + + t1 = timeit.default_timer() + if t1 - t0 > timeout: + logger.error( + f"Timeout waiting for arm to move to arm={q[HelloStretchIdx.ARM]}, lift={q[HelloStretchIdx.LIFT]}" + ) + return False + + # This should never happen + return False + def _wait_for_mode( self, mode, resend_action: Optional[Dict[str, Any]] = None, verbose: bool = False, timeout: float = 20.0, - ): + ) -> bool: + """ + Wait for the robot to switch to a particular control mode. Will throw an exception if mode switch fails; probably means a packet was dropped. + + Args: + mode(str): The mode to wait for + resend_action(dict): The action to resend if the robot is not moving. If none, do not resend. + verbose(bool): Whether to print out debug information + timeout(float): How long to wait for the robot to switch modes + + Returns: + bool: Whether the robot successfully switched to the target mode + """ t0 = timeit.default_timer() while True: with self._state_lock: @@ -773,7 +825,9 @@ def _wait_for_mode( t1 = timeit.default_timer() if t1 - t0 > timeout: raise RuntimeError(f"Timeout waiting for mode {mode}: {t1 - t0} seconds") + assert self._control_mode == mode + return True def _wait_for_action( self, @@ -1057,7 +1111,10 @@ def wait_for_waypoint( dt = t2 - t1 if t2 - t0 > timeout: logger.warning( - "[WAIT FOR WAYPOINT] WARNING! Could not reach goal in time: " + str(xyt) + "[WAIT FOR WAYPOINT] WARNING! 
Could not reach goal in time: " + + str(xyt) + + " " + + str(curr) ) return False time.sleep(max(0, _delay - (dt))) From 79f751ebd7f56b4bcf0474d405aed329a5696330 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 13:51:35 -0400 Subject: [PATCH 328/410] increase acceptable place distance --- src/stretch/agent/operations/place_object.py | 2 +- src/stretch/agent/zmq_client.py | 17 ++++++++++++++--- src/stretch/config/default_planner.yaml | 2 +- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/stretch/agent/operations/place_object.py b/src/stretch/agent/operations/place_object.py index 5bd16d4f0..26cf9feb2 100644 --- a/src/stretch/agent/operations/place_object.py +++ b/src/stretch/agent/operations/place_object.py @@ -24,7 +24,7 @@ class PlaceObjectOperation(ManagedOperation): lift_distance: float = 0.2 place_height_margin: float = 0.1 show_place_in_voxel_grid: bool = False - place_step_size: float = 0.25 + place_step_size: float = 0.35 use_pitch_from_vertical: bool = True verbose: bool = True diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index a23d7bb83..c74a26736 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -1135,10 +1135,21 @@ def send_action( next_action: Dict[str, Any], timeout: float = 5.0, verbose: bool = False, + force_resend: bool = False, ) -> Dict[str, Any]: - """Send the next action to the robot""" + """Send the next action to the robot. Increment the step counter and wait for the action to finish if it is blocking. + + Args: + next_action (dict): the action to send + timeout (float): how long to wait for the action to finish + verbose (bool): whether to print out debug information + force_resend (bool): whether to resend the action + + Returns: + dict: copy of the action that was sent to the robot. 
+ """ if verbose: - print("-> sending", next_action) + logger.info("-> sending", next_action) blocking = False block_id = None with self._act_lock: @@ -1154,7 +1165,7 @@ def send_action( # print("SENDING THIS ACTION:", next_action) self.send_socket.send_pyobj(next_action) - if self._resend_all_actions: + if self._resend_all_actions or force_resend: time.sleep(0.01) print("SENDING THIS ACTION:", next_action) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 740e6a9f3..c8e27ffe8 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -125,7 +125,7 @@ motion_planner: # Subsampling frontier space at this discretization step_dist: 0.2 goals: - manipulation_radius: 0.55 + manipulation_radius: 0.45 # Trajectory following - how closely we follow intermediate waypoints # These should be less strict than whatever parameters the low-level controller is using; this will From 1f8552636dc31d0ddd727c1ee911167cbb7cedab Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 13:52:43 -0400 Subject: [PATCH 329/410] update things --- src/stretch/agent/operations/place_object.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/place_object.py b/src/stretch/agent/operations/place_object.py index 26cf9feb2..92e3349ee 100644 --- a/src/stretch/agent/operations/place_object.py +++ b/src/stretch/agent/operations/place_object.py @@ -14,6 +14,7 @@ import numpy as np from stretch.agent.base import ManagedOperation +from stretch.mapping.instance import Instance from stretch.motion import HelloStretchIdx from stretch.utils.geometry import point_global_to_base @@ -51,7 +52,8 @@ def configure( self.place_step_size = place_step_size self.use_pitch_from_vertical = use_pitch_from_vertical - def get_target(self): + def get_target(self) -> Instance: + """Get the target object to place.""" return self.agent.current_receptacle def get_target_center(self): From e3930cbda381d82416b8dabb15efafea01aa77da Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 13:55:01 -0400 Subject: [PATCH 330/410] force resend on navigation actions --- src/stretch/agent/zmq_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index c74a26736..20ac8f8eb 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -578,7 +578,7 @@ def navigate_to( next_action = {"xyt": _xyt, "nav_relative": relative, "nav_blocking": blocking} if self._rerun: self._rerun.update_nav_goal(_xyt) - self.send_action(next_action, timeout=timeout, verbose=verbose) + self.send_action(next_action, timeout=timeout, verbose=verbose, force_resend=True) def set_velocity(self, v: float, w: float): """Move to xyt in global coordinates or relative coordinates.""" From 65674cec4e1adceb57b6df1947a9a5cc5abf5768 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 13:58:46 -0400 Subject: [PATCH 331/410] update --- src/stretch/agent/operations/place_object.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/place_object.py b/src/stretch/agent/operations/place_object.py index 92e3349ee..beba23694 100644 --- a/src/stretch/agent/operations/place_object.py +++ b/src/stretch/agent/operations/place_object.py @@ -25,9 +25,10 @@ class PlaceObjectOperation(ManagedOperation): lift_distance: float = 0.2 place_height_margin: float = 0.1 show_place_in_voxel_grid: 
bool = False - place_step_size: float = 0.35 + place_step_size: float = 0.4 use_pitch_from_vertical: bool = True verbose: bool = True + talk: bool = True def configure( self, @@ -192,6 +193,8 @@ def run(self) -> None: pos=place_xyz, quat=ee_rot, joint_state=joint_state ) self.attempt(f"Trying to place the object on the receptacle at {place_xyz}.") + if self.talk: + self.agent.robot_say("Trying to place the object on the receptacle.") if not success: self.error("Could not place object!") return @@ -214,6 +217,7 @@ def run(self) -> None: self.robot.move_to_nav_posture() self._successful = True + self.agent.robot_say("I am done placing the object.") self.cheer("We believe we successfully placed the object.") def was_successful(self): From 6728afc186d4caaface10e71d65da75124e13950 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 14:00:37 -0400 Subject: [PATCH 332/410] update --- src/stretch/agent/operations/place_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/place_object.py b/src/stretch/agent/operations/place_object.py index beba23694..53ef26915 100644 --- a/src/stretch/agent/operations/place_object.py +++ b/src/stretch/agent/operations/place_object.py @@ -25,7 +25,7 @@ class PlaceObjectOperation(ManagedOperation): lift_distance: float = 0.2 place_height_margin: float = 0.1 show_place_in_voxel_grid: bool = False - place_step_size: float = 0.4 + place_step_size: float = 0.35 use_pitch_from_vertical: bool = True verbose: bool = True talk: bool = True From 395a349cedee439bb8eb477222914fda76c66c74 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 14:04:49 -0400 Subject: [PATCH 333/410] add dynav readme --- src/stretch/dynav/README.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 src/stretch/dynav/README.md diff --git a/src/stretch/dynav/README.md b/src/stretch/dynav/README.md new file mode 100644 index 000000000..539a2837c --- /dev/null +++ b/src/stretch/dynav/README.md @@ -0,0 +1,7 @@ +# Running DynaMem + +This is a placeholder, until DynaMem is fully integrated into the Stretch AI stack. 
+ +``` +python -m stretch.dynav.run_ok_nav +``` From e3b76668192b4fd8f3852d4d0f6d933c74dc6b06 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 14:15:52 -0400 Subject: [PATCH 334/410] fixes and improvements for pythonic commands --- src/stretch/agent/zmq_client.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 20ac8f8eb..bc7d05016 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -578,7 +578,11 @@ def navigate_to( next_action = {"xyt": _xyt, "nav_relative": relative, "nav_blocking": blocking} if self._rerun: self._rerun.update_nav_goal(_xyt) - self.send_action(next_action, timeout=timeout, verbose=verbose, force_resend=True) + # If we are not in navigation mode, switch to it + # Send an action to the robot + # Resend it to make sure it arrives, if we are not making a relative motion + # If we are blocking, wait for the action to complete with a timeout + self.send_action(next_action, timeout=timeout, verbose=verbose, force_resend=(not relative)) def set_velocity(self, v: float, w: float): """Move to xyt in global coordinates or relative coordinates.""" @@ -1168,7 +1172,7 @@ def send_action( if self._resend_all_actions or force_resend: time.sleep(0.01) - print("SENDING THIS ACTION:", next_action) + logger.info("RESENDING THIS ACTION:", next_action) self.send_socket.send_pyobj(next_action) # For tracking goal From fa13d064148e483931a19d9f464fb3011ebd5379 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 14:49:57 -0400 Subject: [PATCH 335/410] Updates to this --- src/stretch/agent/operations/update.py | 14 ++++++++++++-- src/stretch/agent/robot_agent.py | 10 ++++++++++ src/stretch/mapping/voxel/voxel.py | 4 ++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/operations/update.py b/src/stretch/agent/operations/update.py index 28ca681a8..220dae154 100644 --- a/src/stretch/agent/operations/update.py +++ b/src/stretch/agent/operations/update.py @@ -24,7 +24,7 @@ class UpdateOperation(ManagedOperation): clear_voxel_map: bool = False move_head: Optional[bool] = None target_object: str = "cup" - match_method: str = "name" + match_method: str = "feature" def set_target_object_class(self, object_class: str): """Set the target object class for the operation. @@ -70,11 +70,21 @@ def run(self): time.sleep(2.0) self.robot.arm_to([0.0, 0.4, 0.05, 0, -np.pi / 4, 0], blocking=True) xyt = self.robot.get_base_pose() + # Now update the world self.update(move_head=self.move_head) + # Delete observations near us, since they contain the arm!! self.agent.voxel_map.delete_obstacles(point=xyt[:2], radius=0.7) + # Show the map so far + self.agent.voxel_map.show( + orig=np.zeros(3), + xyt=xyt, + footprint=self.robot_model.get_footprint(), + planner_visuals=True, + ) + # Notify and move the arm back to normal. Showing the map is optional. 
print(f"So far we have found: {len(self.agent.voxel_map.instances)} objects.") self.robot.arm_to([0.0, 0.4, 0.05, 0, -np.pi / 4, 0], blocking=True) @@ -114,7 +124,7 @@ def run(self): if self.match_method == "class": instances = self.agent.voxel_map.instances.get_instances_by_class(self.target_object) scores = np.ones(len(instances)) - elif self.match_method == "name": + elif self.match_method == "feature": scores, instances = self.agent.get_instances_from_text(self.target_object) # self.agent.voxel_map.show(orig=np.zeros(3), xyt=start, footprint=self.robot_model.get_footprint(), planner_visuals=True) else: diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index a2f23737e..96cbc9da4 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1346,6 +1346,16 @@ def run_exploration( return matches + def show_voxel_map(self): + """Show the voxel map in Open3D for debugging.""" + self.voxel_map.show( + orig=np.zeros(3), + xyt=self.robot.get_base_pose(), + footprint=self.robot.get_robot_model().get_footprint(), + instances=self.semantic_sensor is not None, + planner_visuals=True, + ) + def move_closed_loop(self, goal: np.ndarray, max_time: float = 10.0) -> bool: """Helper function which will move while also checking which position the robot reached. diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 016b061a8..cf0acbd67 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -1076,9 +1076,13 @@ def _get_open3d_geometries( # Create a combined point cloud # Do the other stuff we need to show instances points, _, _, rgb = self.voxel_pcd.get_pointcloud() + if points is None: + return [] + pcd = numpy_to_pcd(points.detach().cpu().numpy(), (rgb / norm).detach().cpu().numpy()) if orig is None: orig = np.zeros(3) + geoms = create_visualization_geometries(pcd=pcd, orig=orig) # Get the explored/traversible area From fad90e90a235b3f1a88bb07162c2deba0e150b10 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 14:52:26 -0400 Subject: [PATCH 336/410] reduce logger outputs from instance memory --- src/stretch/mapping/instance/instance_map.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/stretch/mapping/instance/instance_map.py b/src/stretch/mapping/instance/instance_map.py index 2b30b60ca..023a1ab6c 100644 --- a/src/stretch/mapping/instance/instance_map.py +++ b/src/stretch/mapping/instance/instance_map.py @@ -12,7 +12,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import logging import os import shutil import timeit @@ -34,10 +33,12 @@ get_box_verts_from_bounds, ) from stretch.utils.image import dilate_or_erode_mask, interpolate_image +from stretch.utils.logger import Logger from stretch.utils.point_cloud_torch import get_bounds from stretch.utils.voxel import drop_smallest_weight_points -logger = logging.getLogger(__name__) +logger = Logger(__name__) +logger.hide_info() class InstanceMemory: From 5449a264d8bfb17c4168eaa626248f0997fafb13 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 16:41:48 -0400 Subject: [PATCH 337/410] Updates to the grasping code to make sure it works properly again --- src/stretch/agent/operations/update.py | 10 ++++++++-- src/stretch/app/grasp_object.py | 2 ++ src/stretch/mapping/voxel/voxel.py | 15 +++++++++++++-- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/operations/update.py b/src/stretch/agent/operations/update.py index 28ca681a8..867a7d2aa 100644 --- a/src/stretch/agent/operations/update.py +++ b/src/stretch/agent/operations/update.py @@ -24,7 +24,7 @@ class UpdateOperation(ManagedOperation): clear_voxel_map: bool = False move_head: Optional[bool] = None target_object: str = "cup" - match_method: str = "name" + match_method: str = "feature" def set_target_object_class(self, object_class: str): """Set the target object class for the operation. @@ -45,6 +45,7 @@ def configure( show_map_so_far=False, clear_voxel_map=False, target_object: str = "cup", + match_method: str = "feature", ): """Configure the operation with the given parameters.""" self.move_head = move_head @@ -52,15 +53,20 @@ def configure( self.show_map_so_far = show_map_so_far self.clear_voxel_map = clear_voxel_map self.target_object = target_object + self.match_method = match_method + if self.match_method not in ["class", "feature"]: + raise ValueError(f"Unknown match method {self.match_method}.") print("---- CONFIGURING UPDATE OPERATION ----") print("Move head is set to", self.move_head) print("Show instances detected is set to", self.show_instances_detected) print("Show map so far is set to", self.show_map_so_far) print("Clear voxel map is set to", self.clear_voxel_map) print("Target object is set to", self.target_object) + print("Match method is set to", self.match_method) print("--------------------------------------") def run(self): + """Run the operation.""" self.intro("Updating the world model.") if self.clear_voxel_map: self.agent.reset() @@ -114,7 +120,7 @@ def run(self): if self.match_method == "class": instances = self.agent.voxel_map.instances.get_instances_by_class(self.target_object) scores = np.ones(len(instances)) - elif self.match_method == "name": + elif self.match_method == "feature": scores, instances = self.agent.get_instances_from_text(self.target_object) # self.agent.voxel_map.show(orig=np.zeros(3), xyt=start, footprint=self.robot_model.get_footprint(), planner_visuals=True) else: diff --git a/src/stretch/app/grasp_object.py b/src/stretch/app/grasp_object.py index 3ade650c9..d3cd6a4d6 100644 --- a/src/stretch/app/grasp_object.py +++ b/src/stretch/app/grasp_object.py @@ -31,6 +31,7 @@ def get_task(robot, demo, target_object): show_map_so_far=False, clear_voxel_map=True, show_instances_detected=False, + match_method="feature", ) grasp_object = GraspObjectOperation( "grasp_the_object", @@ -43,6 +44,7 @@ def get_task(robot, demo, target_object): show_servo_gui=True, reset_observation=False, grasp_loose=(target_object == "cup"), + match_method="feature", ) 
grasp_object.set_target_object_class(target_object) task.add_operation(update) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 016b061a8..4b73214f5 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -1146,9 +1146,20 @@ def _get_instances_open3d(self, geoms: List[open3d.geometry.Geometry]) -> None: geoms.append(wireframe) def delete_instance( - self, instance: Instance, force_update=False, min_bound_z=0, assume_explored: bool = False + self, + instance: Instance, + force_update: bool = True, + min_bound_z: float = 0, + assume_explored: bool = False, ) -> None: - """Remove an instance from the map""" + """Remove an instance from the map. + + Args: + instance: instance to remove + force_update: force update of cached 2d map. Can be disabled if you're planning to do multiple deletions in order to be slightly more efficient. + min_bound_z: minimum z bound to delete. Usually 0, for the floor plane. + assume_explored: assume deleted area is explored or not. If True, will mark the area as explored, so that the robot can plan past this position. If False, the area will be marked as unexplored, and the robot will have to re-explore it. + """ print("Deleting instance", instance.global_id) print("Bounds: ", instance.bounds) self.delete_obstacles(instance.bounds, force_update=force_update, min_bound_z=min_bound_z) From d4dc67ec2dbe437ccfbf3ec352bcbfba99ad7dd0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 16:43:24 -0400 Subject: [PATCH 338/410] Try grasping something --- src/stretch/agent/operations/grasp_object.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 3111f7f99..794c9a13a 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -122,6 +122,7 @@ def configure( reset_observation: bool = False, grasp_loose: bool = False, talk: bool = True, + match_method: str = "class", ): """Configure the operation with the given keyword arguments. @@ -133,6 +134,7 @@ def configure( reset_observation (bool, optional): Reset the observation. Defaults to False. grasp_loose (bool, optional): Grasp loosely. Useful for grasping some objects like cups. Defaults to False. talk (bool, optional): Talk as the robot tries to grab stuff. Defaults to True. + match_method (str, optional): Matching method. Defaults to "class". This is how the policy determines which object mask it should try to grasp. """ self.target_object = target_object self.show_object_to_grasp = show_object_to_grasp @@ -142,6 +144,11 @@ def configure( self.reset_observation = reset_observation self.grasp_loose = grasp_loose self.talk = talk + self.match_method = match_method + if self.match_method not in ["class", "feature"]: + raise ValueError( + f"Unknown match method {self.match_method}. Should be 'class' or 'feature'." + ) def _debug_show_point_cloud(self, servo: Observations, current_xyz: np.ndarray) -> None: """Show the point cloud for debugging purposes. 
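For context on the `match_method` option added above: "class" matches on the detector's class label, while "feature" compares encoded image features against encoded text features. A rough sketch assembled from calls that appear elsewhere in this series (`get_instances_by_class`, `get_instances_from_text`); the helper name and the threshold filtering are illustrative assumptions:

```python
# Hedged sketch of the two instance-matching strategies ("class" vs. "feature").
def select_instances(agent, target_object: str, match_method: str = "feature"):
    if match_method == "class":
        # Exact class-name lookup against detected instances in the voxel map.
        return agent.voxel_map.instances.get_instances_by_class(target_object)
    elif match_method == "feature":
        # Text-to-image feature similarity (e.g. CLIP); keep sufficiently close matches.
        scores, instances = agent.get_instances_from_text(target_object)
        return [
            instance
            for score, instance in zip(scores, instances)
            if score > agent.feature_match_threshold  # attribute name assumed from a later patch
        ]
    raise ValueError(f"Unknown match method {match_method}.")
```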
From 22d938165afb740df7afcb18036f02bfb29e3479 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 16:45:34 -0400 Subject: [PATCH 339/410] update --- src/stretch/agent/operations/update.py | 2 +- src/stretch/mapping/voxel/voxel.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/stretch/agent/operations/update.py b/src/stretch/agent/operations/update.py index 8cc743ef5..600735286 100644 --- a/src/stretch/agent/operations/update.py +++ b/src/stretch/agent/operations/update.py @@ -81,7 +81,7 @@ def run(self): self.update(move_head=self.move_head) # Delete observations near us, since they contain the arm!! - self.agent.voxel_map.delete_obstacles(point=xyt[:2], radius=0.7) + self.agent.voxel_map.delete_obstacles(point=xyt[:2], radius=0.7, force_update=True) # Show the map so far self.agent.voxel_map.show( diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index d5bb52db3..50e53b77f 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -1180,7 +1180,7 @@ def delete_obstacles( bounds: Optional[np.ndarray] = None, point: Optional[np.ndarray] = None, radius: Optional[float] = None, - force_update: Optional[bool] = False, + force_update: Optional[bool] = True, min_height: Optional[float] = None, min_bound_z: Optional[float] = 0.0, assume_explored: bool = False, @@ -1191,9 +1191,9 @@ def delete_obstacles( bounds: 3x2 array of min and max bounds in xyz point: 3x1 array of point to delete radius: radius around point to delete - force_update: force update of 2d map - min_height: minimum height to delete - min_bound_z: minimum z bound to delete + force_update: force update of 2d map. Can be disabled if you're planning to do multiple deletions in order to be slightly more efficient. + min_height: minimum height to delete. Usually 0, for the floor plane. + min_bound_z: minimum z bound to delete. Usually 0, for the floor plane. assume_explored: assume deleted area is explored """ self.voxel_pcd.remove(bounds, point, radius, min_height=min_height, min_bound_z=min_bound_z) From 971cfb692ff9ecdaae4fe0eba0b17fb1a3bfb8bc Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 16:46:13 -0400 Subject: [PATCH 340/410] updates --- src/stretch/mapping/voxel/voxel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/mapping/voxel/voxel.py b/src/stretch/mapping/voxel/voxel.py index 50e53b77f..77e1ce831 100644 --- a/src/stretch/mapping/voxel/voxel.py +++ b/src/stretch/mapping/voxel/voxel.py @@ -1160,7 +1160,7 @@ def delete_instance( Args: instance: instance to remove - force_update: force update of cached 2d map. Can be disabled if you're planning to do multiple deletions in order to be slightly more efficient. + force_update: force update of cached 2d map. Can be disabled if you're planning to do multiple deletions in order to be slightly more efficient. Defaults to True. min_bound_z: minimum z bound to delete. Usually 0, for the floor plane. assume_explored: assume deleted area is explored or not. If True, will mark the area as explored, so that the robot can plan past this position. If False, the area will be marked as unexplored, and the robot will have to re-explore it. """ @@ -1191,7 +1191,7 @@ def delete_obstacles( bounds: 3x2 array of min and max bounds in xyz point: 3x1 array of point to delete radius: radius around point to delete - force_update: force update of 2d map. 
Can be disabled if you're planning to do multiple deletions in order to be slightly more efficient. + force_update: force update of 2d map. Can be disabled if you're planning to do multiple deletions in order to be slightly more efficient. Defaults to True. min_height: minimum height to delete. Usually 0, for the floor plane. min_bound_z: minimum z bound to delete. Usually 0, for the floor plane. assume_explored: assume deleted area is explored From 0cd8f047da556e04c9c2485f369baa5f8c065c36 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 16:56:44 -0400 Subject: [PATCH 341/410] Some code cleanup --- src/stretch/agent/operations/grasp_object.py | 2 +- src/stretch/agent/operations/update.py | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 794c9a13a..2299df1ac 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -426,7 +426,6 @@ def visual_servo_to_object( latest_mask = self.get_target_mask( servo, instance, prev_mask=prev_target_mask, center=(center_x, center_y) ) - print("latest mask size:", np.sum(latest_mask.flatten())) # dilate mask kernel = np.ones((3, 3), np.uint8) @@ -505,6 +504,7 @@ def visual_servo_to_object( self.info("Not moving; try to grasp.") success = self._grasp() break + # If we have a target mask, compute the median depth of the object # Otherwise we will just try to grasp if we are close enough - assume we lost track! if target_mask is not None: diff --git a/src/stretch/agent/operations/update.py b/src/stretch/agent/operations/update.py index 600735286..46d7a7cbf 100644 --- a/src/stretch/agent/operations/update.py +++ b/src/stretch/agent/operations/update.py @@ -83,14 +83,6 @@ def run(self): # Delete observations near us, since they contain the arm!! self.agent.voxel_map.delete_obstacles(point=xyt[:2], radius=0.7, force_update=True) - # Show the map so far - self.agent.voxel_map.show( - orig=np.zeros(3), - xyt=xyt, - footprint=self.robot_model.get_footprint(), - planner_visuals=True, - ) - # Notify and move the arm back to normal. Showing the map is optional. print(f"So far we have found: {len(self.agent.voxel_map.instances)} objects.") self.robot.arm_to([0.0, 0.4, 0.05, 0, -np.pi / 4, 0], blocking=True) From f1d9f3bda52a700da9c6711a9d261b17fe0a4681 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 17:03:09 -0400 Subject: [PATCH 342/410] cleanup --- src/stretch/agent/operations/grasp_object.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 2299df1ac..77766cae7 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -451,13 +451,6 @@ def visual_servo_to_object( # Compute the center of the mask in image coords mask_center = self.observations.get_latest_centroid() if mask_center is None: - # if not aligned_once: - # self.error( - # "Lost track before even seeing object with EE camera. Just try open loop." 
- # ) - # if self.show_servo_gui: - # cv2.destroyAllWindows() - # return False if failed_counter < self.max_failed_attempts: mask_center = np.array([center_y, center_x]) else: From 3d57413a4d3a0ffef794b133363510c9727c1d7e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Fri, 4 Oct 2024 17:14:27 -0400 Subject: [PATCH 343/410] adding setup for handling oscillations --- src/stretch/agent/operations/grasp_object.py | 21 +++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 77766cae7..70706efeb 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -21,6 +21,7 @@ import cv2 import numpy as np +import torch import stretch.motion.constants as constants from stretch.agent.base import ManagedOperation @@ -84,6 +85,9 @@ class GraspObjectOperation(ManagedOperation): # wrist_pitch_step: float = 0.05 # Too slow # ------------------------ + # Tracked object features for making sure we are grabbing the right thing + tracked_object_features: Optional[torch.Tensor] = None + # Parameters about how to grasp - less important grasp_loose: bool = False reset_observation: bool = False @@ -213,6 +217,7 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: text_features = self.agent.encode_text(self.target_object) best_score = float("-inf") best_iid = None + all_matches = [] for iid in np.unique(servo.instance): # Ignore the background @@ -232,7 +237,21 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: if score > best_score: best_score = score best_iid = iid - mask = servo.instance == iid + if score > self.agent.is_match_threshold: + all_matches.append((score, iid, features)) + if len(all_matches) > 0: + print("All matches:") + for score, iid, features in all_matches: + print(f" - Matched {iid} with score {score}.") + if len(all_matches) == 0: + print("No matches found.") + elif len(all_matches) == 1: + print("One match found. 
We are done.") + mask = servo.instance == best_iid + else: + # Check to see if we have + breakpoint() + else: raise ValueError(f"Invalid matching method {self.match_method}.") From 1732f84adcfff3a487d49d586fc63bafcd8085fb Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 7 Oct 2024 11:05:24 -0400 Subject: [PATCH 344/410] remove pytorch3d and pytorch geometric - bad dependencies --- install.sh | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/install.sh b/install.sh index 029ea7d97..66339fc6e 100755 --- a/install.sh +++ b/install.sh @@ -11,8 +11,6 @@ VERSION=`python $script_dir/src/stretch/version.py` CPU_ONLY="false" NO_REMOVE="false" NO_SUBMODULES="false" -INSTALL_PYTORCH3D="false" -INSTALL_TORCH_GEOMETRIC="false" MAMBA=mamba NO_VERSION="false" # Two cases: -y for yes, --cpu for cpu only @@ -41,10 +39,6 @@ do NO_SUBMODULES="true" shift ;; - --torch-geometric) - INSTALL_TORCH_GEOMETRIC="true" - shift - ;; --no-version) NO_VERSION="true" shift @@ -157,19 +151,6 @@ echo "---------------------------------------------" echo "---- INSTALLING STRETCH AI DEPENDENCIES ----" echo "Will be installed via pip into env: $ENV_NAME" -# This is no longer necessary but might be useful for some checks -if [ "$INSTALL_PYTORCH3D" == "true" ]; then - echo "Installing pytorch3d from source" - pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" -fi - -if [ "$INSTALL_TORCH_GEOMETRIC" == "true" ]; then - echo "Installing torch-geometric" - # If not using cpu only, install the following - # It is important to use --no-cache-dir to avoid issues different versions of pytorch and cuda - pip install torch_cluster torch_scatter torch_geometric -f https://pytorch-geometric.com/whl/torch-${PYTORCH_VERSION}+${CUDA_VERSION_NODOT}.html --no-cache-dir -fi - pip install -e ./src[dev] echo "" From 2b17fd18e84d9a967b0c21d4bc9a29c0ca378890 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 7 Oct 2024 11:38:53 -0400 Subject: [PATCH 345/410] Add a comment --- src/stretch/agent/zmq_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 83fa1262d..3ba57afa8 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -361,6 +361,7 @@ def get_ee_pose(self, matrix=False, link_name=None, q=None): return pos, quat def get_frame_pose(self, q, node_a: str, node_b: str): + """Get the pose of frame b relative to frame a.""" return self._robot_model.manip_ik_solver.get_frame_pose(q, node_a, node_b) def solve_ik( From e781f0575a879a58a883c5c6308fa82abc047048 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 7 Oct 2024 11:39:39 -0400 Subject: [PATCH 346/410] updates --- src/stretch/agent/zmq_client.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 3ba57afa8..47bcab429 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -361,7 +361,14 @@ def get_ee_pose(self, matrix=False, link_name=None, q=None): return pos, quat def get_frame_pose(self, q, node_a: str, node_b: str): - """Get the pose of frame b relative to frame a.""" + """Get the pose of frame b relative to frame a. 
+ + Args: + q: The joint positions + node_a: The name of the first frame + node_b: The name of the second frame + """ + # TODO: get this working properly and update the documentation return self._robot_model.manip_ik_solver.get_frame_pose(q, node_a, node_b) def solve_ik( From d439b02ea38f542f0f3912708b51d2a1d8b1af8f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 7 Oct 2024 11:40:56 -0400 Subject: [PATCH 347/410] updating comments --- src/stretch/agent/zmq_client.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 47bcab429..acb2e80af 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -286,7 +286,15 @@ def get_joint_velocities(self, timeout: float = 5.0) -> np.ndarray: return joint_velocities def get_joint_efforts(self, timeout: float = 5.0) -> np.ndarray: - """Get the current joint efforts""" + """Get the current joint efforts from the robot. + + Args: + timeout: How long to wait for the observation + + Returns: + np.ndarray: The joint efforts as an array of floats + """ + t0 = timeit.default_timer() with self._state_lock: while self._state is None: @@ -298,7 +306,14 @@ def get_joint_efforts(self, timeout: float = 5.0) -> np.ndarray: return joint_efforts def get_base_pose(self, timeout: float = 5.0) -> np.ndarray: - """Get the current pose of the base""" + """Get the current pose of the base. + + Args: + timeout: How long to wait for the observation + + Returns: + np.ndarray: The base pose as [x, y, theta] + """ t0 = timeit.default_timer() if self.update_base_pose_from_full_obs: with self._obs_lock: From c53d235e730d5f52678d6b8bad7add551e9ac8fa Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 7 Oct 2024 12:29:18 -0400 Subject: [PATCH 348/410] Fix type issues --- src/stretch/dynav/mapping_utils/a_star.py | 29 ++++++++++++++++++++-- src/stretch/dynav/robot_agent_manip_mdp.py | 11 +++++++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/stretch/dynav/mapping_utils/a_star.py b/src/stretch/dynav/mapping_utils/a_star.py index 95e4b1b44..048fdfb87 100644 --- a/src/stretch/dynav/mapping_utils/a_star.py +++ b/src/stretch/dynav/mapping_utils/a_star.py @@ -75,19 +75,44 @@ def reset(self): self.start_time = time.time() def point_is_occupied(self, x: int, y: int) -> bool: + """Checks if a point is occupied. + + Args: + x: The x coordinate. + y: The y coordinate. + + Returns: + Whether the point is occupied. + """ return not bool(self._navigable[x][y]) def to_pt(self, xy: Tuple[float, float]) -> Tuple[int, int]: + """Converts a point from continuous, world xy coordinates to grid coordinates. + + Args: + xy: The point in continuous xy coordinates. + + Returns: + The point in discrete grid coordinates. + """ # # type: ignore to bypass mypy checking xy = np.array([xy[0], xy[1]]) # type: ignore pt = self.space.voxel_map.xy_to_grid_coords(xy) # type: ignore - return tuple(pt.tolist()) + return int(pt[0]), int(pt[1]) def to_xy(self, pt: Tuple[int, int]) -> Tuple[float, float]: + """Converts a point from grid coordinates to continuous, world xy coordinates. + + Args: + pt: The point in grid coordinates. + + Returns: + The point in continuous xy coordinates. 
+ """ # # type: ignore to bypass mypy checking pt = np.array([pt[0], pt[1]]) # type: ignore xy = self.space.voxel_map.grid_coords_to_xy(pt) # type: ignore - return tuple(xy.tolist()) + return float(xy[0]), float(xy[1]) def compute_dis(self, a: Tuple[int, int], b: Tuple[int, int]): return ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5 diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 33caf38ee..13dd039a4 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -18,7 +18,16 @@ from stretch.core.parameters import Parameters from stretch.dynav.communication_util import recv_array, send_array, send_everything from stretch.dynav.ok_robot_hw.camera import RealSenseCamera -from stretch.dynav.ok_robot_hw.global_parameters import * +from stretch.dynav.ok_robot_hw.global_parameters import ( + INIT_ARM_POS, + INIT_HEAD_PAN, + INIT_HEAD_TILT, + INIT_LIFT_POS, + INIT_WRIST_PITCH, + INIT_WRIST_ROLL, + INIT_WRIST_YAW, + TOP_CAMERA_NODE, +) from stretch.dynav.ok_robot_hw.robot import HelloRobot as Manipulation_Wrapper from stretch.dynav.ok_robot_hw.utils.grasper_utils import ( capture_and_process_image, From f81e93668a0b0a3effb8ac680a7b0d3d0c635269 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Mon, 7 Oct 2024 13:35:15 -0400 Subject: [PATCH 349/410] more code cleanup and fix failing thigns --- src/stretch/agent/zmq_client.py | 5 ++++- src/stretch/motion/base/ik_solver_base.py | 17 ++++++++++++++++- src/stretch/motion/pinocchio_ik_solver.py | 11 ++++++++++- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 4f9d63226..e118d2098 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -375,13 +375,16 @@ def get_ee_pose(self, matrix=False, link_name=None, q=None): else: return pos, quat - def get_frame_pose(self, q, node_a: str, node_b: str): + def get_frame_pose(self, q: Union[np.ndarray, dict], node_a: str, node_b: str) -> np.ndarray: """Get the pose of frame b relative to frame a. Args: q: The joint positions node_a: The name of the first frame node_b: The name of the second frame + + Returns: + np.ndarray: The pose of frame b relative to frame a as a 4x4 matrix """ # TODO: get this working properly and update the documentation return self._robot_model.manip_ik_solver.get_frame_pose(q, node_a, node_b) diff --git a/src/stretch/motion/base/ik_solver_base.py b/src/stretch/motion/base/ik_solver_base.py index 40d6ade69..1026ab161 100644 --- a/src/stretch/motion/base/ik_solver_base.py +++ b/src/stretch/motion/base/ik_solver_base.py @@ -13,7 +13,7 @@ # LICENSE file in the root directory of this source tree. -from typing import Optional, Tuple +from typing import List, Optional, Tuple, Union import numpy as np @@ -37,6 +37,21 @@ def compute_fk( """given joint values return end-effector position and quaternion associated with it""" raise NotImplementedError() + def get_frame_pose( + self, q: Union[np.ndarray, List[float], dict], node_a: str, node_b: str + ) -> np.ndarray: + """Given joint values, return the pose of the frame attached to node_a in the frame of node_b. 
+ + Args: + q: joint values + node_a: name of the node where the frame is attached + node_b: name of the node in whose frame the pose is desired + + Returns: + 4x4 np.ndarray: the pose of the frame attached to node_a in the frame of node_b + """ + raise NotImplementedError() + def compute_ik( self, pos_desired: np.ndarray, diff --git a/src/stretch/motion/pinocchio_ik_solver.py b/src/stretch/motion/pinocchio_ik_solver.py index 9b887463a..24bf88673 100644 --- a/src/stretch/motion/pinocchio_ik_solver.py +++ b/src/stretch/motion/pinocchio_ik_solver.py @@ -140,9 +140,18 @@ def get_frame_pose( node_a: str, node_b: str, ignore_missing_joints: bool = False, - ): + ) -> np.ndarray: """ Get a transformation matrix transforming from node_a frame to node_b frame + + Args: + config: joint values + node_a: name of the first node + node_b: name of the second node + ignore_missing_joints: whether to ignore missing joints in the configuration + + Returns: + transformation matrix from node_a to node_b """ q_model = self._qmap_control2model(config, ignore_missing_joints=ignore_missing_joints) # print('q_model', q_model) From 6a41a874b903b9930f0adff6f0f1c2bb1d3bb1f0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 10:43:03 -0400 Subject: [PATCH 350/410] Fix visualizer --- src/stretch/visualization/rerun.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/visualization/rerun.py b/src/stretch/visualization/rerun.py index 81ab163a6..d3df893be 100644 --- a/src/stretch/visualization/rerun.py +++ b/src/stretch/visualization/rerun.py @@ -174,7 +174,7 @@ def log_transforms(self, obs, debug: bool = False): print("Total time to log robot transforms (ms): ", 1000 * (t2 - t0)) -class RerunVsualizer: +class RerunVisualizer: camera_point_radius = 0.01 max_displayed_points_per_camera: int = 10000 From b8ef24e6b09bb55975d90208ddd2469a210117bc Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 10:56:56 -0400 Subject: [PATCH 351/410] updates to grasping code --- src/stretch/agent/operations/grasp_object.py | 42 +++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 70706efeb..adea87e7e 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -237,7 +237,7 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: if score > best_score: best_score = score best_iid = iid - if score > self.agent.is_match_threshold: + if score > self.agent.feature_match_threshold: all_matches.append((score, iid, features)) if len(all_matches) > 0: print("All matches:") @@ -248,10 +248,34 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: elif len(all_matches) == 1: print("One match found. 
We are done.") mask = servo.instance == best_iid + # Set the tracked features + self.tracked_object_features = all_matches[0][2] else: - # Check to see if we have - breakpoint() - + # Check to see if we have tracked features + if self.tracked_object_features is not None: + # Find the closest match + best_score = float("-inf") + best_iid = None + best_features = None + for _, iid, features in all_matches: + score = self.agent.compare_features(self.tracked_object_features, features) + if score > best_score: + best_score = score + best_iid = iid + best_features = features + else: + best_score = float("-inf") + best_iid = None + best_features = None + for score, iid, features in all_matches: + if score > best_score: + best_score = score + best_iid = iid + best_features = features + + # Set the mask + mask = servo.instance == best_iid + self.tracked_object_features = best_features else: raise ValueError(f"Invalid matching method {self.match_method}.") @@ -265,6 +289,12 @@ def set_target_object_class(self, target_object: str): """ self.target_object = target_object + def reset(self): + """Reset the operation. This clears the history and sets the success flag to False. It also clears the tracked object features.""" + self._success = False + self.tracked_object_features = None + self.observations.clear_history() + def get_target_mask( self, servo: Observations, @@ -657,6 +687,9 @@ def run(self) -> None: if self.show_object_to_grasp: self.show_instance(self.agent.current_object) + # Clear the observation history + self.reset() + assert self.target_object is not None, "Target object must be set before running." # Now we should be able to see the object if we orient gripper properly @@ -694,7 +727,6 @@ def run(self) -> None: # clear observations if self.reset_observation: - self.observations.clear_history() self.agent.reset_object_plans() self.agent.voxel_map.instances.pop_global_instance( env_id=0, global_instance_id=self.agent.current_object.global_id From 2ddc9a85ca647f3247706d2f41eaee3d5ff31df9 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 08:21:43 -0700 Subject: [PATCH 352/410] switch modes --- src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py index 1616bcddb..2f048a7eb 100755 --- a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py +++ b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py @@ -160,6 +160,8 @@ def handle_action(self, action: Dict[str, Any]): self.client.in_navigation_mode(), ) print(f"{action['xyt']} {action['nav_relative']} {action['nav_blocking']}") + if not self.client.in_navigation_mode(): + self.client.switch_to_navigation_mode() self.client.navigate_to( action["xyt"], relative=action["nav_relative"], From faa8c70f08e4aa7be7f49ca0354aa51bfc5df704 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 11:28:24 -0400 Subject: [PATCH 353/410] cleaning up and trying to make sure modes are working better --- src/stretch/agent/operations/grasp_object.py | 2 ++ src/stretch/agent/zmq_client.py | 14 +++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index adea87e7e..65716c51d 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -535,6 +535,8 @@ def visual_servo_to_object( 
servo_ee_rgb = cv2.circle( servo_ee_rgb, (int(mask_center[1]), int(mask_center[0])), 5, (0, 255, 0), -1 ) + print("-- show a window") + cv2.namedWindow("servo_ee_rgb", cv2.WINDOW_NORMAL) cv2.imshow("servo_ee_rgb", servo_ee_rgb) cv2.waitKey(1) res = cv2.waitKey(1) & 0xFF # 0xFF is a mask to get the last 8 bits diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index e118d2098..7ff198213 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -871,6 +871,7 @@ def _wait_for_mode( resend_action: Optional[Dict[str, Any]] = None, verbose: bool = False, timeout: float = 20.0, + time_required: float = 0.05, ) -> bool: """ Wait for the robot to switch to a particular control mode. Will throw an exception if mode switch fails; probably means a packet was dropped. @@ -885,12 +886,19 @@ def _wait_for_mode( bool: Whether the robot successfully switched to the target mode """ t0 = timeit.default_timer() + mode_t0 = None while True: with self._state_lock: if verbose: - print(f"Waiting for mode {mode} current mode {self._control_mode}") - if self._control_mode == mode: - break + print(f"Waiting for mode {mode} current mode {self._control_mode} {mode_t0}") + if self._control_mode == mode and mode_t0 is None: + mode_t0 = timeit.default_timer() + elif self._control_mode != mode: + mode_t0 = None + # Make sure we are in the mode for at least time_required seconds + # This is to handle network delays + if mode_t0 is not None and timeit.default_timer() - mode_t0 > time_required: + break if resend_action is not None: self.send_socket.send_pyobj(resend_action) time.sleep(0.1) From 594f6284b387745c218b5c8fbe20e11e484025a4 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 11:33:29 -0400 Subject: [PATCH 354/410] cleaning up some code --- src/stretch/agent/operations/grasp_object.py | 11 ++--------- src/stretch/app/grasp_object.py | 4 ++-- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/stretch/agent/operations/grasp_object.py b/src/stretch/agent/operations/grasp_object.py index 65716c51d..7bdfedc77 100644 --- a/src/stretch/agent/operations/grasp_object.py +++ b/src/stretch/agent/operations/grasp_object.py @@ -226,11 +226,6 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: rgb = servo.ee_rgb * (servo.instance == iid)[:, :, None].repeat(3, axis=-1) - # TODO: remove debug code - # import matplotlib.pyplot as plt - # plt.imshow(rgb) - # plt.show() - features = self.agent.encode_image(rgb) score = self.agent.compare_features(text_features, features) print(f" - Score for {iid} is {score}") @@ -263,19 +258,17 @@ def get_class_mask(self, servo: Observations) -> np.ndarray: best_score = score best_iid = iid best_features = features + self.tracked_object_features = best_features else: best_score = float("-inf") best_iid = None - best_features = None - for score, iid, features in all_matches: + for score, iid, _ in all_matches: if score > best_score: best_score = score best_iid = iid - best_features = features # Set the mask mask = servo.instance == best_iid - self.tracked_object_features = best_features else: raise ValueError(f"Invalid matching method {self.match_method}.") diff --git a/src/stretch/app/grasp_object.py b/src/stretch/app/grasp_object.py index d3cd6a4d6..a3d2254ad 100644 --- a/src/stretch/app/grasp_object.py +++ b/src/stretch/app/grasp_object.py @@ -39,9 +39,9 @@ def get_task(robot, demo, target_object): ) grasp_object.configure( target_object=target_object, - show_object_to_grasp=True, + 
show_object_to_grasp=False, servo_to_grasp=True, - show_servo_gui=True, + show_servo_gui=False, reset_observation=False, grasp_loose=(target_object == "cup"), match_method="feature", From 37f9a632b6412d1d6ca2d96808c4ffeb0a88d44e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 11:37:52 -0400 Subject: [PATCH 355/410] Better thread handling and cleanup at the end --- src/stretch/agent/robot_agent.py | 23 +++++++++++++++-------- src/stretch/agent/zmq_client.py | 8 ++++++++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 7ef5d8063..93aa0bb80 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -163,13 +163,18 @@ def __init__( self._get_observations_thread = Thread(target=self.get_observations_loop) self._get_observations_thread.start() - self.context = zmq.Context() + # Set up ZMQ subscriber + self.context = self.robot.get_zmq_context() self.sub_socket = self.context.socket(zmq.SUB) self.sub_socket.connect(f"tcp://localhost:{obs_sub_port}") self.sub_socket.setsockopt_string(zmq.SUBSCRIBE, "") + else: + self._update_map_thread = None + self._get_observations_thread = None def __del__(self): - self._update_map_thread.join() + if self._update_map_thread is not None and self._update_map_thread.is_alive(): + self._update_map_thread.join() def _create_voxel_map(self, parameters: Parameters) -> SparseVoxelMap: """Create a voxel map from parameters. @@ -452,8 +457,8 @@ def get_command(self): else: return self.ask("Please type any task you want the robot to do: ") - def show_map(self): - """Helper function to visualize the 3d map as it stands right now""" + def show_map(self) -> None: + """Helper function to visualize the 3d map as it stands right now.""" self.voxel_map.show( orig=np.zeros(3), xyt=self.robot.get_base_pose(), @@ -461,8 +466,10 @@ def show_map(self): instances=self.semantic_sensor is not None, ) - def get_observations_loop(self): - while True: + def get_observations_loop(self) -> None: + """Threaded function that gets observations in real-time. This is useful for when we are processing real-time updates.""" + + while self.robot.running: obs = None t0 = timeit.default_timer() @@ -618,8 +625,8 @@ def update_map_with_pose_graph(self): self._obs_history_lock.release() def update_map_loop(self): - """Threaded function that updates our voxel map in real-time""" - while True: + """Threaded function that updates our voxel map in real-time.""" + while self.robot.running: with self._robot_lock: self.update_map_with_pose_graph() time.sleep(0.5) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 7ff198213..1c0643a80 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -71,6 +71,14 @@ def _create_recv_socket( return recv_socket + def get_zmq_context(self) -> zmq.Context: + """Get the ZMQ context for the client. 
+ + Returns: + zmq.Context: The ZMQ context + """ + return self.context + def _create_pub_obs_socket(self, port: int): send_socket = self.context.socket(zmq.PUB) send_socket.setsockopt(zmq.SNDHWM, 1) From f8478bbb3b276215556677f40fe065a5fee2e772 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 12:03:41 -0400 Subject: [PATCH 356/410] remove unnecessary code --- src/stretch/agent/robot_agent.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 93aa0bb80..6bb3531af 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -58,7 +58,6 @@ def __init__( semantic_sensor: Optional[OvmmPerception] = None, grasp_client: Optional[AbstractGraspClient] = None, voxel_map: Optional[SparseVoxelMap] = None, - debug_instances: bool = True, show_instances_detected: bool = False, use_instance_memory: bool = False, realtime_updates: bool = False, @@ -73,7 +72,6 @@ def __init__( raise RuntimeError(f"parameters of unsupported type: {type(parameters)}") self.robot = robot self.grasp_client = grasp_client - self.debug_instances = debug_instances self.show_instances_detected = show_instances_detected self.semantic_sensor = semantic_sensor From f0c53b07cb9dee50e8000d86a8261a724a0e6071 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 12:06:42 -0400 Subject: [PATCH 357/410] update --- src/stretch/motion/kinematics.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/stretch/motion/kinematics.py b/src/stretch/motion/kinematics.py index fe1391d1a..ef9fa3cb4 100644 --- a/src/stretch/motion/kinematics.py +++ b/src/stretch/motion/kinematics.py @@ -284,7 +284,7 @@ def config_close_gripper(self, q): q[HelloStretchIdx.GRIPPER] = self.range[HelloStretchIdx.GRIPPER][0] return q - def _update_joints(self): + def _update_joints(self, verbose: bool = False): """Get joint info from URDF or otherwise provide it""" self.joint_idx = [-1] * self.dof # Get the joint info we need from this @@ -348,10 +348,11 @@ def _update_joints(self): # TODO: gripper self.gripper_idx = [] - for i in ["right", "left"]: - joint = self.ref.get_joint_info_by_name("joint_gripper_finger_%s" % i) + for side in ["right", "left"]: + joint = self.ref.get_joint_info_by_name("joint_gripper_finger_%s" % side) self.gripper_idx.append(joint.index) - print(i, joint.name, joint.lower_limit, joint.upper_limit) + if verbose: + print(side, joint.name, joint.lower_limit, joint.upper_limit) self.range[HelloStretchIdx.GRIPPER] = ( np.array([joint.lower_limit, joint.upper_limit]) * 0.5 ) From c7f7077fdd3d566798d0806e50d23e63546b9435 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 12:14:11 -0400 Subject: [PATCH 358/410] publsh observations --- src/stretch/agent/zmq_client.py | 12 ++++++++++-- src/stretch/motion/pinocchio_ik_solver.py | 6 +++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 1c0643a80..d3a9b685a 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -111,6 +111,7 @@ def __init__( start_immediately: bool = True, enable_rerun_server: bool = True, resend_all_actions: bool = False, + publish_observations: bool = False, ): """ Create a client to communicate with the robot over ZMQ. @@ -138,6 +139,7 @@ def __init__( # Resend all actions immediately - helps if we are losing packets or something? 
self._resend_all_actions = resend_all_actions + self._publish_observations = publish_observations if parameters is None: parameters = get_parameters("default_planner.yaml") @@ -189,7 +191,10 @@ def __init__( recv_servo_port, robot_ip, use_remote_computer, message_type="visual servoing data" ) - self.pub_obs_socket = self._create_pub_obs_socket(pub_obs_port) + if self._publish_observations: + self.pub_obs_socket = self._create_pub_obs_socket(pub_obs_port) + else: + self.pub_obs_socket = None # SEnd actions back to the robot for execution self.send_socket = self.context.socket(zmq.PUB) @@ -1037,7 +1042,8 @@ def _update_obs(self, obs): """Update observation internally with lock""" with self._obs_lock: self._obs = obs - self.pub_obs_socket.send_pyobj(obs) + if self._publish_observations: + self.pub_obs_socket.send_pyobj(obs) self._last_step = obs["step"] if self._iter <= 0: self._iter = max(self._last_step, self._iter) @@ -1535,6 +1541,8 @@ def stop(self): self.recv_state_socket.close() self.recv_servo_socket.close() self.send_socket.close() + if self.pub_obs_socket is not None: + self.pub_obs_socket.close() self.context.term() diff --git a/src/stretch/motion/pinocchio_ik_solver.py b/src/stretch/motion/pinocchio_ik_solver.py index 24bf88673..45caa8da6 100644 --- a/src/stretch/motion/pinocchio_ik_solver.py +++ b/src/stretch/motion/pinocchio_ik_solver.py @@ -17,8 +17,12 @@ import pinocchio from scipy.spatial.transform import Rotation as R -import stretch.utils.logger as logger from stretch.motion.base.ik_solver_base import IKSolverBase +from stretch.utils.logger import Logger + +# Create a logger and suppress info messages +logger = Logger(__name__) +logger.hide_info() # --DEFAULTS-- # Error tolerances From 40a954cdc6f9e0f3cacae8b288dce2180751f721 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 13:09:14 -0400 Subject: [PATCH 359/410] updating code --- src/stretch/agent/zmq_client.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index d3a9b685a..a3e9b3df4 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -1456,15 +1456,33 @@ def blocking_spin_rerun(self) -> None: @property def is_homed(self) -> bool: - return self._state is not None and self._state["is_homed"] + """Is the robot homed? + + Returns: + bool: whether the robot is homed + """ + # This is not really thread safe + with self._state_lock: + return self._state is not None and self._state["is_homed"] @property def is_runstopped(self) -> bool: - return self._state is not None and self._state["is_runstopped"] + """Is the robot runstopped? + + Returns: + bool: whether the robot is runstopped + """ + with self._state_lock: + return self._state is not None and self._state["is_runstopped"] def start(self) -> bool: - """Start running blocking thread in a separate thread""" + """Start running blocking thread in a separate thread. This will wait for observations to come in and update internal state. 
+ + Returns: + bool: whether the client was started successfully + """ if self._started: + # Already started return True self._thread = threading.Thread(target=self.blocking_spin) @@ -1523,9 +1541,11 @@ def start(self) -> bool: return True def __del__(self): + """Destructor to make sure we stop the client when it is deleted""" self.stop() def stop(self): + """Stop the client and close all sockets""" self._finish = True if self._thread is not None: self._thread.join() From 7d3b7b7d2b306d9124187ab661fa1cee7e78a85b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 13:10:22 -0400 Subject: [PATCH 360/410] more code docs changes --- src/stretch/agent/zmq_client.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index a3e9b3df4..b7905e110 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -1391,12 +1391,20 @@ def update_servo(self, message): self._servo = observation def get_servo_observation(self): - """Get the current servo observation""" + """Get the current servo observation. + + Returns: + Observations: the current servo observation + """ with self._servo_lock: return self._servo def blocking_spin_servo(self, verbose: bool = False): - """Listen for servo messages coming from the robot, i.e. low res images for ML state""" + """Listen for servo messages coming from the robot, i.e. low res images for ML state. This is intended to be run in a separate thread. + + Args: + verbose (bool): whether to print out debug information + """ sum_time = 0.0 steps = 0 t0 = timeit.default_timer() @@ -1415,11 +1423,19 @@ def blocking_spin_servo(self, verbose: bool = False): @property def running(self) -> bool: - """Is the client running""" + """Is the client running? Best practice is to check this during while loops. + + Returns: + bool: whether the client is running + """ return not self._finish def is_running(self) -> bool: - """Is the client running""" + """Is the client running? Best practice is to check this during while loops. 
+ + Returns: + bool: whether the client is running + """ return not self._finish def say(self, text: str): From 08d23288b17a7fddb69649b3d9b3d0e81df43b0b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 15:30:46 -0400 Subject: [PATCH 361/410] making sure robot is not moving at all --- src/stretch/agent/zmq_client.py | 40 +++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index b7905e110..3a893cebe 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -795,6 +795,10 @@ def _wait_for_head( # Wait for the head to move # If the head is not moving, we are done # Head must be stationary for at least min_wait_time + verbose = True + prev_joint_positions = None + prev_t = None + prev_xyt = None while not self._finish: joint_positions, joint_velocities, _ = self.get_joint_state() @@ -816,9 +820,37 @@ def _wait_for_head( head_speed = np.linalg.norm( joint_velocities[HelloStretchIdx.HEAD_PAN : HelloStretchIdx.HEAD_TILT] ) + + current_xyt = self.get_base_pose() + if prev_xyt is not None: + xyt_diff = np.linalg.norm(prev_xyt - current_xyt) + else: + xyt_diff = float("inf") + + # Save the current xyt to compute speed + prev_xyt = current_xyt + + if prev_joint_positions is not None: + head_speed_v2 = np.linalg.norm( + joint_positions[HelloStretchIdx.HEAD_PAN : HelloStretchIdx.HEAD_TILT] + - prev_joint_positions[HelloStretchIdx.HEAD_PAN : HelloStretchIdx.HEAD_TILT] + ) / (timeit.default_timer() - prev_t) + else: + head_speed_v2 = float("inf") + + # Take the max of the two speeds + # This is to handle the case where we're getting weird measurements + head_speed = max(head_speed, head_speed_v2, xyt_diff) + + # Save the current joint positions to compute speed + prev_joint_positions = joint_positions + prev_t = timeit.default_timer() + if verbose: print("Waiting for head to move", pan_err, tilt_err, "head speed =", head_speed) - if pan_err < self._head_pan_tolerance and tilt_err < self._head_tilt_tolerance: + if head_speed > self._head_not_moving_tolerance: + at_goal = False + elif pan_err < self._head_pan_tolerance and tilt_err < self._head_tilt_tolerance: at_goal = True at_goal_t = timeit.default_timer() elif resend_action is not None: @@ -826,7 +858,11 @@ def _wait_for_head( else: at_goal = False - if at_goal and timeit.default_timer() - at_goal_t > min_wait_time: + if ( + at_goal + and timeit.default_timer() - at_goal_t > min_wait_time + and head_speed < self._head_not_moving_tolerance + ): break t1 = timeit.default_timer() From 8662eca09d33321680c3879e23078b5d2a7f9882 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 15:44:28 -0400 Subject: [PATCH 362/410] still issues with movement (especially the head) --- src/stretch/agent/zmq_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 3a893cebe..f4a2ac349 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -491,9 +491,9 @@ def head_to( whole_body_q = np.zeros(self._robot_model.dof, dtype=np.float32) whole_body_q[HelloStretchIdx.HEAD_PAN] = float(head_pan) whole_body_q[HelloStretchIdx.HEAD_TILT] = float(head_tilt) + time.sleep(0.25) self._wait_for_head(whole_body_q, block_id=step) - - time.sleep(0.1) + time.sleep(0.25) def look_front(self, blocking: bool = True, timeout: float = 10.0): """Let robot look to its front.""" From 
e589733982975ef422c4cf7ecbbb48858416b095 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Tue, 8 Oct 2024 15:54:51 -0400 Subject: [PATCH 363/410] get rid of the head rotations --- src/stretch/agent/robot_agent.py | 3 +++ src/stretch/agent/zmq_client.py | 6 ++++-- src/stretch/config/default_planner.yaml | 16 +++++++++------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 6bb3531af..600a385ad 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -48,7 +48,10 @@ class RobotAgent: update_rerun_every_time: bool = True normalize_embeddings: bool = False + # Sleep times + # This sleep is before starting a head sweep _before_head_motion_sleep_t = 0.25 + # much longer sleeps - do they fix things? _after_head_motion_sleep_t = 0.1 def __init__( diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index f4a2ac349..a780dfa21 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -139,10 +139,13 @@ def __init__( # Resend all actions immediately - helps if we are losing packets or something? self._resend_all_actions = resend_all_actions - self._publish_observations = publish_observations + self._publish_observations = ( + publish_observations or self.parameters["agent"]["use_realtime_updates"] + ) if parameters is None: parameters = get_parameters("default_planner.yaml") + self._parameters = parameters self._moving_threshold = parameters["motion"]["moving_threshold"] self._angle_threshold = parameters["motion"]["angle_threshold"] @@ -795,7 +798,6 @@ def _wait_for_head( # Wait for the head to move # If the head is not moving, we are done # Head must be stationary for at least min_wait_time - verbose = True prev_joint_positions = None prev_t = None prev_xyt = None diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index dcfdcc86b..c3636385a 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -41,7 +41,7 @@ filters: # smooth_kernel_size: 0 use_median_filter: True # median_filter_size: 4 - median_filter_size: 5 + median_filter_size: 2 median_filter_max_error: 0.01 use_derivative_filter: False derivative_filter_threshold: 0.1 @@ -66,18 +66,20 @@ motion: # wrist_roll: 0.25 # wrist_pitch: 0.25 # wrist_yaw: 0.05 - head_pan: 0.1 - head_tilt: 0.1 + head_pan: 0.01 + head_tilt: 0.01 joint_thresholds: head_not_moving_tolerance: 1.0e-4 gripper_open_threshold: 0.3 # Exploration agent: - #in_place_rotation_steps: 8 # If you are not moving the head, rotate more often - #sweep_head_on_update: False - in_place_rotation_steps: 4 - sweep_head_on_update: True + use_realtime_updates: True + realtime_update_rotation_steps: 4 + in_place_rotation_steps: 8 # If you are not moving the head, rotate more often + sweep_head_on_update: False + # in_place_rotation_steps: 4 + # sweep_head_on_update: True # Instance memory parameters # These are mostly around making sure that we reject views of objects that are too small, too spotty, too unreliable, etc. 
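For reference, the realtime-updates path enabled above (publish_observations gated on the agent.use_realtime_updates parameter) is a plain ZMQ PUB/SUB hop: the client republishes each observation on a local port and the agent's observation thread subscribes to it. The following is a minimal, self-contained sketch of that pattern only; it assumes pyzmq and a placeholder port, and is not the actual StretchClient or RobotAgent wiring.

import zmq

OBS_PUB_PORT = 4450  # placeholder; the real port comes from the client configuration

context = zmq.Context()

# Publisher side: keep only the newest message so subscribers never read stale state.
pub = context.socket(zmq.PUB)
pub.setsockopt(zmq.SNDHWM, 1)
pub.bind(f"tcp://*:{OBS_PUB_PORT}")

# Subscriber side: connect locally and accept every topic.
sub = context.socket(zmq.SUB)
sub.connect(f"tcp://localhost:{OBS_PUB_PORT}")
sub.setsockopt_string(zmq.SUBSCRIBE, "")

def publish(obs: dict) -> None:
    # Pickle and send the observation dict (same send_pyobj call used in the diffs above).
    pub.send_pyobj(obs)

def receive() -> dict:
    # Block until the next observation arrives.
    return sub.recv_pyobj()
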
From bc6eb1936880d03ee29ecf3957911675c012d5e1 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 11:39:57 -0400 Subject: [PATCH 364/410] updates --- src/stretch/agent/zmq_client.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index a780dfa21..09d38dbad 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -132,6 +132,11 @@ def __init__( self.send_port = send_port self.reset() + # Load parameters + if parameters is None: + parameters = get_parameters("default_planner.yaml") + self._parameters = parameters + # Variables we set here should not change self._iter = -1 # Tracks number of actions set, never reset this self._seq_id = 0 # Number of messages we received @@ -143,10 +148,6 @@ def __init__( publish_observations or self.parameters["agent"]["use_realtime_updates"] ) - if parameters is None: - parameters = get_parameters("default_planner.yaml") - - self._parameters = parameters self._moving_threshold = parameters["motion"]["moving_threshold"] self._angle_threshold = parameters["motion"]["angle_threshold"] self._min_steps_not_moving = parameters["motion"]["min_steps_not_moving"] From 3d2f86f909bf7bac68317e4b4ab7da69985fb9ba Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 11:43:15 -0400 Subject: [PATCH 365/410] updates - less terminal spam --- src/stretch/agent/zmq_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 09d38dbad..b27d39200 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -1309,7 +1309,7 @@ def send_action( if self._resend_all_actions or force_resend: time.sleep(0.01) - logger.info("RESENDING THIS ACTION:", next_action) + # logger.info("RESENDING THIS ACTION:", next_action) self.send_socket.send_pyobj(next_action) # For tracking goal From 1a17e800982c2c6156dd02f729cad591bd97c3c0 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 11:44:48 -0400 Subject: [PATCH 366/410] debug logs --- src/stretch/agent/zmq_client.py | 2 +- src/stretch/utils/logger.py | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index b27d39200..5557a7500 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -1309,7 +1309,7 @@ def send_action( if self._resend_all_actions or force_resend: time.sleep(0.01) - # logger.info("RESENDING THIS ACTION:", next_action) + logger.debug("RESENDING THIS ACTION:", next_action) self.send_socket.send_pyobj(next_action) # For tracking goal diff --git a/src/stretch/utils/logger.py b/src/stretch/utils/logger.py index 7426a4225..3e63682b7 100644 --- a/src/stretch/utils/logger.py +++ b/src/stretch/utils/logger.py @@ -11,13 +11,23 @@ class Logger: - def __init__(self, name: str, hide_info: bool = False) -> None: + def __init__(self, name: str, hide_info: bool = False, hide_debug: bool = True) -> None: self.name = name self._hide_info = hide_info + self._hide_debug = hide_debug def hide_info(self) -> None: self._hide_info = True + def hide_debug(self) -> None: + self._hide_debug = True + + def show_info(self) -> None: + self._hide_info = False + + def show_debug(self) -> None: + self._hide_debug = False + def _flatten(self, args: tuple) -> str: """Flatten a tuple of arguments into a string joined by spaces. 
@@ -41,6 +51,11 @@ def info(self, *args) -> None: text = self._flatten(args) print(colored(text, "white")) + def debug(self, *args) -> None: + if not self._hide_debug: + text = self._flatten(args) + print(colored(text, "white")) + def warning(self, *args) -> None: text = self._flatten(args) print(colored(text, "yellow")) @@ -67,3 +82,7 @@ def warning(*args) -> None: def alert(*args) -> None: _default_logger.alert(*args) + + +def debug(*args) -> None: + _default_logger.debug(*args) From 61168e33c6c1d7b86428c06bc55bc8210eea6341 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 11:58:45 -0400 Subject: [PATCH 367/410] remove head speed xyt check --- src/stretch/agent/zmq_client.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 5557a7500..85726f108 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -801,7 +801,6 @@ def _wait_for_head( # Head must be stationary for at least min_wait_time prev_joint_positions = None prev_t = None - prev_xyt = None while not self._finish: joint_positions, joint_velocities, _ = self.get_joint_state() @@ -824,15 +823,6 @@ def _wait_for_head( joint_velocities[HelloStretchIdx.HEAD_PAN : HelloStretchIdx.HEAD_TILT] ) - current_xyt = self.get_base_pose() - if prev_xyt is not None: - xyt_diff = np.linalg.norm(prev_xyt - current_xyt) - else: - xyt_diff = float("inf") - - # Save the current xyt to compute speed - prev_xyt = current_xyt - if prev_joint_positions is not None: head_speed_v2 = np.linalg.norm( joint_positions[HelloStretchIdx.HEAD_PAN : HelloStretchIdx.HEAD_TILT] @@ -843,7 +833,7 @@ def _wait_for_head( # Take the max of the two speeds # This is to handle the case where we're getting weird measurements - head_speed = max(head_speed, head_speed_v2, xyt_diff) + head_speed = max(head_speed, head_speed_v2) # Save the current joint positions to compute speed prev_joint_positions = joint_positions From 12f679909fd6af5cf8832848a48fa97c337b8d66 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 09:07:58 -0700 Subject: [PATCH 368/410] update robot --- src/stretch/core/robot.py | 4 ++++ src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/stretch/core/robot.py b/src/stretch/core/robot.py index 0dccd6807..9f1e2c37a 100644 --- a/src/stretch/core/robot.py +++ b/src/stretch/core/robot.py @@ -57,6 +57,10 @@ def reset(self): def switch_to_navigation_mode(self): raise NotImplementedError() + @property + def control_mode(self): + return self._base_control_mode + def in_manipulation_mode(self) -> bool: """is the robot ready to grasp""" return self._base_control_mode == ControlMode.MANIPULATION diff --git a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py index 2f048a7eb..69d394e6a 100755 --- a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py +++ b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/server.py @@ -87,7 +87,6 @@ def get_full_observation_message(self) -> Dict[str, Any]: "lidar_points": obs.lidar_points, "lidar_timestamp": obs.lidar_timestamp, "pose_graph": self.client.get_pose_graph(), - "control_mode": self.get_control_mode(), "last_motion_failed": self.client.last_motion_failed(), "recv_address": self.recv_address, "step": self._last_step, From df543a47511e475706834aede9017c5dbc2f2bb5 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 
9 Oct 2024 12:08:37 -0400 Subject: [PATCH 369/410] lower mapping threshold again --- src/stretch/config/default_planner.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index c3636385a..4363747e0 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -9,7 +9,7 @@ tts_engine: "gTTS" # Sparse Voxel Map parameters voxel_size: 0.04 # Size of a voxel in meters -obs_min_height: 0.15 # Ignore things less than this high when planning motions +obs_min_height: 0.10 # Ignore things less than this high when planning motions obs_max_height: 1.8 # Ignore things over this height (eg ceilings) neg_obs_height: -0.05 # Things less than this height ARE obstacles use_negative_obstacles: True # Use the negative height as an obstacle From 24ec4045819c0b09f59dc4253a8dc68caa5adf4f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 12:25:14 -0400 Subject: [PATCH 370/410] modify waiting code to make it a bit more clear the robot has stopped --- src/stretch/agent/zmq_client.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 85726f108..f504809fe 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -951,7 +951,7 @@ def _wait_for_mode( assert self._control_mode == mode return True - def _wait_for_action( + def _wait_for_base_motion( self, block_id: int, verbose: bool = False, @@ -979,6 +979,7 @@ def _wait_for_action( print("=" * 20, f"Waiting for {block_id} at goal", "=" * 20) last_pos = None last_ang = None + last_obs_t = None not_moving_count = 0 if moving_threshold is None: moving_threshold = self._moving_threshold @@ -1002,22 +1003,31 @@ def _wait_for_action( xyt = self.get_base_pose() pos = xyt[:2] ang = xyt[2] + obs_t = timeit.default_timer() if not self.at_goal(): t0 = timeit.default_timer() continue - moved_dist = np.linalg.norm(pos - last_pos) if last_pos is not None else 0 - angle_dist = angle_difference(ang, last_ang) if last_ang is not None else 0 + moved_dist = np.linalg.norm(pos - last_pos) if last_pos is not None else float("inf") + angle_dist = angle_difference(ang, last_ang) if last_ang is not None else float("inf") if goal_angle is not None: angle_dist_to_goal = angle_difference(ang, goal_angle) at_goal = angle_dist_to_goal < goal_angle_threshold else: at_goal = True + + moved_speed = ( + moved_dist / (obs_t - last_obs_t) if last_obs_t is not None else float("inf") + ) + angle_speed = ( + angle_dist / (obs_t - last_obs_t) if last_obs_t is not None else float("inf") + ) + not_moving = ( last_pos is not None - and moved_dist < moving_threshold - and angle_dist < angle_threshold + and moved_speed < moving_threshold + and angle_speed < angle_threshold ) if not_moving: not_moving_count += 1 @@ -1028,6 +1038,7 @@ def _wait_for_action( # If we are at the goal, we can stop if we are not moving last_pos = pos last_ang = ang + last_obs_t = obs_t close_to_goal = at_goal if verbose: print( @@ -1315,7 +1326,7 @@ def send_action( # time.sleep(0.1) if blocking: # Wait for the command to finish - self._wait_for_action( + self._wait_for_base_motion( block_id, goal_angle=goal_angle, verbose=verbose, From 0a2723b0aec5c298e762205a86648c985b33c82d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 12:30:27 -0400 Subject: [PATCH 371/410] update code and memory - try to make sure that it works 
properly --- src/stretch/agent/robot_agent.py | 9 +++++++++ src/stretch/utils/memory.py | 20 ++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 600a385ad..e67082ed3 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1305,6 +1305,15 @@ def go_home(self): print(f"- Current pose is valid: {self.space.is_valid(self.robot.get_base_pose())}") print(f"- start pose is valid: {self.space.is_valid(start)}") print(f"- goal pose is valid: {self.space.is_valid(goal)}") + + # If the start is invalid, try to recover + if not self.space.is_valid(start): + print("Start is not valid. Trying to recover...") + ok = self.recover_from_invalid_start() + if not ok: + print("Failed to recover from invalid start state!") + return + res = self.planner.plan(start, goal) # if it fails, skip; else, execute a trajectory to this position if res.success: diff --git a/src/stretch/utils/memory.py b/src/stretch/utils/memory.py index 04b3ac33a..2f635816d 100644 --- a/src/stretch/utils/memory.py +++ b/src/stretch/utils/memory.py @@ -8,6 +8,8 @@ # license information maybe found below, if so. import os +import shutil +from datetime import datetime path = os.path.expanduser("~/.stretch") @@ -68,3 +70,21 @@ def get_path_to_debug(name: str) -> str: def get_path_to_default_credentials() -> str: """Gets the path to the default credentials file""" return os.path.join(path, "credentials.json") + + +def get_path_to_saved_map() -> str: + """Gets the path to the saved map file""" + return os.path.join(path, "map.pkl") + + +def get_path_to_backup_saved_map(timestamp: str) -> str: + """Backup the saved map file""" + return os.path.join(path, "backup", f"map_{timestamp}.pkl") + + +def backup_saved_map(): + """Backup the saved map file to a new location""" + timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + _ensure_path_exists() + os.makedirs(os.path.join(path, "backup"), exist_ok=True) + shutil.copyfile(get_path_to_saved_map(), get_path_to_backup_saved_map(timestamp)) From 310fb618310d6364b2e658a16a6b64426fdb6aab Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 12:31:18 -0400 Subject: [PATCH 372/410] expand documentation --- src/stretch/agent/robot_agent.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index e67082ed3..bd0375b30 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1293,7 +1293,12 @@ def filter_matches( return filtered_matches def go_home(self): - """Simple helper function to send the robot home safely after a trial. This will use the current map and motion plan all the way there.""" + """Simple helper function to send the robot home safely after a trial. This will use the current map and motion plan all the way there. This is a blocking call, so it will return when the robot is at home. + + If the robot is not able to plan to home, it will print a message and return without moving the robot. + + If the start state is invalid, it will try to recover from the invalid start state before planning to home. If it fails to recover, it will print a message and return without moving the robot. 
+ """ print("Go back to (0, 0, 0) to finish...") print("- change posture and switch to navigation mode") self.current_state = "NAV_TO_HOME" From b0c7da3b5a9170330c58d66d1caccd5649c6c4f1 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 12:32:54 -0400 Subject: [PATCH 373/410] update --- src/stretch/agent/robot_agent.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index bd0375b30..99c8b217d 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1329,6 +1329,17 @@ def go_home(self): print("Can't go home; planning failed!") def choose_best_goal_instance(self, goal: str, debug: bool = False) -> Instance: + """Choose the best instance to move to based on the goal. This is done by comparing the goal to the embeddings of the instances in the world. The instance with the highest score is returned. If debug is true, we also print out the scores for the goal and two negative examples. These examples are: + - "the color purple" + - "a blank white wall" + + Args: + goal(str): the goal to move to + debug(bool): whether to print out debug information + + Returns: + Instance: the best instance to move to + """ instances = self.voxel_map.get_instances() goal_emb = self.encode_text(goal) if debug: @@ -1345,10 +1356,10 @@ def choose_best_goal_instance(self, goal: str, debug: bool = False) -> Instance: img_emb = instance.get_image_embedding( aggregation_method="mean", normalize=self.normalize_embeddings ) - goal_score = torch.matmul(goal_emb, img_emb).item() + goal_score = self.compare_embeddings(goal_emb, img_emb) if debug: - neg1_score = torch.matmul(neg1_emb, img_emb).item() - neg2_score = torch.matmul(neg2_emb, img_emb).item() + neg1_score = self.compare_embeddings(neg1_emb, img_emb) + neg2_score = self.compare_embeddings(neg2_emb, img_emb) print("scores =", goal_score, neg1_score, neg2_score) if goal_score > best_score: best_instance = instance From 9b315e093074ee282fb8c4133934b3f93d7498d1 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 12:34:00 -0400 Subject: [PATCH 374/410] improve documentation --- src/stretch/agent/robot_agent.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 99c8b217d..0ccec1df2 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1384,7 +1384,19 @@ def plan_to_frontier( fix_random_seed: bool = False, verbose: bool = True, ) -> PlanResult: - """Motion plan to a frontier location.""" + """Motion plan to a frontier location. This is a location that is on the edge of the explored space. We use the voxel grid map created by our collector to sample free space, and then use our motion planner (RRT for now) to get there. At the end, we plan back to (0,0,0). 
+ + Args: + start(np.ndarray): the start position + manual_wait(bool): whether to wait for user input + random_goals(bool): whether to sample random goals + try_to_plan_iter(int): the number of tries to find a plan + fix_random_seed(bool): whether to fix the random seed + verbose(bool): extra info is printed + + Returns: + PlanResult: the result of the motion planner + """ start_is_valid = self.space.is_valid(start, verbose=True) # if start is not valid move backwards a bit if not start_is_valid: From c96fa08d359b794b70b88f208396236e05cf2923 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 12:49:05 -0400 Subject: [PATCH 375/410] Update to use the default saved memory wherever possible --- src/stretch/agent/robot_agent.py | 16 ++++++++++++++++ src/stretch/app/mapping.py | 2 ++ src/stretch/app/read_map.py | 14 +++++++++++--- src/stretch/utils/memory.py | 21 ++++++++++++++++++--- 4 files changed, 47 insertions(+), 6 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 0ccec1df2..227555f98 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -24,6 +24,7 @@ from PIL import Image import stretch.utils.logger as logger +import stretch.utils.memory as memory from stretch.audio.text_to_speech import get_text_to_speech from stretch.core.interfaces import Observations from stretch.core.parameters import Parameters @@ -1967,3 +1968,18 @@ def get_object_centric_observations( scene_graph, ) return world_representation + + def save_map(self, filename: Optional[str] = None) -> None: + """Save the current map to a file. + + Args: + filename(str): the name of the file to save the map to + """ + if filename is None or filename == "": + filename = memory.get_path_to_saved_map() + + # Backup the saved map + memory.backup_saved_map() + + # Write the new map to the file + self.voxel_map.save_to_pickle(filename) diff --git a/src/stretch/app/mapping.py b/src/stretch/app/mapping.py index 424674dd7..4bf7579c4 100644 --- a/src/stretch/app/mapping.py +++ b/src/stretch/app/mapping.py @@ -218,6 +218,8 @@ def demo_main( print(f"Write pkl to {output_pkl_filename}...") demo.voxel_map.write_to_pickle(output_pkl_filename) + demo.save_map() + if write_instance_images: demo.save_instance_images(".") diff --git a/src/stretch/app/read_map.py b/src/stretch/app/read_map.py index 4a65f2a91..e9e4d3544 100644 --- a/src/stretch/app/read_map.py +++ b/src/stretch/app/read_map.py @@ -24,13 +24,17 @@ import matplotlib.pyplot as plt import numpy as np -import stretch.utils.logger as logger +import stretch.utils.memory as memory from stretch.agent import RobotAgent from stretch.core import get_parameters from stretch.mapping import SparseVoxelMap from stretch.perception import create_semantic_sensor from stretch.utils.dummy_stretch_client import DummyStretchClient from stretch.utils.geometry import xyt_global_to_base +from stretch.utils.logger import Logger + +# Set up logging for the script +logger = Logger(__name__) def plan_to_deltas(xyt0, plan): @@ -50,8 +54,8 @@ def plan_to_deltas(xyt0, plan): "--input-path", "-i", type=click.Path(), - default="output.pkl", - help="Input path with default value 'output.npy'", + default="", + help="Input path to the PKL file. 
If none is provided, try to load the default PKL.", ) @click.option( "--config-path", @@ -153,6 +157,10 @@ def main( test_remove: bool = False, ): """Simple script to load a voxel map""" + if len(input_path) == 0: + # Load the default path + input_path = memory.get_path_to_saved_map() + input_path = Path(input_path) print("Loading:", input_path) if pkl_is_svm: diff --git a/src/stretch/utils/memory.py b/src/stretch/utils/memory.py index 2f635816d..cc0422348 100644 --- a/src/stretch/utils/memory.py +++ b/src/stretch/utils/memory.py @@ -82,9 +82,24 @@ def get_path_to_backup_saved_map(timestamp: str) -> str: return os.path.join(path, "backup", f"map_{timestamp}.pkl") -def backup_saved_map(): - """Backup the saved map file to a new location""" +def backup_saved_map() -> bool: + """Backup the saved map file to a new location. + + Returns: + True if the file is backed up, False otherwise + """ + + # Create a timestamp for the backup timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + + # Ensure the path exists _ensure_path_exists() + + # Create a backup directory if it doesn't exist os.makedirs(os.path.join(path, "backup"), exist_ok=True) - shutil.copyfile(get_path_to_saved_map(), get_path_to_backup_saved_map(timestamp)) + + # Check to see if a file exists first + if os.path.exists(get_path_to_saved_map()): + shutil.copyfile(get_path_to_saved_map(), get_path_to_backup_saved_map(timestamp)) + return True + return False From 23ec459a08558530cd245c0f37439765ce6f8e10 Mon Sep 17 00:00:00 2001 From: Atharva Pusalkar Date: Mon, 7 Oct 2024 10:16:18 -0400 Subject: [PATCH 376/410] Add speak and pickup tasks --- .../agent/task/pickup/pickup_executor.py | 150 ++++++++++++++++++ src/stretch/llms/prompts/pickup_prompt.py | 122 +++++++------- 2 files changed, 211 insertions(+), 61 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index 3e1399786..cf58cd4cb 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -9,10 +9,25 @@ from typing import List, Tuple +from stretch.agent.operations import ( + GoToNavOperation, + GoToOperation, + GraspObjectOperation, + NavigateToObjectOperation, + OpenLoopGraspObjectOperation, + PreGraspObjectOperation, + PlaceObjectOperation, + RotateInPlaceOperation, + SearchForObjectOnFloorOperation, + SearchForReceptacleOperation, + SpeakOperation, + WaveOperation, +) from stretch.agent.robot_agent import RobotAgent from stretch.agent.task.emote import EmoteTask from stretch.agent.task.pickup.pickup_task import PickupTask from stretch.core import AbstractRobotClient +from stretch.core.task import Operation, Task from stretch.utils.logger import Logger logger = Logger(__name__) @@ -56,6 +71,141 @@ def __init__( self._match_method = match_method self._open_loop = open_loop + # Managed task + self._task = Task() + self._operation_count = 0 + + + def say(self, text: str) -> None: + """Use the robot to say the text.""" + say_operation = SpeakOperation(f"{str(self._operation_count)}_say_" + text, agent=self.agent, robot=self.robot) + say_operation.configure(message=text) + self._task.add_operation(say_operation, True) + self._operation_count += 1 + + def pickup(self, target_object: str, target_receptacle: str) -> None: + """Pick up the object and place it in the receptacle.""" + # Put the robot into navigation mode + go_to_navigation_mode = GoToNavOperation( + f"{str(self._operation_count)}_go to navigation mode", self.agent, retry_on_failure=True + ) 
+ self._operation_count += 1 + + # Spin in place to find objects. + rotate_in_place = RotateInPlaceOperation( + f"{str(self._operation_count)}_rotate_in_place", self.agent, parent=go_to_navigation_mode + ) + self._operation_count += 1 + + # Look for the target receptacle + search_for_receptacle = SearchForReceptacleOperation( + f"{str(self._operation_count)}_search_for_{self.target_receptacle}", + self.agent, + parent=rotate_in_place, + retry_on_failure=True, + match_method=self._match_method, + ) + self._operation_count += 1 + + # Try to expand the frontier and find an object; or just wander around for a while. + search_for_object = SearchForObjectOnFloorOperation( + f"{str(self._operation_count)}_search_for_{self.target_object}_on_floor", + self.agent, + retry_on_failure=True, + match_method=self._match_method, + ) + self._operation_count += 1 + if self.agent.target_object is not None: + # Overwrite the default object to search for + search_for_object.set_target_object_class(self.agent.target_object) + if self.agent.target_receptacle is not None: + search_for_receptacle.set_target_object_class(self.agent.target_receptacle) + + # After searching for object, we should go to an instance that we've found. If we cannot do that, keep searching. + go_to_object = NavigateToObjectOperation( + f"{str(self._operation_count)}_go_to_object", + self.agent, + parent=search_for_object, + on_cannot_start=search_for_object, + to_receptacle=False, + ) + self._operation_count += 1 + + # After searching for object, we should go to an instance that we've found. If we cannot do that, keep searching. + go_to_receptacle = NavigateToObjectOperation( + f"{str(self._operation_count)}_go_to_receptacle", + self.agent, + on_cannot_start=search_for_receptacle, + to_receptacle=True, + ) + self._operation_count += 1 + + # When about to start, run object detection and try to find the object. If not in front of us, explore again. + # If we cannot find the object, we should go back to the search_for_object operation. + # To determine if we can start, we just check to see if there's a detectable object nearby. + pregrasp_object = PreGraspObjectOperation( + f"{str(self._operation_count)}_prepare_to_grasp", + self.agent, + on_failure=None, + on_cannot_start=go_to_object, + retry_on_failure=True, + ) + self._operation_count += 1 + # If we cannot start, we should go back to the search_for_object operation. + # To determine if we can start, we look at the end effector camera and see if there's anything detectable. 
+ if self.use_visual_servoing_for_grasp: + grasp_object = GraspObjectOperation( + f"{str(self._operation_count)}_grasp_the_{self.target_object}", + self.agent, + parent=pregrasp_object, + on_failure=pregrasp_object, + on_cannot_start=go_to_object, + retry_on_failure=False, + ) + grasp_object.set_target_object_class(self.agent.target_object) + grasp_object.servo_to_grasp = True + grasp_object.match_method = self._match_method + self._operation_count += 1 + else: + grasp_object = OpenLoopGraspObjectOperation( + f"{str(self._operation_count)}_grasp_the_{self.target_object}", + self.agent, + parent=pregrasp_object, + on_failure=pregrasp_object, + on_cannot_start=go_to_object, + retry_on_failure=False, + ) + grasp_object.set_target_object_class(self.agent.target_object) + grasp_object.match_method = self._match_method + self._operation_count += 1 + + place_object_on_receptacle = PlaceObjectOperation( + f"{str(self._operation_count)}_place_object_on_receptacle", self.agent, on_cannot_start=go_to_receptacle + ) + self._operation_count += 1 + + self._task.add_operation(go_to_navigation_mode) + self._task.add_operation(rotate_in_place) + self._task.add_operation(search_for_receptacle) + self._task.add_operation(search_for_object) + self._task.add_operation(go_to_object) + self._task.add_operation(pregrasp_object) + self._task.add_operation(grasp_object) + self._task.add_operation(go_to_receptacle) + self._task.add_operation(place_object_on_receptacle) + + self._task.connect_on_success(go_to_navigation_mode.name, search_for_receptacle.name) + self._task.connect_on_success(search_for_receptacle.name, search_for_object.name) + self._task.connect_on_success(search_for_object.name, go_to_object.name) + self._task.connect_on_success(go_to_object.name, pregrasp_object.name) + self._task.connect_on_success(pregrasp_object.name, grasp_object.name) + self._task.connect_on_success(grasp_object.name, go_to_receptacle.name) + self._task.connect_on_success(go_to_receptacle.name, place_object_on_receptacle.name) + + self._task.connect_on_success(search_for_receptacle.name, search_for_object.name) + + self._task.connect_on_cannot_start(go_to_object.name, search_for_object.name) + def _pickup(self, target_object: str, target_receptacle: str) -> None: # After the robot has started... 
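For reference, the executor above composes behaviors by adding operations to a single managed Task and wiring them with connect_on_success edges, so a parsed pickup command expands into a linear chain (navigate, search receptacle, search object, grasp, place). Below is a minimal, self-contained sketch of that chaining pattern; Chain and Step are simplified stand-ins for the real Task and operation classes in stretch.core.task, not their actual API.

from typing import Dict, Optional


class Step:
    # Stand-in for a managed operation: run() does the work and reports success.
    def __init__(self, name: str) -> None:
        self.name = name

    def run(self) -> bool:
        print(f"running {self.name}")
        return True


class Chain:
    # Stand-in for the Task object: a set of operations plus on-success edges.
    def __init__(self) -> None:
        self._ops: Dict[str, Step] = {}
        self._on_success: Dict[str, str] = {}
        self._first: Optional[str] = None

    def add_operation(self, op: Step) -> None:
        self._ops[op.name] = op
        if self._first is None:
            self._first = op.name

    def connect_on_success(self, src: str, dst: str) -> None:
        self._on_success[src] = dst

    def run(self) -> bool:
        name = self._first
        while name is not None:
            if not self._ops[name].run():
                return False
            name = self._on_success.get(name)
        return True


# Mirror the shape of pickup(): a linear chain of success edges.
names = ["go_to_nav_mode", "search_receptacle", "search_object", "grasp", "place"]
chain = Chain()
for n in names:
    chain.add_operation(Step(n))
for src, dst in zip(names[:-1], names[1:]):
    chain.connect_on_success(src, dst)
chain.run()
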
diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 80e890cc3..f9613e824 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -153,67 +153,67 @@ def configure(self, **kwargs) -> str: def parse_response(self, response: str): """Parse the pickup, place, and say commands from the response into a list.""" - commands = [] - for line in response.split("\n"): - if line.startswith("pickup("): - commands.append(line) - elif line.startswith("place("): - commands.append(line) - elif line.startswith("say("): - commands.append(line) - elif line.startswith("wave()"): - commands.append(line) - elif line.startswith("go_home()"): - commands.append(line) - elif line.startswith("explore()"): - commands.append(line) - elif line.startswith("nod_head()"): - commands.append(line) - elif line.startswith("shake_head()"): - commands.append(line) - elif line.startswith("avert_gaze()"): - commands.append(line) - elif line.startswith("quit()"): - commands.append(line) - elif line.startswith("find("): - commands.append(line) - elif line.startswith("end()"): - # Stop parsing if we see the end command - break - - # Now go through commands and parse into a tuple (command, args) - parsed_commands = [] - for command in commands: - if command.startswith("say("): - parsed_commands.append(("say", command[4:-1])) - elif command.startswith("pickup("): - parsed_commands.append(("pickup", command[7:-1])) - elif command.startswith("place("): - parsed_commands.append(("place", command[6:-1])) - elif command.startswith("wave()"): - parsed_commands.append(("wave", "")) - elif command.startswith("go_home()"): - parsed_commands.append(("go_home", "")) - elif command.startswith("explore()"): - parsed_commands.append(("explore", "")) - elif command.startswith("nod_head()"): - parsed_commands.append(("nod_head", "")) - elif command.startswith("shake_head()"): - parsed_commands.append(("shake_head", "")) - elif command.startswith("avert_gaze()"): - parsed_commands.append(("avert_gaze", "")) - elif command.startswith("find("): - parsed_commands.append(("find", command[5:-1])) - elif command.startswith("quit()"): - # Quit actually shuts down the robot. 
- parsed_commands.append(("quit", "")) - break - elif command.startswith("end()"): - # Stop parsing if we see the end command - # This really shouldn't happen, but just in case - break - - return parsed_commands + commands = [line for line in response.split("\n") if line] + # for line in response.split("\n"): + # if line.startswith("pickup("): + # commands.append(line) + # elif line.startswith("place("): + # commands.append(line) + # elif line.startswith("say("): + # commands.append(line) + # elif line.startswith("wave()"): + # commands.append(line) + # elif line.startswith("go_home()"): + # commands.append(line) + # elif line.startswith("explore()"): + # commands.append(line) + # elif line.startswith("nod_head()"): + # commands.append(line) + # elif line.startswith("shake_head()"): + # commands.append(line) + # elif line.startswith("avert_gaze()"): + # commands.append(line) + # elif line.startswith("quit()"): + # commands.append(line) + # elif line.startswith("find("): + # commands.append(line) + # elif line.startswith("end()"): + # # Stop parsing if we see the end command + # break + + # # Now go through commands and parse into a tuple (command, args) + # parsed_commands = [] + # for command in commands: + # if command.startswith("say("): + # parsed_commands.append(("say", command[4:-1])) + # elif command.startswith("pickup("): + # parsed_commands.append(("pickup", command[7:-1])) + # elif command.startswith("place("): + # parsed_commands.append(("place", command[6:-1])) + # elif command.startswith("wave()"): + # parsed_commands.append(("wave", "")) + # elif command.startswith("go_home()"): + # parsed_commands.append(("go_home", "")) + # elif command.startswith("explore()"): + # parsed_commands.append(("explore", "")) + # elif command.startswith("nod_head()"): + # parsed_commands.append(("nod_head", "")) + # elif command.startswith("shake_head()"): + # parsed_commands.append(("shake_head", "")) + # elif command.startswith("avert_gaze()"): + # parsed_commands.append(("avert_gaze", "")) + # elif command.startswith("find("): + # parsed_commands.append(("find", command[5:-1])) + # elif command.startswith("quit()"): + # # Quit actually shuts down the robot. 
+ # parsed_commands.append(("quit", "")) + # break + # elif command.startswith("end()"): + # # Stop parsing if we see the end command + # # This really shouldn't happen, but just in case + # break + + return commands def get_object(self, response: List[Tuple[str, str]]) -> str: """Return the object from the response.""" From eed46a4353f196f31ce92ee54a88577991689180 Mon Sep 17 00:00:00 2001 From: Atharva Pusalkar Date: Mon, 7 Oct 2024 11:27:51 -0400 Subject: [PATCH 377/410] Replace conditional checks with operations --- src/stretch/agent/operations/__init__.py | 2 + src/stretch/agent/operations/explore.py | 28 +++++ src/stretch/agent/operations/go_home.py | 28 +++++ .../agent/task/pickup/pickup_executor.py | 106 +++++++++++------- src/stretch/app/ai_pickup.py | 2 +- src/stretch/llms/base.py | 4 + src/stretch/llms/prompts/pickup_prompt.py | 73 +++--------- 7 files changed, 142 insertions(+), 101 deletions(-) create mode 100644 src/stretch/agent/operations/explore.py create mode 100644 src/stretch/agent/operations/go_home.py diff --git a/src/stretch/agent/operations/__init__.py b/src/stretch/agent/operations/__init__.py index 4d7a4dee7..e49ce5416 100644 --- a/src/stretch/agent/operations/__init__.py +++ b/src/stretch/agent/operations/__init__.py @@ -18,6 +18,8 @@ ) # from .grasp_closed_loop import ClosedLoopGraspObjectOperation +from .explore import ExploreOperation +from .go_home import GoHomeOperation from .go_to import GoToOperation from .grasp_object import GraspObjectOperation from .grasp_open_loop import OpenLoopGraspObjectOperation diff --git a/src/stretch/agent/operations/explore.py b/src/stretch/agent/operations/explore.py new file mode 100644 index 000000000..1c14abf4e --- /dev/null +++ b/src/stretch/agent/operations/explore.py @@ -0,0 +1,28 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. + +from stretch.agent.base import ManagedOperation + + +class ExploreOperation(ManagedOperation): + """Make the robot explore the environment""" + + _successful = False + + def can_start(self) -> bool: + return self.agent is not None + + def run(self) -> None: + self.intro(f"Attempting to go home") + self.agent.run_exploration() + self._successful = True + self.cheer(f"Done going home") + + def was_successful(self) -> bool: + return self._successful diff --git a/src/stretch/agent/operations/go_home.py b/src/stretch/agent/operations/go_home.py new file mode 100644 index 000000000..3e0b735cb --- /dev/null +++ b/src/stretch/agent/operations/go_home.py @@ -0,0 +1,28 @@ +# Copyright (c) Hello Robot, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in the root directory +# of this source tree. +# +# Some code may be adapted from other open-source works with their respective licenses. Original +# license information maybe found below, if so. 
+ +from stretch.agent.base import ManagedOperation + + +class GoHomeOperation(ManagedOperation): + """Make the robot go home""" + + _successful = False + + def can_start(self) -> bool: + return self.agent is not None + + def run(self) -> None: + self.intro(f"Attempting to go home") + self.agent.go_home() + self._successful = True + self.cheer(f"Done going home") + + def was_successful(self) -> bool: + return self._successful diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index cf58cd4cb..ba62acee9 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -10,16 +10,21 @@ from typing import List, Tuple from stretch.agent.operations import ( + AvertGazeOperation, + ExploreOperation, + GoHomeOperation, GoToNavOperation, GoToOperation, GraspObjectOperation, NavigateToObjectOperation, + NodHeadOperation, OpenLoopGraspObjectOperation, PreGraspObjectOperation, PlaceObjectOperation, RotateInPlaceOperation, SearchForObjectOnFloorOperation, SearchForReceptacleOperation, + ShakeHeadOperation, SpeakOperation, WaveOperation, ) @@ -44,6 +49,7 @@ def __init__( self, robot: AbstractRobotClient, agent: RobotAgent, + available_actions: List[str], match_method: str = "feature", open_loop: bool = False, dry_run: bool = False, @@ -74,6 +80,7 @@ def __init__( # Managed task self._task = Task() self._operation_count = 0 + self.available_actions = available_actions def say(self, text: str) -> None: @@ -206,6 +213,51 @@ def pickup(self, target_object: str, target_receptacle: str) -> None: self._task.connect_on_cannot_start(go_to_object.name, search_for_object.name) + def wave(self) -> None: + """Wave to the user.""" + wave_operation = WaveOperation(f"{str(self._operation_count)}_wave", self.agent, robot=self.robot) + self._task.add_operation(wave_operation, True) + self._operation_count += 1 + + def go_home(self) -> None: + """Go back to the home position.""" + go_home_operation = GoHomeOperation(f"{str(self._operation_count)}_go_home", self.agent, robot=self.robot) + self._task.add_operation(go_home_operation, True) + self._operation_count += 1 + + def explore(self) -> None: + """Explore the environment.""" + explore_operation = ExploreOperation(f"{str(self._operation_count)}_explore", self.agent, robot=self.robot) + self._task.add_operation(explore_operation, True) + self._operation_count += 1 + + def nod_head(self) -> None: + """Nod the head.""" + nod_head_operation = NodHeadOperation(f"{str(self._operation_count)}_nod_head", self.agent, robot=self.robot) + self._task.add_operation(nod_head_operation, True) + self._operation_count += 1 + + def shake_head(self) -> None: + """Shake the head.""" + shake_head_operation = ShakeHeadOperation(f"{str(self._operation_count)}_shake_head", self.agent, robot=self.robot) + self._task.add_operation(shake_head_operation, True) + self._operation_count += 1 + + def avert_gaze(self) -> None: + """Avert the gaze.""" + avert_gaze_operation = AvertGazeOperation(f"{str(self._operation_count)}_avert_gaze", self.agent, robot=self.robot) + self._task.add_operation(avert_gaze_operation, True) + self._operation_count += 1 + + def find(self, object_name: str) -> None: + """Find the object.""" + speak_not_implemented = SpeakOperation( + f"{str(self._operation_count)}_find_" + object_name, agent=self.agent, robot=self.robot + ) + speak_not_implemented.configure(message="Find operation not implemented") + self._task.add_operation(speak_not_implemented, True) + 
self._operation_count += 1 + def _pickup(self, target_object: str, target_receptacle: str) -> None: # After the robot has started... @@ -248,49 +300,21 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: while i < len(response): command, args = response[i] logger.info(f"{i} {command} {args}") - if command == "say": - # Use TTS to say the text - logger.info(f"[Pickup task] Saying: {args}") - self.agent.robot_say(args) - elif command == "pickup": - logger.info(f"[Pickup task] Pickup: {args}") - target_object = args - i += 1 - next_command, next_args = response[i] - if next_command != "place": - i -= 1 - logger.error("Pickup without place! Doing nothing.") + + if command in self.available_actions: + command_with_args = "self." + command + "(" + args + ")" + + if command == "pickup": + if (i + 1 < len(response)) and (response[i + 1][0] == "place"): + eval(command_with_args) + i += 1 + else: + logger.error("Pickup without place! Doing nothing.") else: - logger.info(f"{i} {next_command} {next_args}") - logger.info(f"[Pickup task] Place: {next_args}") - target_receptacle = next_args - self._pickup(target_object, target_receptacle) - elif command == "place": - logger.error("Place without pickup! Doing nothing.") - elif command == "wave": - self.agent.move_to_manip_posture() - self.emote_task.get_task("wave").run() - self.agent.move_to_manip_posture() - elif command == "go_home": - self.agent.go_home() - elif command == "explore": - self.agent.explore() - elif command == "nod_head": - self.emote_task.get_task("nod_head").run() - elif command == "shake_head": - self.emote_task.get_task("shake_head").run() - elif command == "avert_gaze": - self.emote_task.get_task("avert_gaze").run() - elif command == "quit": - logger.info("[Pickup task] Quitting.") - self.robot.stop() - return False - elif command == "end": - logger.info("[Pickup task] Ending.") - break + eval(command_with_args) else: logger.error(f"Skipping unknown command: {command}") - + i += 1 - # If we did not explicitly receive a quit command, we are not yet done. + return True diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 0d99e023d..e202ca0d6 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -152,7 +152,7 @@ def main( # Create the prompt we will use to control the robot prompt = PickupPromptBuilder() - executor = PickupExecutor(robot, agent, dry_run=False) + executor = PickupExecutor(robot, agent, available_actions=prompt.get_available_actions(), dry_run=False) # Get the LLM client llm_client = None diff --git a/src/stretch/llms/base.py b/src/stretch/llms/base.py index a147eff16..0195753c9 100644 --- a/src/stretch/llms/base.py +++ b/src/stretch/llms/base.py @@ -37,6 +37,10 @@ def __call__(self, kwargs: Optional[Dict[str, Any]] = None) -> str: def parse_response(self, response: str) -> Any: """Parse the response from the LLM. 
Usually does nothing.""" return response + + def get_available_actions(self) -> List[str]: + """Return a list of available actions.""" + return [] class AbstractLLMClient(ABC): diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index f9613e824..2baa53afa 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -154,66 +154,21 @@ def configure(self, **kwargs) -> str: def parse_response(self, response: str): """Parse the pickup, place, and say commands from the response into a list.""" commands = [line for line in response.split("\n") if line] - # for line in response.split("\n"): - # if line.startswith("pickup("): - # commands.append(line) - # elif line.startswith("place("): - # commands.append(line) - # elif line.startswith("say("): - # commands.append(line) - # elif line.startswith("wave()"): - # commands.append(line) - # elif line.startswith("go_home()"): - # commands.append(line) - # elif line.startswith("explore()"): - # commands.append(line) - # elif line.startswith("nod_head()"): - # commands.append(line) - # elif line.startswith("shake_head()"): - # commands.append(line) - # elif line.startswith("avert_gaze()"): - # commands.append(line) - # elif line.startswith("quit()"): - # commands.append(line) - # elif line.startswith("find("): - # commands.append(line) - # elif line.startswith("end()"): - # # Stop parsing if we see the end command - # break - - # # Now go through commands and parse into a tuple (command, args) - # parsed_commands = [] - # for command in commands: - # if command.startswith("say("): - # parsed_commands.append(("say", command[4:-1])) - # elif command.startswith("pickup("): - # parsed_commands.append(("pickup", command[7:-1])) - # elif command.startswith("place("): - # parsed_commands.append(("place", command[6:-1])) - # elif command.startswith("wave()"): - # parsed_commands.append(("wave", "")) - # elif command.startswith("go_home()"): - # parsed_commands.append(("go_home", "")) - # elif command.startswith("explore()"): - # parsed_commands.append(("explore", "")) - # elif command.startswith("nod_head()"): - # parsed_commands.append(("nod_head", "")) - # elif command.startswith("shake_head()"): - # parsed_commands.append(("shake_head", "")) - # elif command.startswith("avert_gaze()"): - # parsed_commands.append(("avert_gaze", "")) - # elif command.startswith("find("): - # parsed_commands.append(("find", command[5:-1])) - # elif command.startswith("quit()"): - # # Quit actually shuts down the robot. 
- # parsed_commands.append(("quit", "")) - # break - # elif command.startswith("end()"): - # # Stop parsing if we see the end command - # # This really shouldn't happen, but just in case - # break - return commands + + def get_available_actions(self) -> List[str]: + return super().get_available_actions() + [ + "pickup", + "place", + "say", + "wave", + "go_home", + "explore", + "nod_head", + "shake_head", + "avert_gaze", + "find", + ] def get_object(self, response: List[Tuple[str, str]]) -> str: """Return the object from the response.""" From 45fcb099e6c4d5c06023f89d2c5c4d13ceba9802 Mon Sep 17 00:00:00 2001 From: Atharva Pusalkar Date: Wed, 9 Oct 2024 13:44:56 -0400 Subject: [PATCH 378/410] Actually run the task --- .../agent/task/pickup/pickup_executor.py | 48 +++++++++++++------ src/stretch/app/ai_pickup.py | 4 +- src/stretch/llms/base.py | 2 +- src/stretch/llms/prompts/pickup_prompt.py | 2 +- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index ba62acee9..38baff510 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -14,13 +14,12 @@ ExploreOperation, GoHomeOperation, GoToNavOperation, - GoToOperation, GraspObjectOperation, NavigateToObjectOperation, NodHeadOperation, OpenLoopGraspObjectOperation, - PreGraspObjectOperation, PlaceObjectOperation, + PreGraspObjectOperation, RotateInPlaceOperation, SearchForObjectOnFloorOperation, SearchForReceptacleOperation, @@ -32,7 +31,7 @@ from stretch.agent.task.emote import EmoteTask from stretch.agent.task.pickup.pickup_task import PickupTask from stretch.core import AbstractRobotClient -from stretch.core.task import Operation, Task +from stretch.core.task import Task from stretch.utils.logger import Logger logger = Logger(__name__) @@ -82,10 +81,11 @@ def __init__( self._operation_count = 0 self.available_actions = available_actions - def say(self, text: str) -> None: """Use the robot to say the text.""" - say_operation = SpeakOperation(f"{str(self._operation_count)}_say_" + text, agent=self.agent, robot=self.robot) + say_operation = SpeakOperation( + f"{str(self._operation_count)}_say_" + text, agent=self.agent, robot=self.robot + ) say_operation.configure(message=text) self._task.add_operation(say_operation, True) self._operation_count += 1 @@ -100,7 +100,9 @@ def pickup(self, target_object: str, target_receptacle: str) -> None: # Spin in place to find objects. 
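Commands returned by the prompt are only executed if they appear in `available_actions`. The patch builds a `self.<command>(<args>)` string and evaluates it; an equivalent dispatch can also be written with `getattr`, sketched below with a hypothetical stand-in class in which only `say` and `wave` are modeled:

```python
# Sketch of command dispatch without eval(); DispatcherSketch is hypothetical.
class DispatcherSketch:
    def __init__(self, available_actions):
        self.available_actions = available_actions

    def say(self, text: str) -> None:
        print(f"say: {text}")

    def wave(self) -> None:
        print("wave")

    def dispatch(self, command: str, args: str) -> None:
        if command not in self.available_actions:
            print(f"Skipping unknown command: {command}")
            return
        method = getattr(self, command)
        # Pass the argument only if the parser produced one.
        method(args) if args else method()


d = DispatcherSketch(["say", "wave"])
d.dispatch("say", "I am picking up the toy")  # say: I am picking up the toy
d.dispatch("wave", "")                        # wave
d.dispatch("dance", "")                       # Skipping unknown command: dance
```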
rotate_in_place = RotateInPlaceOperation( - f"{str(self._operation_count)}_rotate_in_place", self.agent, parent=go_to_navigation_mode + f"{str(self._operation_count)}_rotate_in_place", + self.agent, + parent=go_to_navigation_mode, ) self._operation_count += 1 @@ -187,7 +189,9 @@ def pickup(self, target_object: str, target_receptacle: str) -> None: self._operation_count += 1 place_object_on_receptacle = PlaceObjectOperation( - f"{str(self._operation_count)}_place_object_on_receptacle", self.agent, on_cannot_start=go_to_receptacle + f"{str(self._operation_count)}_place_object_on_receptacle", + self.agent, + on_cannot_start=go_to_receptacle, ) self._operation_count += 1 @@ -215,37 +219,49 @@ def pickup(self, target_object: str, target_receptacle: str) -> None: def wave(self) -> None: """Wave to the user.""" - wave_operation = WaveOperation(f"{str(self._operation_count)}_wave", self.agent, robot=self.robot) + wave_operation = WaveOperation( + f"{str(self._operation_count)}_wave", self.agent, robot=self.robot + ) self._task.add_operation(wave_operation, True) self._operation_count += 1 def go_home(self) -> None: """Go back to the home position.""" - go_home_operation = GoHomeOperation(f"{str(self._operation_count)}_go_home", self.agent, robot=self.robot) + go_home_operation = GoHomeOperation( + f"{str(self._operation_count)}_go_home", self.agent, robot=self.robot + ) self._task.add_operation(go_home_operation, True) self._operation_count += 1 def explore(self) -> None: """Explore the environment.""" - explore_operation = ExploreOperation(f"{str(self._operation_count)}_explore", self.agent, robot=self.robot) + explore_operation = ExploreOperation( + f"{str(self._operation_count)}_explore", self.agent, robot=self.robot + ) self._task.add_operation(explore_operation, True) self._operation_count += 1 def nod_head(self) -> None: """Nod the head.""" - nod_head_operation = NodHeadOperation(f"{str(self._operation_count)}_nod_head", self.agent, robot=self.robot) + nod_head_operation = NodHeadOperation( + f"{str(self._operation_count)}_nod_head", self.agent, robot=self.robot + ) self._task.add_operation(nod_head_operation, True) self._operation_count += 1 def shake_head(self) -> None: """Shake the head.""" - shake_head_operation = ShakeHeadOperation(f"{str(self._operation_count)}_shake_head", self.agent, robot=self.robot) + shake_head_operation = ShakeHeadOperation( + f"{str(self._operation_count)}_shake_head", self.agent, robot=self.robot + ) self._task.add_operation(shake_head_operation, True) self._operation_count += 1 def avert_gaze(self) -> None: """Avert the gaze.""" - avert_gaze_operation = AvertGazeOperation(f"{str(self._operation_count)}_avert_gaze", self.agent, robot=self.robot) + avert_gaze_operation = AvertGazeOperation( + f"{str(self._operation_count)}_avert_gaze", self.agent, robot=self.robot + ) self._task.add_operation(avert_gaze_operation, True) self._operation_count += 1 @@ -288,6 +304,8 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: True if we should keep going, False if we should stop. 
""" i = 0 + self._task = Task() + self._operation_count = 0 if response is None or len(response) == 0: logger.error("No commands to execute!") @@ -314,7 +332,9 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: eval(command_with_args) else: logger.error(f"Skipping unknown command: {command}") - + i += 1 + self._task.run() + return True diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index e202ca0d6..ca355d50f 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -152,7 +152,9 @@ def main( # Create the prompt we will use to control the robot prompt = PickupPromptBuilder() - executor = PickupExecutor(robot, agent, available_actions=prompt.get_available_actions(), dry_run=False) + executor = PickupExecutor( + robot, agent, available_actions=prompt.get_available_actions(), dry_run=False + ) # Get the LLM client llm_client = None diff --git a/src/stretch/llms/base.py b/src/stretch/llms/base.py index 0195753c9..1b88c67b7 100644 --- a/src/stretch/llms/base.py +++ b/src/stretch/llms/base.py @@ -37,7 +37,7 @@ def __call__(self, kwargs: Optional[Dict[str, Any]] = None) -> str: def parse_response(self, response: str) -> Any: """Parse the response from the LLM. Usually does nothing.""" return response - + def get_available_actions(self) -> List[str]: """Return a list of available actions.""" return [] diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index 2baa53afa..de433b27b 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -155,7 +155,7 @@ def parse_response(self, response: str): """Parse the pickup, place, and say commands from the response into a list.""" commands = [line for line in response.split("\n") if line] return commands - + def get_available_actions(self) -> List[str]: return super().get_available_actions() + [ "pickup", From fe043092bea1a0e3932baa3f2c320ab6729c31c4 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 16:55:41 -0400 Subject: [PATCH 379/410] add documentation --- src/stretch/agent/zmq_client.py | 6 ++++-- src/stretch/dynav/robot_agent_manip_mdp.py | 4 ++-- src/stretch/dynav/run_ok_nav.py | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index f504809fe..1caaafc1d 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -759,11 +759,13 @@ def switch_to_navigation_mode(self): self._wait_for_mode("navigation") assert self.in_navigation_mode() - def switch_to_manipulation_mode(self): + def switch_to_manipulation_mode(self, verbose: bool = False): next_action = {"control_mode": "manipulation"} self.send_action(next_action) time.sleep(0.1) - self._wait_for_mode("manipulation") + if verbose: + logger.info("Waiting for manipulation mode") + self._wait_for_mode("manipulation", verbose=verbose) assert self.in_manipulation_mode() def move_to_nav_posture(self): diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index 13dd039a4..cf4778218 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -28,7 +28,7 @@ INIT_WRIST_YAW, TOP_CAMERA_NODE, ) -from stretch.dynav.ok_robot_hw.robot import HelloRobot as Manipulation_Wrapper +from stretch.dynav.ok_robot_hw.robot import HelloRobot as ManipulationWrapper from stretch.dynav.ok_robot_hw.utils.grasper_utils import ( capture_and_process_image, 
move_to_point, @@ -70,7 +70,7 @@ def __init__( stretch_gripper_max = 0.64 end_link = "link_gripper_s3_body" self.transform_node = end_link - self.manip_wrapper = Manipulation_Wrapper( + self.manip_wrapper = ManipulationWrapper( self.robot, stretch_gripper_max=stretch_gripper_max, end_link=end_link ) self.robot.move_to_nav_posture() diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index ae8b1d729..f23d6f88e 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -34,7 +34,7 @@ def compute_tilt(camera_xyz, target_xyz): @click.command() # by default you are running these codes on your workstation, not on your robot. -@click.option("--ip", default="127.0.0.1", type=str) +@click.option("--ip", default="", type=str, help="IP address for the MDP agent") @click.option("--manual-wait", default=False, is_flag=True) @click.option("--random-goals", default=False, is_flag=True) @click.option("--explore-iter", default=-1) @@ -72,6 +72,7 @@ def main( """ click.echo("Will connect to a Stretch robot and collect a short trajectory.") robot = RobotClient(robot_ip=robot_ip) + robot.move_to_nav_posture() print("- Load parameters") parameters = get_parameters("dynav_config.yaml") From f2f462d2246ff5af64c2584545df69a0b4ee1319 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 14:03:53 -0700 Subject: [PATCH 380/410] update data --- src/stretch_ros2_bridge/stretch_ros2_bridge/remote/api.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/api.py b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/api.py index 116814635..384afe98d 100644 --- a/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/api.py +++ b/src/stretch_ros2_bridge/stretch_ros2_bridge/remote/api.py @@ -118,6 +118,10 @@ def switch_to_navigation_mode(self): return result_pre and result_post + @property + def base_control_mode(self) -> ControlMode: + return self._base_control_mode + def switch_to_busy_mode(self) -> bool: """Switch to a mode that says we are occupied doing something blocking""" self._base_control_mode = ControlMode.BUSY From b3e7f8d8272378a7996b93c7a0ced0db1007d3d7 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 17:06:28 -0400 Subject: [PATCH 381/410] renaming some things and cleaning up the interfaces --- src/stretch/agent/zmq_client.py | 8 ++++---- src/stretch/dynav/robot_agent_manip_mdp.py | 12 ++++++------ src/stretch/dynav/run_ok_nav.py | 8 +++++--- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 1caaafc1d..4b3fc04ac 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -755,17 +755,17 @@ def gripper_to(self, target: float, blocking: bool = True): def switch_to_navigation_mode(self): """Velocity control of the robot base.""" next_action = {"control_mode": "navigation"} - self.send_action(next_action) - self._wait_for_mode("navigation") + action = self.send_action(next_action) + self._wait_for_mode("navigation", resend_action=action) assert self.in_navigation_mode() def switch_to_manipulation_mode(self, verbose: bool = False): next_action = {"control_mode": "manipulation"} - self.send_action(next_action) + action = self.send_action(next_action) time.sleep(0.1) if verbose: logger.info("Waiting for manipulation mode") - self._wait_for_mode("manipulation", verbose=verbose) + self._wait_for_mode("manipulation", resend_action=action, verbose=verbose) assert 
self.in_manipulation_mode() def move_to_nav_posture(self): diff --git a/src/stretch/dynav/robot_agent_manip_mdp.py b/src/stretch/dynav/robot_agent_manip_mdp.py index cf4778218..24333a47c 100644 --- a/src/stretch/dynav/robot_agent_manip_mdp.py +++ b/src/stretch/dynav/robot_agent_manip_mdp.py @@ -45,7 +45,7 @@ def __init__( self, robot: RobotClient, parameters: Dict[str, Any], - ip: str, + server_ip: str, image_port: int = 5558, text_port: int = 5556, manip_port: int = 5557, @@ -82,7 +82,7 @@ def __init__( self.guarantee_instance_is_reachable = parameters.guarantee_instance_is_reachable self.image_sender = ImageSender( - ip=ip, image_port=image_port, text_port=text_port, manip_port=manip_port + server_ip=server_ip, image_port=image_port, text_port=text_port, manip_port=manip_port ) if method == "dynamem": from stretch.dynav.voxel_map_server import ImageProcessor as VoxelMapImageProcessor @@ -347,7 +347,7 @@ class ImageSender: def __init__( self, stop_and_photo=False, - ip="100.108.67.79", + server_ip="100.108.67.79", image_port=5560, text_port=5561, manip_port=5557, @@ -359,11 +359,11 @@ def __init__( ): context = zmq.Context() self.img_socket = context.socket(zmq.REQ) - self.img_socket.connect("tcp://" + str(ip) + ":" + str(image_port)) + self.img_socket.connect("tcp://" + str(server_ip) + ":" + str(image_port)) self.text_socket = context.socket(zmq.REQ) - self.text_socket.connect("tcp://" + str(ip) + ":" + str(text_port)) + self.text_socket.connect("tcp://" + str(server_ip) + ":" + str(text_port)) self.manip_socket = context.socket(zmq.REQ) - self.manip_socket.connect("tcp://" + str(ip) + ":" + str(manip_port)) + self.manip_socket.connect("tcp://" + str(server_ip) + ":" + str(manip_port)) def query_text(self, text, start): self.text_socket.send_string(text) diff --git a/src/stretch/dynav/run_ok_nav.py b/src/stretch/dynav/run_ok_nav.py index f23d6f88e..2321bd7ab 100644 --- a/src/stretch/dynav/run_ok_nav.py +++ b/src/stretch/dynav/run_ok_nav.py @@ -34,7 +34,7 @@ def compute_tilt(camera_xyz, target_xyz): @click.command() # by default you are running these codes on your workstation, not on your robot. 
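ImageSender above talks to the workstation over three ZeroMQ request sockets, one per channel (image, text, manipulation), each connected to `tcp://<server_ip>:<port>`. A minimal sketch of that connection pattern, with placeholder address and port values:

```python
import zmq


def make_req_socket(server_ip: str, port: int) -> zmq.Socket:
    """Create a REQ socket connected to tcp://<server_ip>:<port>."""
    context = zmq.Context.instance()
    socket = context.socket(zmq.REQ)
    socket.connect(f"tcp://{server_ip}:{port}")
    return socket


# Usage, assuming a matching REP server is listening on the chosen port:
# text_socket = make_req_socket("127.0.0.1", 5556)
# text_socket.send_string("bring me a cup")
# reply = text_socket.recv_string()
```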
-@click.option("--ip", default="", type=str, help="IP address for the MDP agent") +@click.option("--server_ip", default="", type=str, help="IP address for the MDP agent") @click.option("--manual-wait", default=False, is_flag=True) @click.option("--random-goals", default=False, is_flag=True) @click.option("--explore-iter", default=-1) @@ -52,7 +52,7 @@ def compute_tilt(camera_xyz, target_xyz): help="Input path with default value 'output.npy'", ) def main( - ip, + server_ip, manual_wait, navigate_home: bool = False, explore_iter: int = 5, @@ -84,7 +84,9 @@ def main( robot.set_velocity(v=30.0, w=15.0) print("- Start robot agent with data collection") - demo = RobotAgentMDP(robot, parameters, ip=ip, re=re, env_num=env, test_num=test, method=method) + demo = RobotAgentMDP( + robot, parameters, server_ip=server_ip, re=re, env_num=env, test_num=test, method=method + ) if input_path is None: demo.rotate_in_place() From 629e0ff1d8cc4cbdfdd22003d24036e9c558e4a7 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 17:16:23 -0400 Subject: [PATCH 382/410] sending the action --- src/stretch/agent/zmq_client.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 4b3fc04ac..c6da1ee13 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -760,6 +760,11 @@ def switch_to_navigation_mode(self): assert self.in_navigation_mode() def switch_to_manipulation_mode(self, verbose: bool = False): + """Move the robot to manipulation mode. + + Args: + verbose: Whether to print out debug information + """ next_action = {"control_mode": "manipulation"} action = self.send_action(next_action) time.sleep(0.1) From 8f85e56eb67b0f932b689926489fbc8f974cb1ce Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Wed, 9 Oct 2024 17:16:58 -0400 Subject: [PATCH 383/410] updates --- src/stretch/agent/zmq_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index c6da1ee13..0216d0d9f 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -774,6 +774,7 @@ def switch_to_manipulation_mode(self, verbose: bool = False): assert self.in_manipulation_mode() def move_to_nav_posture(self): + """Move the robot to the navigation posture. 
This is where the head is looking forward and the arm is tucked in.""" next_action = {"posture": "navigation", "step": self._iter} self.send_action(next_action) self._wait_for_head(constants.STRETCH_NAVIGATION_Q, resend_action={"posture": "navigation"}) @@ -782,6 +783,7 @@ def move_to_nav_posture(self): assert self.in_navigation_mode() def move_to_manip_posture(self): + """This is the pregrasp posture where the head is looking down and right and the arm is tucked in.""" next_action = {"posture": "manipulation", "step": self._iter} self.send_action(next_action) time.sleep(0.1) From e0b3b3d47ff09fed2d6e8debbf66383dc335380e Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 10:04:44 -0400 Subject: [PATCH 384/410] update --- src/stretch/dynav/voxel_map_localizer.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/stretch/dynav/voxel_map_localizer.py b/src/stretch/dynav/voxel_map_localizer.py index 69bef8bf2..49b4a7bee 100644 --- a/src/stretch/dynav/voxel_map_localizer.py +++ b/src/stretch/dynav/voxel_map_localizer.py @@ -20,6 +20,10 @@ from transformers import AutoModel, AutoProcessor, Owlv2ForObjectDetection from stretch.dynav.mapping_utils.voxelized_pcd import VoxelizedPointcloud +from stretch.utils.logger import Logger + +# Create a logger +logger = Logger(__name__) def get_inv_intrinsics(intrinsics): @@ -85,6 +89,8 @@ def get_xyz(depth, pose, intrinsics): class VoxelMapLocalizer: + """This localizes a query in the voxel map.""" + def __init__( self, voxel_map_wrapper=None, @@ -94,7 +100,7 @@ def __init__( device="cuda", siglip=True, ): - print("Localizer V3") + logger.info("Localizer V3") self.voxel_map_wrapper = voxel_map_wrapper self.device = device # self.clip_model, self.preprocessor = clip.load(model_config, device=device) @@ -114,7 +120,7 @@ def __init__( # self.exist_model = YOLOWorld("yolov8l-worldv2.pt") self.existence_checking_model = exist_model if exist_model: - print("WE ARE USING OWLV2!") + logger.info("WE ARE USING OWLV2!") self.exist_processor = AutoProcessor.from_pretrained( "google/owlv2-base-patch16-ensemble" ) @@ -122,7 +128,7 @@ def __init__( "google/owlv2-base-patch16-ensemble" ).to(self.device) else: - print("YOU ARE USING NOTHING!") + logger.info("YOU ARE USING NOTHING!") def add( self, @@ -131,8 +137,16 @@ def add( rgb: Optional[Tensor], weights: Optional[Tensor] = None, obs_count: Optional[Tensor] = None, - # weight_decay: float = 0.95 ): + """Adds a pointcloud to the voxel map. 
+ + Args: + points: The points tensor, with shape (N, 3) + features: The features tensor, with shape (N, C) + rgb: The RGB tensor, with shape (N, 3) + weights: The weights tensor, with shape (N) + obs_count: The observation count tensor, with shape (N) + """ points = points.to(self.device) if features is not None: features = features.to(self.device) From c361078673d71dafc1b7911ca7bccf3575905c91 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 10:07:41 -0400 Subject: [PATCH 385/410] fix names --- src/stretch/agent/robot_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 227555f98..3283674d3 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1982,4 +1982,4 @@ def save_map(self, filename: Optional[str] = None) -> None: memory.backup_saved_map() # Write the new map to the file - self.voxel_map.save_to_pickle(filename) + self.voxel_map.write_to_pickle(filename) From b80e1f84790f8651e61f1fff6bedcfccf228ea7b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 10:23:40 -0400 Subject: [PATCH 386/410] add a rotation behavior if we are exploring --- src/stretch/agent/robot_agent.py | 8 ++++++++ src/stretch/config/default_planner.yaml | 6 +++--- src/stretch/utils/dummy_stretch_client.py | 12 ++++++++++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 3283674d3..aae8a3acd 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1496,6 +1496,7 @@ def run_exploration( # Explore some number of times matches = [] no_success_explore = True + rotated = False for i in range(explore_iter): print("\n" * 2) print("-" * 20, i + 1, "/", explore_iter, "-" * 20) @@ -1517,6 +1518,7 @@ def run_exploration( # if it succeeds, execute a trajectory to this position if res.success: + rotated = False no_success_explore = False print("Plan successful!") for i, pt in enumerate(res.trajectory): @@ -1540,6 +1542,12 @@ def run_exploration( rot_err_threshold=self.rot_err_threshold, ) else: + # Try rotating in place if we can't find a decent frontier to get to + if not rotated: + self.rotate_in_place() + rotate = True + continue + # breakpoint() # if it fails, try again or just quit if self._retry_on_fail: diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 4363747e0..22f1b93ea 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -119,9 +119,9 @@ motion_planner: shortcut_iter: 100 # Parameters for frontier exploration using the motion planner. 
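The new `add()` docstring fixes the expected tensor shapes. A small synthetic check of that contract, assuming PyTorch tensors and made-up sizes:

```python
import torch

# Shapes documented for add(): points (N, 3), features (N, C), rgb (N, 3),
# weights (N,), obs_count (N,). All values below are synthetic.
N, C = 128, 512
points = torch.rand(N, 3)
features = torch.rand(N, C)
rgb = torch.randint(0, 256, (N, 3), dtype=torch.uint8)
weights = torch.ones(N)
obs_count = torch.zeros(N, dtype=torch.long)

for name, tensor, expected in [
    ("points", points, (N, 3)),
    ("rgb", rgb, (N, 3)),
    ("features", features, (N, C)),
    ("weights", weights, (N,)),
    ("obs_count", obs_count, (N,)),
]:
    assert tuple(tensor.shape) == expected, f"{name} has shape {tuple(tensor.shape)}"
```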
frontier: - dilate_frontier_size: 5 # Used to shrink the frontier back from the edges of the world - dilate_obstacle_size: 6 # Used when selecting goals and computing what the "frontier" is - default_expand_frontier_size: 10 # margin along the frontier where final robot position can be + dilate_frontier_size: 2 # Used to shrink the frontier back from the edges of the world + dilate_obstacle_size: 4 # Used when selecting goals and computing what the "frontier" is + default_expand_frontier_size: 12 # margin along the frontier where final robot position can be # Distance away you search for frontier points min_dist: 0.1 # Subsampling frontier space at this discretization diff --git a/src/stretch/utils/dummy_stretch_client.py b/src/stretch/utils/dummy_stretch_client.py index 7111b71b9..0a369c7e9 100644 --- a/src/stretch/utils/dummy_stretch_client.py +++ b/src/stretch/utils/dummy_stretch_client.py @@ -19,6 +19,7 @@ from stretch.motion import Footprint, RobotModel # from stretch.motion.kinematics import HelloStretchKinematics +from stretch.utils.geometry import xyt_base_to_global class DummyStretchClient(AbstractRobotClient, RobotModel): @@ -55,10 +56,17 @@ def __init__( """ self._robot_model = self self.dof = 3 + 2 + 4 + 2 + self.xyt = np.zeros(3) - def navigate_to(self, xyt, relative=False, blocking=False): + def navigate_to( + self, xyt, relative: bool = False, blocking: bool = False, verbose: bool = False + ): """Move to xyt in global coordinates or relative coordinates.""" - raise NotImplementedError() + if relative: + xyt_goal = xyt_base_to_global(xyt, self.xyt) + else: + xyt_goal = xyt + self.xyt = xyt_goal def reset(self): """Reset everything in the robot's internal state""" From fbcf47216c105797ea5759c126f1c18fe7d2a04b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 10:28:01 -0400 Subject: [PATCH 387/410] some more exploration tuning --- src/stretch/agent/robot_agent.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index aae8a3acd..aaf1013a7 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1384,6 +1384,7 @@ def plan_to_frontier( try_to_plan_iter: int = 10, fix_random_seed: bool = False, verbose: bool = True, + push_locations_to_stack: bool = False, ) -> PlanResult: """Motion plan to a frontier location. This is a location that is on the edge of the explored space. We use the voxel grid map created by our collector to sample free space, and then use our motion planner (RRT for now) to get there. At the end, we plan back to (0,0,0). @@ -1394,6 +1395,7 @@ def plan_to_frontier( try_to_plan_iter(int): the number of tries to find a plan fix_random_seed(bool): whether to fix the random seed verbose(bool): extra info is printed + push_locations_to_stack(bool): whether to push visited locations to planning stack. can help get you unstuck, maybe? 
Returns: PlanResult: the result of the motion planner @@ -1449,8 +1451,9 @@ def plan_to_frontier( np.random.seed(0) random.seed(0) - # print("Pushing locations to stack...") - self.planner.space.push_locations_to_stack(self.get_history(reversed=True)) + if push_locations_to_stack: + print("Pushing visited locations to stack...") + self.planner.space.push_locations_to_stack(self.get_history(reversed=True)) # print("Planning to goal...") res = self.planner.plan(start, goal.cpu().numpy()) if res.success: From 406546553e1fe6e37a39cc88d778c58edc0870a7 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 10:32:09 -0400 Subject: [PATCH 388/410] updates to motion planning config --- src/stretch/agent/robot_agent.py | 8 +++++++- src/stretch/config/default_planner.yaml | 5 +++++ src/stretch/motion/algo/simplify.py | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index aaf1013a7..178c56740 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -151,7 +151,13 @@ def __init__( if parameters["motion_planner"]["shortcut_plans"]: self.planner = Shortcut(self.planner, parameters["motion_planner"]["shortcut_iter"]) if parameters["motion_planner"]["simplify_plans"]: - self.planner = SimplifyXYT(self.planner, min_step=0.05, max_step=1.0, num_steps=8) + self.planner = SimplifyXYT( + self.planner, + min_step=parameters["motion_planner"]["simplify"]["min_step"], + max_step=parameters["motion_planner"]["simplify"]["max_step"], + num_steps=parameters["motion_planner"]["simplify"]["num_steps"], + min_angle=parameters["motion_planner"]["simplify"]["min_angle"], + ) if self._realtime_updates: # Locks diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 22f1b93ea..affde396a 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -116,6 +116,11 @@ motion_planner: rotation_step_size: 0.1 simplify_plans: True shortcut_plans: True + simplify: + max_step: 0.5 + min_step: 0.05 + num_steps: 8 + min_angle: 0.1 shortcut_iter: 100 # Parameters for frontier exploration using the motion planner. frontier: diff --git a/src/stretch/motion/algo/simplify.py b/src/stretch/motion/algo/simplify.py index 9eee76fa6..5561a8d85 100644 --- a/src/stretch/motion/algo/simplify.py +++ b/src/stretch/motion/algo/simplify.py @@ -36,7 +36,7 @@ def __init__( self, planner: Planner, min_step: float = 0.1, - max_step: float = 1.0, + max_step: float = 0.5, num_steps: int = 6, min_angle: float = np.deg2rad(5), ): From 0fc1f0b4d4f401132664ea8c5c9e67314ec75c55 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 10:38:15 -0400 Subject: [PATCH 389/410] trying to improve recovery code --- src/stretch/agent/robot_agent.py | 45 ++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 178c56740..d258ae4c0 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -998,25 +998,29 @@ def recover_from_invalid_start(self, verbose: bool = True) -> bool: # Get current invalid pose start = self.robot.get_base_pose() - # Apply relative transformation to XYT - forward = np.array([-0.05, 0, 0]) - backward = np.array([0.05, 0, 0]) - - xyt_goal_forward = xyt_base_to_global(forward, start) - xyt_goal_backward = xyt_base_to_global(backward, start) - - # Is this valid? 
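`recover_from_invalid_start` turns a small offset expressed in the robot's base frame into a global goal via `xyt_base_to_global`. A self-contained sketch of that SE(2) composition, written as an illustrative equivalent rather than the library function itself:

```python
import numpy as np


def xyt_base_to_global_sketch(xyt_rel: np.ndarray, xyt_base: np.ndarray) -> np.ndarray:
    """Compose a base-frame offset [dx, dy, dtheta] with a global pose [x, y, theta]."""
    x, y, theta = xyt_base
    dx, dy, dtheta = xyt_rel
    return np.array(
        [
            x + dx * np.cos(theta) - dy * np.sin(theta),
            y + dx * np.sin(theta) + dy * np.cos(theta),
            theta + dtheta,
        ]
    )


start = np.array([1.0, 2.0, np.pi / 2])   # current (possibly invalid) base pose
step = np.array([0.05, 0.0, 0.0])         # 5 cm offset along the base x-axis, as in the recovery loop
print(xyt_base_to_global_sketch(step, start))  # approximately [1.0, 2.05, 1.5708]
```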
- if self.space.is_valid(xyt_goal_backward, verbose=True): - logger.warning("Trying to move backwards...") - # Compute the position forward or backward from the robot - self.robot.navigate_to(xyt_goal_backward, relative=False) - elif self.space.is_valid(xyt_goal_forward, verbose=True): - logger.warning("Trying to move forward...") - # Compute the position forward or backward from the robot - self.robot.navigate_to(xyt_goal_forward, relative=False) - else: - logger.warning("Could not recover from invalid start state!") - return False + steps = [0.05, 0.1] + + for step in steps: + # Apply relative transformation to XYT + forward = np.array([-1 * step, 0, 0]) + backward = np.array([step, 0, 0]) + + xyt_goal_forward = xyt_base_to_global(forward, start) + xyt_goal_backward = xyt_base_to_global(backward, start) + + # Is this valid? + if self.space.is_valid(xyt_goal_backward, verbose=True): + logger.warning("Trying to move backwards...") + # Compute the position forward or backward from the robot + self.robot.navigate_to(xyt_goal_backward, relative=False) + break + elif self.space.is_valid(xyt_goal_forward, verbose=True): + logger.warning("Trying to move forward...") + # Compute the position forward or backward from the robot + self.robot.navigate_to(xyt_goal_forward, relative=False) + break + else: + logger.warning(f"Could not recover from invalid start state with step of {step}!") # Get the current position in case we are still invalid start = self.robot.get_base_pose() @@ -1410,8 +1414,9 @@ def plan_to_frontier( # if start is not valid move backwards a bit if not start_is_valid: if verbose: - print("Start not valid. back up a bit.") + print("Start not valid. Try to recover.") ok = self.recover_from_invalid_start() + breakpoint() if not ok: return PlanResult(False, reason="invalid start state") else: From 8dd0e82bdb987e9e0966cf9b62bb0ffb7f57b93f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 10:47:45 -0400 Subject: [PATCH 390/410] update graspign --- src/stretch/agent/robot_agent.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index d258ae4c0..98eab39c2 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -1009,6 +1009,7 @@ def recover_from_invalid_start(self, verbose: bool = True) -> bool: xyt_goal_backward = xyt_base_to_global(backward, start) # Is this valid? + logger.warning(f"Trying to recover with step of {step}...") if self.space.is_valid(xyt_goal_backward, verbose=True): logger.warning("Trying to move backwards...") # Compute the position forward or backward from the robot @@ -1416,7 +1417,6 @@ def plan_to_frontier( if verbose: print("Start not valid. Try to recover.") ok = self.recover_from_invalid_start() - breakpoint() if not ok: return PlanResult(False, reason="invalid start state") else: @@ -1519,9 +1519,13 @@ def run_exploration( # if start is not valid move backwards a bit if not start_is_valid: print("Start not valid. 
back up a bit.") - print(f"robot base pose: {self.robot.get_base_pose()}") - self.robot.navigate_to([-0.1, 0, 0], relative=True) - continue + ok = self.recover_from_invalid_start() + if ok: + start = self.robot.get_base_pose() + start_is_valid = self.space.is_valid(start, verbose=True) + if not start_is_valid: + print("Failed to recover from invalid start state!") + break # Now actually plan to the frontier print(" Start:", start) From 03bdb936c3d9483fa1e4ae9e37c72fe67e7b0ac7 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 10:50:35 -0400 Subject: [PATCH 391/410] updates to the robot agent --- src/stretch/agent/robot_agent.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 98eab39c2..8c0e47e9e 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -410,7 +410,10 @@ def rotate_in_place( logger.info("Rotate in place") if steps is None or steps <= 0: # Read the number of steps from the parameters - steps = self.parameters["agent"]["in_place_rotation_steps"] + if self.parameters["agent"]["use_realtime_updates"]: + steps = self.parameters["agent"]["realtime_rotation_steps"] + else: + steps = self.parameters["agent"]["in_place_rotation_steps"] step_size = 2 * np.pi / steps i = 0 From 4a9a9430e7aa79963834bceecf4874700169617d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 10:55:13 -0400 Subject: [PATCH 392/410] update agent code --- src/stretch/agent/robot_agent.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 8c0e47e9e..f3db95cc3 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -64,7 +64,7 @@ def __init__( voxel_map: Optional[SparseVoxelMap] = None, show_instances_detected: bool = False, use_instance_memory: bool = False, - realtime_updates: bool = False, + enable_realtime_updates: bool = True, obs_sub_port: int = 4450, ): self.reset_object_plans() @@ -92,7 +92,16 @@ def __init__( self.use_scene_graph = self.parameters["use_scene_graph"] self.tts = get_text_to_speech(self.parameters["tts_engine"]) self._use_instance_memory = use_instance_memory - self._realtime_updates = realtime_updates + self._realtime_updates = ( + enable_realtime_updates and self.parameters["agent"]["use_realtime_updates"] + ) + if self._realtime_updates: + logger.alert("Using real-time updates") + if not enable_realtime_updates and self.parameters["agent"]["use_realtime_updates"]: + logger.warning( + "Real-time updates are not enabled but the agent is configured to use them." 
+ ) + logger.warning("We will not be able to update the map in real-time.") # ============================================== # Update configuration From 55e171712b132a127de3a4d601a2926da5986d9f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 10:56:34 -0400 Subject: [PATCH 393/410] update agent --- src/stretch/agent/robot_agent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index f3db95cc3..04ea11a34 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -95,8 +95,6 @@ def __init__( self._realtime_updates = ( enable_realtime_updates and self.parameters["agent"]["use_realtime_updates"] ) - if self._realtime_updates: - logger.alert("Using real-time updates") if not enable_realtime_updates and self.parameters["agent"]["use_realtime_updates"]: logger.warning( "Real-time updates are not enabled but the agent is configured to use them." @@ -169,6 +167,8 @@ def __init__( ) if self._realtime_updates: + logger.alert("Using real-time updates!") + # Locks self._robot_lock = Lock() self._obs_history_lock = Lock() @@ -419,7 +419,7 @@ def rotate_in_place( logger.info("Rotate in place") if steps is None or steps <= 0: # Read the number of steps from the parameters - if self.parameters["agent"]["use_realtime_updates"]: + if self._realtime_updates: steps = self.parameters["agent"]["realtime_rotation_steps"] else: steps = self.parameters["agent"]["in_place_rotation_steps"] From 360c297c872ce0ca59fddc8aef79504105209b1f Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 11:01:55 -0400 Subject: [PATCH 394/410] disable realtime + make sure it works with realtime maps --- src/stretch/agent/robot_agent.py | 1 + src/stretch/app/read_map.py | 1 + src/stretch/config/default_planner.yaml | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 04ea11a34..9214e92e3 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -421,6 +421,7 @@ def rotate_in_place( # Read the number of steps from the parameters if self._realtime_updates: steps = self.parameters["agent"]["realtime_rotation_steps"] + logger.info(f"Using real-time rotation steps: {steps}") else: steps = self.parameters["agent"]["in_place_rotation_steps"] diff --git a/src/stretch/app/read_map.py b/src/stretch/app/read_map.py index e9e4d3544..5179aedd8 100644 --- a/src/stretch/app/read_map.py +++ b/src/stretch/app/read_map.py @@ -175,6 +175,7 @@ def main( print("- Load parameters") parameters = get_parameters(config_path) + parameters["agent"]["use_realtime_updates"] = False if run_segmentation: print("- Preparing perception pipeline") diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index affde396a..d77114e07 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -74,7 +74,7 @@ motion: # Exploration agent: - use_realtime_updates: True + use_realtime_updates: False realtime_update_rotation_steps: 4 in_place_rotation_steps: 8 # If you are not moving the head, rotate more often sweep_head_on_update: False From 34bdc601c1274b6ce143342664774b9185ab1488 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 12:34:42 -0400 Subject: [PATCH 395/410] realtime updates --- src/stretch/agent/robot_agent.py | 2 +- src/stretch/app/mapping.py | 11 ++++++++--- src/stretch/config/default_planner.yaml | 
2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 9214e92e3..54c1ad54f 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -64,7 +64,7 @@ def __init__( voxel_map: Optional[SparseVoxelMap] = None, show_instances_detected: bool = False, use_instance_memory: bool = False, - enable_realtime_updates: bool = True, + enable_realtime_updates: bool = False, obs_sub_port: int = 4450, ): self.reset_object_plans() diff --git a/src/stretch/app/mapping.py b/src/stretch/app/mapping.py index 4bf7579c4..1cdaf7fa3 100644 --- a/src/stretch/app/mapping.py +++ b/src/stretch/app/mapping.py @@ -50,6 +50,7 @@ ) @click.option("--parameter-file", default="default_planner.yaml") @click.option("--reset", is_flag=True, help="Reset the robot to origin before starting") +@click.option("--enable-realtime-updates", is_flag=True, help="Enable real-time updates") def main( visualize, manual_wait, @@ -69,6 +70,7 @@ def main( local: bool = True, robot_ip: str = "192.168.1.15", reset: bool = False, + enable_realtime_updates: bool = False, **kwargs, ): @@ -80,6 +82,7 @@ def main( use_remote_computer=(not local), parameters=parameters, enable_rerun_server=True, + publish_observations=enable_realtime_updates, ) # Call demo_main with all the arguments demo_main( @@ -100,6 +103,7 @@ def main( explore_iter=explore_iter, write_instance_images=write_instance_images, parameter_file=parameter_file, + enable_realtime_updates=enable_realtime_updates, reset=reset, **kwargs, ) @@ -124,6 +128,7 @@ def demo_main( parameters: Optional[Parameters] = None, parameter_file: str = "config/default.yaml", reset: bool = False, + enable_realtime_updates: bool = False, **kwargs, ): """ @@ -163,9 +168,9 @@ def demo_main( semantic_sensor = None print("- Start robot agent with data collection") - grasp_client = None # GraspPlanner(robot, env=None, semantic_sensor=semantic_sensor) - - demo = RobotAgent(robot, parameters, semantic_sensor, grasp_client=grasp_client) + demo = RobotAgent( + robot, parameters, semantic_sensor, enable_realtime_updates=enable_realtime_updates + ) demo.start(goal=object_to_find, visualize_map_at_start=show_intermediate_maps) if reset: demo.move_closed_loop([0, 0, 0], max_time=60.0) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index d77114e07..affde396a 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -74,7 +74,7 @@ motion: # Exploration agent: - use_realtime_updates: False + use_realtime_updates: True realtime_update_rotation_steps: 4 in_place_rotation_steps: 8 # If you are not moving the head, rotate more often sweep_head_on_update: False From fd50eff93f64cc8d9c51ee33c2f94484705876ce Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 12:48:13 -0400 Subject: [PATCH 396/410] updates to files and demos and apps --- docs/apps.md | 10 ++++++ src/stretch/agent/robot_agent.py | 12 +++++-- src/stretch/app/mapping.py | 7 +++- src/stretch/app/vlm_planning.py | 55 +++++++++++++++++++++++++++----- 4 files changed, 72 insertions(+), 12 deletions(-) diff --git a/docs/apps.md b/docs/apps.md index 15c1d51ad..25f19e0ac 100644 --- a/docs/apps.md +++ b/docs/apps.md @@ -226,3 +226,13 @@ You can add the `--reset` flag to make it go back to the start position. 
The def ``` python -m stretch.app.pickup --reset ``` + +## Experimental + +### VLM Planning + +This is an experimental app that uses the voxel map to plan a path to a goal. It is not yet fully functional. + +```bash +python -m stretch.app.vlm_planning +``` diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 54c1ad54f..cb9e240c7 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -28,7 +28,7 @@ from stretch.audio.text_to_speech import get_text_to_speech from stretch.core.interfaces import Observations from stretch.core.parameters import Parameters -from stretch.core.robot import AbstractGraspClient, AbstractRobotClient +from stretch.core.robot import AbstractRobotClient from stretch.mapping.instance import Instance from stretch.mapping.scene_graph import SceneGraph from stretch.mapping.voxel import SparseVoxelMap, SparseVoxelMapNavigationSpace @@ -60,7 +60,6 @@ def __init__( robot: AbstractRobotClient, parameters: Union[Parameters, Dict[str, Any]], semantic_sensor: Optional[OvmmPerception] = None, - grasp_client: Optional[AbstractGraspClient] = None, voxel_map: Optional[SparseVoxelMap] = None, show_instances_detected: bool = False, use_instance_memory: bool = False, @@ -75,7 +74,6 @@ def __init__( else: raise RuntimeError(f"parameters of unsupported type: {type(parameters)}") self.robot = robot - self.grasp_client = grasp_client self.show_instances_detected = show_instances_detected self.semantic_sensor = semantic_sensor @@ -189,6 +187,14 @@ def __init__( self._update_map_thread = None self._get_observations_thread = None + def get_robot(self) -> AbstractRobotClient: + """Return the robot in use by this model""" + return self.robot + + def get_voxel_map(self) -> SparseVoxelMap: + """Return the voxel map in use by this model""" + return self.voxel_map + def __del__(self): if self._update_map_thread is not None and self._update_map_thread.is_alive(): self._update_map_thread.join() diff --git a/src/stretch/app/mapping.py b/src/stretch/app/mapping.py index 1cdaf7fa3..f1ce66aaa 100644 --- a/src/stretch/app/mapping.py +++ b/src/stretch/app/mapping.py @@ -51,6 +51,7 @@ @click.option("--parameter-file", default="default_planner.yaml") @click.option("--reset", is_flag=True, help="Reset the robot to origin before starting") @click.option("--enable-realtime-updates", is_flag=True, help="Enable real-time updates") +@click.option("--save", is_flag=True, help="Save the map to memory") def main( visualize, manual_wait, @@ -71,6 +72,7 @@ def main( robot_ip: str = "192.168.1.15", reset: bool = False, enable_realtime_updates: bool = False, + save: bool = True, **kwargs, ): @@ -105,6 +107,7 @@ def main( parameter_file=parameter_file, enable_realtime_updates=enable_realtime_updates, reset=reset, + save=save, **kwargs, ) @@ -129,6 +132,7 @@ def demo_main( parameter_file: str = "config/default.yaml", reset: bool = False, enable_realtime_updates: bool = False, + save: bool = True, **kwargs, ): """ @@ -223,7 +227,8 @@ def demo_main( print(f"Write pkl to {output_pkl_filename}...") demo.voxel_map.write_to_pickle(output_pkl_filename) - demo.save_map() + if save: + demo.save_map() if write_instance_images: demo.save_instance_images(".") diff --git a/src/stretch/app/vlm_planning.py b/src/stretch/app/vlm_planning.py index 3febf76b3..251849729 100644 --- a/src/stretch/app/vlm_planning.py +++ b/src/stretch/app/vlm_planning.py @@ -26,6 +26,7 @@ from stretch.agent import RobotAgent from stretch.agent.vlm_planner import VLMPlanner +from 
stretch.agent.zmq_client import HomeRobotZMQClient from stretch.core import get_parameters from stretch.core.interfaces import Observations from stretch.perception import create_semantic_sensor @@ -103,9 +104,11 @@ def images_to_video(image_list, output_path, fps=30): "--input-path", "-i", type=click.Path(), - default="output.pkl", - help="Input path with default value 'output.npy'", + default="", + help="Input path. If empty, run on the real robot.", ) +@click.option("--local", is_flag=True, help="Run code locally on the robot.") +@click.option("--robot_ip", default="") @click.option("--task", "-t", type=str, default="", help="Task to run with the planner.") @click.option( "--config-path", @@ -142,6 +145,8 @@ def main( show_instances: bool = False, api_key: str = None, task: str = "", + local: bool = False, + robot_ip: str = "", ): """Simple script to load a voxel map""" input_path = Path(input_path) @@ -149,8 +154,6 @@ def main( loaded_voxel_map = None - dummy_robot = DummyStretchClient() - print("- Load parameters") vlm_parameters = get_parameters(config_path) if not vlm_parameters.get("vlm_base_config"): @@ -169,15 +172,26 @@ def main( print("Creating semantic sensors...") semantic_sensor = create_semantic_sensor(parameters=vlm_parameters) + if len(input_path) > 0: + robot = DummyStretchClient() + else: + robot = HomeRobotZMQClient(robot_ip=robot_ip, local=local) + print("Creating robot agent...") agent = RobotAgent( - dummy_robot, + robot, vlm_parameters, voxel_map=loaded_voxel_map, semantic_sensor=semantic_sensor, ) voxel_map = agent.voxel_map - voxel_map.read_from_pickle(input_path, num_frames=frame, perception=semantic_sensor) + + if len(input_path) > 0: + # load from pickle + voxel_map.read_from_pickle(input_path, num_frames=frame, perception=semantic_sensor) + else: + # Scan the local area to get a map + agent.rotate_in_place() # get the task task = agent.get_command() if not task else task @@ -191,6 +205,27 @@ def main( # num_frames=frame, # frame_skip=frame_skip, # ) + run_vlm_planner(agent, task, show_svm, test_vlm, api_key, show_instances) + + +def run_vlm_planner( + agent, + task, + show_svm: bool = False, + test_vlm: bool = False, + api_key: str = None, + show_instances: bool = False, +): + """ + Run the VLM planner with the given agent and task. + + Args: + agent (RobotAgent): the robot agent to use. + task (str): the task to run. + show_svm (bool): whether to show the SVM. + test_vlm (bool): whether to test the VLM planner. + api_key (str): the OpenAI API key. 
+ """ # TODO: read this from file x0 = np.array([0, 0, 0]) @@ -200,6 +235,10 @@ def main( start_xyz = [x0[0], x0[1], 0] print("Agent loaded:", agent) + vlm_parameters = agent.get_parameters() + semantic_sensor = agent.get_semantic_sensor() + robot = agent.get_robot() + voxel_map = agent.get_voxel_map() # Create the VLM planner using the agent vlm_planner = VLMPlanner(agent, api_key=api_key) @@ -207,7 +246,7 @@ def main( # Display with agent overlay space = agent.get_navigation_space() if show_svm: - footprint = dummy_robot.get_footprint() + footprint = robot.get_footprint() print(f"{x0} valid = {space.is_valid(x0)}") voxel_map.show(instances=show_instances, orig=start_xyz, xyt=x0, footprint=footprint) @@ -275,7 +314,7 @@ def main( # create a new agent for planning with the updated map planning_agent = RobotAgent( - dummy_robot, + robot, vlm_parameters, voxel_map=new_map, semantic_sensor=semantic_sensor, From 42307653d47636945fbde868cc551af4093ab2cc Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 13:34:56 -0400 Subject: [PATCH 397/410] realtime updates parameters --- src/stretch/agent/robot_agent.py | 7 +++++-- src/stretch/app/mapping.py | 7 ++++++- src/stretch/config/default_planner.yaml | 6 ++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index cb9e240c7..6c2cd4b8e 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -126,6 +126,8 @@ def __init__( self._frontier_step_dist = parameters["motion_planner"]["frontier"]["step_dist"] self._manipulation_radius = parameters["motion_planner"]["goals"]["manipulation_radius"] self._voxel_size = parameters["voxel_size"] + self._realtime_matching_distance = parameters["agent"]["realtime"]["matching_distance"] + self._realtime_temporal_threshold = parameters["agent"]["realtime"]["temporal_threshold"] if voxel_map is not None: self.voxel_map = voxel_map @@ -542,7 +544,7 @@ def update_map_with_pose_graph(self): gps_past = self.obs_history[idx].gps for vertex in self.pose_graph: - if abs(vertex[0] - lidar_timestamp) < 0.05: + if abs(vertex[0] - lidar_timestamp) < self._realtime_temporal_threshold: # print(f"Exact match found! {vertex[0]} and obs {idx}: {lidar_timestamp}") self.obs_history[idx].is_pose_graph_node = True @@ -564,7 +566,8 @@ def update_map_with_pose_graph(self): self.obs_history[idx] = self.semantic_sensor.predict(self.obs_history[idx]) # check if the gps is close to the gps of the pose graph node elif ( - np.linalg.norm(gps_past - np.array([vertex[1], vertex[2]])) < 0.3 + np.linalg.norm(gps_past - np.array([vertex[1], vertex[2]])) + < self._realtime_matching_distance and self.obs_history[idx].pose_graph_timestamp is None ): # print(f"Close match found! 
{vertex[0]} and obs {idx}: {lidar_timestamp}") diff --git a/src/stretch/app/mapping.py b/src/stretch/app/mapping.py index f1ce66aaa..74440704b 100644 --- a/src/stretch/app/mapping.py +++ b/src/stretch/app/mapping.py @@ -50,7 +50,12 @@ ) @click.option("--parameter-file", default="default_planner.yaml") @click.option("--reset", is_flag=True, help="Reset the robot to origin before starting") -@click.option("--enable-realtime-updates", is_flag=True, help="Enable real-time updates") +@click.option( + "--enable-realtime-updates", + "--enable-realtime-updates", + is_flag=True, + help="Enable real-time updates so the robot will scan its environment and update the map as it moves around", +) @click.option("--save", is_flag=True, help="Save the map to memory") def main( visualize, diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index affde396a..386d1ed23 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -74,6 +74,12 @@ motion: # Exploration agent: + realtime: + # This is the distance to pose graph nodes + matching_distance: 0.5 + # This was 0.05 in Atharva's exerimetns + # It is how close lidar spins have to be to be considered the same + temporal_threshold: 0.1 use_realtime_updates: True realtime_update_rotation_steps: 4 in_place_rotation_steps: 8 # If you are not moving the head, rotate more often From 2ce45a0e260849f9f42f0d402c61c5d141b07f31 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 13:48:25 -0400 Subject: [PATCH 398/410] debug without AI --- src/stretch/app/ai_pickup.py | 6 ++---- src/stretch/app/goto_and_execute_skill.py | 3 +-- src/stretch/app/pickup.py | 5 +---- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index ca355d50f..46c81c616 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -132,11 +132,8 @@ def main( verbose=verbose, ) - # Start moving the robot around - grasp_client = None - # Agents wrap the robot high level planning interface for now - agent = RobotAgent(robot, parameters, semantic_sensor, grasp_client=grasp_client) + agent = RobotAgent(robot, parameters, semantic_sensor) agent.start(visualize_map_at_start=show_intermediate_maps) if reset: agent.move_closed_loop([0, 0, 0], max_time=60.0) @@ -174,6 +171,7 @@ def main( target_object = input("Enter the target object: ") if len(receptacle) == 0: receptacle = input("Enter the target receptacle: ") + llm_response = [("pickup", target_object), ("place", receptacle)] else: # Call the LLM client and parse llm_response = chat_wrapper.query() diff --git a/src/stretch/app/goto_and_execute_skill.py b/src/stretch/app/goto_and_execute_skill.py index 9b57b54a5..c2dbcc151 100644 --- a/src/stretch/app/goto_and_execute_skill.py +++ b/src/stretch/app/goto_and_execute_skill.py @@ -204,7 +204,6 @@ def demo_main( semantic_sensor = None print("- Start robot agent") - grasp_client = None # GraspPlanner(robot, env=None, semantic_sensor=semantic_sensor) pos_err_threshold = parameters["trajectory_pos_err_threshold"] rot_err_threshold = parameters["trajectory_rot_err_threshold"] @@ -215,7 +214,7 @@ def demo_main( input_path = Path(input_path) print("Loading:", input_path) - demo = RobotAgent(robot, parameters, semantic_sensor, grasp_client=grasp_client, voxel_map=None) + demo = RobotAgent(robot, parameters, semantic_sensor, voxel_map=None) voxel_map = demo.voxel_map print("Reading from pkl file of raw observations...") 
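The pose-graph update matches each buffered observation to a vertex either exactly, when the timestamps differ by less than the temporal threshold, or loosely, when its GPS position falls within the matching distance. A standalone sketch of that rule, treating vertices as `(timestamp, x, y)` tuples as they are indexed above and using the default values from `default_planner.yaml`:

```python
import numpy as np

TEMPORAL_THRESHOLD = 0.1  # seconds; "temporal_threshold" in the config
MATCHING_DISTANCE = 0.5   # meters; "matching_distance" in the config


def match_observation(obs_timestamp: float, obs_gps: np.ndarray, vertex) -> str:
    """Return 'exact', 'close', or 'none' for one observation/vertex pair."""
    v_time, v_x, v_y = vertex[0], vertex[1], vertex[2]
    if abs(v_time - obs_timestamp) < TEMPORAL_THRESHOLD:
        return "exact"
    if np.linalg.norm(obs_gps - np.array([v_x, v_y])) < MATCHING_DISTANCE:
        return "close"
    return "none"


print(match_observation(10.02, np.array([0.0, 0.0]), (10.0, 3.0, 3.0)))  # exact
print(match_observation(12.00, np.array([1.0, 1.0]), (10.0, 1.2, 1.1)))  # close
```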
voxel_map.read_from_pickle(input_path, num_frames=-1) diff --git a/src/stretch/app/pickup.py b/src/stretch/app/pickup.py index 369c12e29..cc89ac569 100644 --- a/src/stretch/app/pickup.py +++ b/src/stretch/app/pickup.py @@ -89,11 +89,8 @@ def main( verbose=verbose, ) - # Start moving the robot around - grasp_client = None - # Agents wrap the robot high level planning interface for now - agent = RobotAgent(robot, parameters, semantic_sensor, grasp_client=grasp_client) + agent = RobotAgent(robot, parameters, semantic_sensor) agent.start(visualize_map_at_start=show_intermediate_maps) if reset: agent.move_closed_loop([0, 0, 0], max_time=60.0) From f26f35a1bdc83aa26212aef8734748af63fd1eb6 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 13:52:09 -0400 Subject: [PATCH 399/410] update --- src/stretch/agent/task/pickup/pickup_executor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index 38baff510..5d96da842 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -320,15 +320,14 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: logger.info(f"{i} {command} {args}") if command in self.available_actions: - command_with_args = "self." + command + "(" + args + ")" - if command == "pickup": if (i + 1 < len(response)) and (response[i + 1][0] == "place"): - eval(command_with_args) + self._pickup(args, response[i + 1][1]) i += 1 else: logger.error("Pickup without place! Doing nothing.") else: + command_with_args = "self." + command + '("' + args + '")' eval(command_with_args) else: logger.error(f"Skipping unknown command: {command}") From a2b095137dd81805499418acfa1fe44cea016971 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 13:55:22 -0400 Subject: [PATCH 400/410] break if used without llm agent --- src/stretch/app/ai_pickup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/stretch/app/ai_pickup.py b/src/stretch/app/ai_pickup.py index 46c81c616..490e65ac5 100644 --- a/src/stretch/app/ai_pickup.py +++ b/src/stretch/app/ai_pickup.py @@ -182,6 +182,9 @@ def main( # Send the robot home at the end! 
agent.go_home() + if llm_client is None: + break + # At the end, disable everything robot.stop() From 61da5d216fc3ca2900b09930d3d75b14c8a05854 Mon Sep 17 00:00:00 2001 From: Atharva Pusalkar Date: Thu, 10 Oct 2024 14:03:58 -0400 Subject: [PATCH 401/410] Pass in commands and args --- src/stretch/llms/prompts/pickup_prompt.py | 31 ++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index de433b27b..b637f8366 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -154,7 +154,36 @@ def configure(self, **kwargs) -> str: def parse_response(self, response: str): """Parse the pickup, place, and say commands from the response into a list.""" commands = [line for line in response.split("\n") if line] - return commands + + commands_with_args = [] + + for command in commands: + if command.startswith("pickup("): + commands_with_args.append(("pickup", command[7:-1])) + elif command.startswith("place("): + commands_with_args.append(("place", command[6:-1])) + elif command.startswith("say("): + commands_with_args.append(("say", command[4:-1])) + elif command.startswith("wave()"): + commands_with_args.append(("wave", "")) + elif command.startswith("go_home()"): + commands_with_args.append(("go_home", "")) + elif command.startswith("explore("): + commands_with_args.append(("explore", command[8:-1])) + elif command.startswith("nod_head()"): + commands_with_args.append(("nod_head", "")) + elif command.startswith("shake_head()"): + commands_with_args.append(("shake_head", "")) + elif command.startswith("avert_gaze()"): + commands_with_args.append(("avert_gaze", "")) + elif command.startswith("find("): + commands_with_args.append(("find", command[5:-1])) + elif command.startswith("quit()"): + commands_with_args.append(("quit", "")) + else: + raise ValueError(f"Unknown command: {command}") + + return commands_with_args def get_available_actions(self) -> List[str]: return super().get_available_actions() + [ From a3ac6595588ba4570248cdf5edef53f94e189088 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 14:07:15 -0400 Subject: [PATCH 402/410] update --- src/stretch/agent/task/pickup/pickup_executor.py | 4 +++- src/stretch/llms/prompts/pickup_prompt.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index 5d96da842..88de58574 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -307,6 +307,8 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: self._task = Task() self._operation_count = 0 + print(response) + if response is None or len(response) == 0: logger.error("No commands to execute!") self.agent.robot_say("I'm sorry, I didn't understand that.") @@ -322,7 +324,7 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: if command in self.available_actions: if command == "pickup": if (i + 1 < len(response)) and (response[i + 1][0] == "place"): - self._pickup(args, response[i + 1][1]) + self.pickup(args, response[i + 1][1]) i += 1 else: logger.error("Pickup without place! 
Doing nothing.") diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index b637f8366..d081a9418 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -158,6 +158,7 @@ def parse_response(self, response: str): commands_with_args = [] for command in commands: + print(command) if command.startswith("pickup("): commands_with_args.append(("pickup", command[7:-1])) elif command.startswith("place("): @@ -180,6 +181,8 @@ def parse_response(self, response: str): commands_with_args.append(("find", command[5:-1])) elif command.startswith("quit()"): commands_with_args.append(("quit", "")) + elif command.startswith("end()"): + break else: raise ValueError(f"Unknown command: {command}") From 62cdedf7cde379bfac5b89bdb6c2262b5daf3ab4 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 14:18:08 -0400 Subject: [PATCH 403/410] add pickup --- .../agent/task/pickup/pickup_executor.py | 277 +++--------------- src/stretch/llms/prompts/pickup_prompt.py | 85 +++--- 2 files changed, 91 insertions(+), 271 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index 88de58574..6e97a9f90 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -9,29 +9,10 @@ from typing import List, Tuple -from stretch.agent.operations import ( - AvertGazeOperation, - ExploreOperation, - GoHomeOperation, - GoToNavOperation, - GraspObjectOperation, - NavigateToObjectOperation, - NodHeadOperation, - OpenLoopGraspObjectOperation, - PlaceObjectOperation, - PreGraspObjectOperation, - RotateInPlaceOperation, - SearchForObjectOnFloorOperation, - SearchForReceptacleOperation, - ShakeHeadOperation, - SpeakOperation, - WaveOperation, -) from stretch.agent.robot_agent import RobotAgent from stretch.agent.task.emote import EmoteTask from stretch.agent.task.pickup.pickup_task import PickupTask from stretch.core import AbstractRobotClient -from stretch.core.task import Task from stretch.utils.logger import Logger logger = Logger(__name__) @@ -48,10 +29,10 @@ def __init__( self, robot: AbstractRobotClient, agent: RobotAgent, - available_actions: List[str], match_method: str = "feature", open_loop: bool = False, dry_run: bool = False, + available_actions: List[str] = None, ) -> None: """Initialize the executor. @@ -62,6 +43,7 @@ def __init__( """ self.robot = robot self.agent = agent + self.available_actions = available_actions # Do type checks if not isinstance(self.robot, AbstractRobotClient): @@ -76,204 +58,6 @@ def __init__( self._match_method = match_method self._open_loop = open_loop - # Managed task - self._task = Task() - self._operation_count = 0 - self.available_actions = available_actions - - def say(self, text: str) -> None: - """Use the robot to say the text.""" - say_operation = SpeakOperation( - f"{str(self._operation_count)}_say_" + text, agent=self.agent, robot=self.robot - ) - say_operation.configure(message=text) - self._task.add_operation(say_operation, True) - self._operation_count += 1 - - def pickup(self, target_object: str, target_receptacle: str) -> None: - """Pick up the object and place it in the receptacle.""" - # Put the robot into navigation mode - go_to_navigation_mode = GoToNavOperation( - f"{str(self._operation_count)}_go to navigation mode", self.agent, retry_on_failure=True - ) - self._operation_count += 1 - - # Spin in place to find objects. 
- rotate_in_place = RotateInPlaceOperation( - f"{str(self._operation_count)}_rotate_in_place", - self.agent, - parent=go_to_navigation_mode, - ) - self._operation_count += 1 - - # Look for the target receptacle - search_for_receptacle = SearchForReceptacleOperation( - f"{str(self._operation_count)}_search_for_{self.target_receptacle}", - self.agent, - parent=rotate_in_place, - retry_on_failure=True, - match_method=self._match_method, - ) - self._operation_count += 1 - - # Try to expand the frontier and find an object; or just wander around for a while. - search_for_object = SearchForObjectOnFloorOperation( - f"{str(self._operation_count)}_search_for_{self.target_object}_on_floor", - self.agent, - retry_on_failure=True, - match_method=self._match_method, - ) - self._operation_count += 1 - if self.agent.target_object is not None: - # Overwrite the default object to search for - search_for_object.set_target_object_class(self.agent.target_object) - if self.agent.target_receptacle is not None: - search_for_receptacle.set_target_object_class(self.agent.target_receptacle) - - # After searching for object, we should go to an instance that we've found. If we cannot do that, keep searching. - go_to_object = NavigateToObjectOperation( - f"{str(self._operation_count)}_go_to_object", - self.agent, - parent=search_for_object, - on_cannot_start=search_for_object, - to_receptacle=False, - ) - self._operation_count += 1 - - # After searching for object, we should go to an instance that we've found. If we cannot do that, keep searching. - go_to_receptacle = NavigateToObjectOperation( - f"{str(self._operation_count)}_go_to_receptacle", - self.agent, - on_cannot_start=search_for_receptacle, - to_receptacle=True, - ) - self._operation_count += 1 - - # When about to start, run object detection and try to find the object. If not in front of us, explore again. - # If we cannot find the object, we should go back to the search_for_object operation. - # To determine if we can start, we just check to see if there's a detectable object nearby. - pregrasp_object = PreGraspObjectOperation( - f"{str(self._operation_count)}_prepare_to_grasp", - self.agent, - on_failure=None, - on_cannot_start=go_to_object, - retry_on_failure=True, - ) - self._operation_count += 1 - # If we cannot start, we should go back to the search_for_object operation. - # To determine if we can start, we look at the end effector camera and see if there's anything detectable. 
- if self.use_visual_servoing_for_grasp: - grasp_object = GraspObjectOperation( - f"{str(self._operation_count)}_grasp_the_{self.target_object}", - self.agent, - parent=pregrasp_object, - on_failure=pregrasp_object, - on_cannot_start=go_to_object, - retry_on_failure=False, - ) - grasp_object.set_target_object_class(self.agent.target_object) - grasp_object.servo_to_grasp = True - grasp_object.match_method = self._match_method - self._operation_count += 1 - else: - grasp_object = OpenLoopGraspObjectOperation( - f"{str(self._operation_count)}_grasp_the_{self.target_object}", - self.agent, - parent=pregrasp_object, - on_failure=pregrasp_object, - on_cannot_start=go_to_object, - retry_on_failure=False, - ) - grasp_object.set_target_object_class(self.agent.target_object) - grasp_object.match_method = self._match_method - self._operation_count += 1 - - place_object_on_receptacle = PlaceObjectOperation( - f"{str(self._operation_count)}_place_object_on_receptacle", - self.agent, - on_cannot_start=go_to_receptacle, - ) - self._operation_count += 1 - - self._task.add_operation(go_to_navigation_mode) - self._task.add_operation(rotate_in_place) - self._task.add_operation(search_for_receptacle) - self._task.add_operation(search_for_object) - self._task.add_operation(go_to_object) - self._task.add_operation(pregrasp_object) - self._task.add_operation(grasp_object) - self._task.add_operation(go_to_receptacle) - self._task.add_operation(place_object_on_receptacle) - - self._task.connect_on_success(go_to_navigation_mode.name, search_for_receptacle.name) - self._task.connect_on_success(search_for_receptacle.name, search_for_object.name) - self._task.connect_on_success(search_for_object.name, go_to_object.name) - self._task.connect_on_success(go_to_object.name, pregrasp_object.name) - self._task.connect_on_success(pregrasp_object.name, grasp_object.name) - self._task.connect_on_success(grasp_object.name, go_to_receptacle.name) - self._task.connect_on_success(go_to_receptacle.name, place_object_on_receptacle.name) - - self._task.connect_on_success(search_for_receptacle.name, search_for_object.name) - - self._task.connect_on_cannot_start(go_to_object.name, search_for_object.name) - - def wave(self) -> None: - """Wave to the user.""" - wave_operation = WaveOperation( - f"{str(self._operation_count)}_wave", self.agent, robot=self.robot - ) - self._task.add_operation(wave_operation, True) - self._operation_count += 1 - - def go_home(self) -> None: - """Go back to the home position.""" - go_home_operation = GoHomeOperation( - f"{str(self._operation_count)}_go_home", self.agent, robot=self.robot - ) - self._task.add_operation(go_home_operation, True) - self._operation_count += 1 - - def explore(self) -> None: - """Explore the environment.""" - explore_operation = ExploreOperation( - f"{str(self._operation_count)}_explore", self.agent, robot=self.robot - ) - self._task.add_operation(explore_operation, True) - self._operation_count += 1 - - def nod_head(self) -> None: - """Nod the head.""" - nod_head_operation = NodHeadOperation( - f"{str(self._operation_count)}_nod_head", self.agent, robot=self.robot - ) - self._task.add_operation(nod_head_operation, True) - self._operation_count += 1 - - def shake_head(self) -> None: - """Shake the head.""" - shake_head_operation = ShakeHeadOperation( - f"{str(self._operation_count)}_shake_head", self.agent, robot=self.robot - ) - self._task.add_operation(shake_head_operation, True) - self._operation_count += 1 - - def avert_gaze(self) -> None: - """Avert the gaze.""" - 
avert_gaze_operation = AvertGazeOperation( - f"{str(self._operation_count)}_avert_gaze", self.agent, robot=self.robot - ) - self._task.add_operation(avert_gaze_operation, True) - self._operation_count += 1 - - def find(self, object_name: str) -> None: - """Find the object.""" - speak_not_implemented = SpeakOperation( - f"{str(self._operation_count)}_find_" + object_name, agent=self.agent, robot=self.robot - ) - speak_not_implemented.configure(message="Find operation not implemented") - self._task.add_operation(speak_not_implemented, True) - self._operation_count += 1 - def _pickup(self, target_object: str, target_receptacle: str) -> None: # After the robot has started... @@ -304,10 +88,6 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: True if we should keep going, False if we should stop. """ i = 0 - self._task = Task() - self._operation_count = 0 - - print(response) if response is None or len(response) == 0: logger.error("No commands to execute!") @@ -320,22 +100,49 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: while i < len(response): command, args = response[i] logger.info(f"{i} {command} {args}") - - if command in self.available_actions: - if command == "pickup": - if (i + 1 < len(response)) and (response[i + 1][0] == "place"): - self.pickup(args, response[i + 1][1]) - i += 1 - else: - logger.error("Pickup without place! Doing nothing.") + if command == "say": + # Use TTS to say the text + logger.info(f"[Pickup task] Saying: {args}") + self.agent.robot_say(args) + elif command == "pickup": + logger.info(f"[Pickup task] Pickup: {args}") + target_object = args + i += 1 + next_command, next_args = response[i] + if next_command != "place": + i -= 1 + logger.error("Pickup without place! Doing nothing.") else: - command_with_args = "self." + command + '("' + args + '")' - eval(command_with_args) + logger.info(f"{i} {next_command} {next_args}") + logger.info(f"[Pickup task] Place: {next_args}") + target_receptacle = next_args + self._pickup(target_object, target_receptacle) + elif command == "place": + logger.error("Place without pickup! Doing nothing.") + elif command == "wave": + self.agent.move_to_manip_posture() + self.emote_task.get_task("wave").run() + self.agent.move_to_manip_posture() + elif command == "go_home": + self.agent.go_home() + elif command == "explore": + self.agent.explore() + elif command == "nod_head": + self.emote_task.get_task("nod_head").run() + elif command == "shake_head": + self.emote_task.get_task("shake_head").run() + elif command == "avert_gaze": + self.emote_task.get_task("avert_gaze").run() + elif command == "quit": + logger.info("[Pickup task] Quitting.") + self.robot.stop() + return False + elif command == "end": + logger.info("[Pickup task] Ending.") + break else: logger.error(f"Skipping unknown command: {command}") i += 1 - - self._task.run() - + # If we did not explicitly receive a quit command, we are not yet done. 
return True diff --git a/src/stretch/llms/prompts/pickup_prompt.py b/src/stretch/llms/prompts/pickup_prompt.py index d081a9418..80e890cc3 100644 --- a/src/stretch/llms/prompts/pickup_prompt.py +++ b/src/stretch/llms/prompts/pickup_prompt.py @@ -153,54 +153,67 @@ def configure(self, **kwargs) -> str: def parse_response(self, response: str): """Parse the pickup, place, and say commands from the response into a list.""" - commands = [line for line in response.split("\n") if line] - - commands_with_args = [] + commands = [] + for line in response.split("\n"): + if line.startswith("pickup("): + commands.append(line) + elif line.startswith("place("): + commands.append(line) + elif line.startswith("say("): + commands.append(line) + elif line.startswith("wave()"): + commands.append(line) + elif line.startswith("go_home()"): + commands.append(line) + elif line.startswith("explore()"): + commands.append(line) + elif line.startswith("nod_head()"): + commands.append(line) + elif line.startswith("shake_head()"): + commands.append(line) + elif line.startswith("avert_gaze()"): + commands.append(line) + elif line.startswith("quit()"): + commands.append(line) + elif line.startswith("find("): + commands.append(line) + elif line.startswith("end()"): + # Stop parsing if we see the end command + break + # Now go through commands and parse into a tuple (command, args) + parsed_commands = [] for command in commands: - print(command) - if command.startswith("pickup("): - commands_with_args.append(("pickup", command[7:-1])) + if command.startswith("say("): + parsed_commands.append(("say", command[4:-1])) + elif command.startswith("pickup("): + parsed_commands.append(("pickup", command[7:-1])) elif command.startswith("place("): - commands_with_args.append(("place", command[6:-1])) - elif command.startswith("say("): - commands_with_args.append(("say", command[4:-1])) + parsed_commands.append(("place", command[6:-1])) elif command.startswith("wave()"): - commands_with_args.append(("wave", "")) + parsed_commands.append(("wave", "")) elif command.startswith("go_home()"): - commands_with_args.append(("go_home", "")) - elif command.startswith("explore("): - commands_with_args.append(("explore", command[8:-1])) + parsed_commands.append(("go_home", "")) + elif command.startswith("explore()"): + parsed_commands.append(("explore", "")) elif command.startswith("nod_head()"): - commands_with_args.append(("nod_head", "")) + parsed_commands.append(("nod_head", "")) elif command.startswith("shake_head()"): - commands_with_args.append(("shake_head", "")) + parsed_commands.append(("shake_head", "")) elif command.startswith("avert_gaze()"): - commands_with_args.append(("avert_gaze", "")) + parsed_commands.append(("avert_gaze", "")) elif command.startswith("find("): - commands_with_args.append(("find", command[5:-1])) + parsed_commands.append(("find", command[5:-1])) elif command.startswith("quit()"): - commands_with_args.append(("quit", "")) + # Quit actually shuts down the robot. 
+ parsed_commands.append(("quit", "")) + break elif command.startswith("end()"): + # Stop parsing if we see the end command + # This really shouldn't happen, but just in case break - else: - raise ValueError(f"Unknown command: {command}") - - return commands_with_args - - def get_available_actions(self) -> List[str]: - return super().get_available_actions() + [ - "pickup", - "place", - "say", - "wave", - "go_home", - "explore", - "nod_head", - "shake_head", - "avert_gaze", - "find", - ] + + return parsed_commands def get_object(self, response: List[Tuple[str, str]]) -> str: """Return the object from the response.""" From 9690722f2d13cf69337e64cfdcc636bf058fb47d Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 14:22:39 -0400 Subject: [PATCH 404/410] update pickup executor --- src/stretch/agent/task/pickup/pickup_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/task/pickup/pickup_executor.py b/src/stretch/agent/task/pickup/pickup_executor.py index 6e97a9f90..57c8d4826 100644 --- a/src/stretch/agent/task/pickup/pickup_executor.py +++ b/src/stretch/agent/task/pickup/pickup_executor.py @@ -99,10 +99,10 @@ def __call__(self, response: List[Tuple[str, str]]) -> bool: # Else, execute things as they come while i < len(response): command, args = response[i] - logger.info(f"{i} {command} {args}") + logger.info(f"Command: {i} {command} {args}") if command == "say": # Use TTS to say the text - logger.info(f"[Pickup task] Saying: {args}") + logger.info(f"Saying: {args}") self.agent.robot_say(args) elif command == "pickup": logger.info(f"[Pickup task] Pickup: {args}") From a5ad4471839b2548fa091daa2e805ce3f49e601b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 14:51:53 -0400 Subject: [PATCH 405/410] update zmq client --- src/stretch/agent/robot_agent.py | 40 +++++++++++-------------- src/stretch/agent/zmq_client.py | 24 +-------------- src/stretch/app/mapping.py | 2 +- src/stretch/config/default_planner.yaml | 4 +-- 4 files changed, 21 insertions(+), 49 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 6c2cd4b8e..e81c31e53 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -20,7 +20,6 @@ import numpy as np import torch -import zmq from PIL import Image import stretch.utils.logger as logger @@ -181,10 +180,10 @@ def __init__( self._get_observations_thread.start() # Set up ZMQ subscriber - self.context = self.robot.get_zmq_context() - self.sub_socket = self.context.socket(zmq.SUB) - self.sub_socket.connect(f"tcp://localhost:{obs_sub_port}") - self.sub_socket.setsockopt_string(zmq.SUBSCRIBE, "") + # self.context = self.robot.get_zmq_context() + # self.sub_socket = self.context.socket(zmq.SUB) + # self.sub_socket.connect(f"tcp://localhost:{obs_sub_port}") + # self.sub_socket.setsockopt_string(zmq.SUBSCRIBE, "") else: self._update_map_thread = None self._get_observations_thread = None @@ -499,34 +498,29 @@ def get_observations_loop(self) -> None: """Threaded function that gets observations in real-time. 
This is useful for when we are processing real-time updates.""" while self.robot.running: - obs = None t0 = timeit.default_timer() self._obs_history_lock.acquire() - while obs is None: - # obs = self.robot.get_observation() - obs = self.sub_socket.recv_pyobj() - # print(obs) - if obs is None: - continue - - obs = Observations.from_dict(obs) + obs = self.robot.get_observation() + if obs is None: + time.sleep(0.1) + continue - if (len(self.obs_history) > 0) and ( - obs.lidar_timestamp == self.obs_history[-1].lidar_timestamp - ): - obs = None - t1 = timeit.default_timer() - if t1 - t0 > 10: - logger.error("Failed to get observation") - break + if (len(self.obs_history) > 0) and ( + obs.lidar_timestamp == self.obs_history[-1].lidar_timestamp + ): + obs = None + t1 = timeit.default_timer() + if t1 - t0 > 10: + logger.error("Failed to get observation") + break # t1 = timeit.default_timer() self.obs_history.append(obs) self._obs_history_lock.release() self.obs_count += 1 - time.sleep(0.05) + time.sleep(0.1) def update_map_with_pose_graph(self): """Update our voxel map using a pose graph""" diff --git a/src/stretch/agent/zmq_client.py b/src/stretch/agent/zmq_client.py index 0216d0d9f..cd063ed75 100644 --- a/src/stretch/agent/zmq_client.py +++ b/src/stretch/agent/zmq_client.py @@ -79,19 +79,6 @@ def get_zmq_context(self) -> zmq.Context: """ return self.context - def _create_pub_obs_socket(self, port: int): - send_socket = self.context.socket(zmq.PUB) - send_socket.setsockopt(zmq.SNDHWM, 1) - send_socket.setsockopt(zmq.RCVHWM, 1) - - # Publish within the computer - send_address = "tcp://*:" + str(port) - print(f"Binding to {send_address} to send action messages...") - send_socket.bind(send_address) - print("...bound.") - - return send_socket - def __init__( self, robot_ip: str = "", @@ -195,12 +182,7 @@ def __init__( recv_servo_port, robot_ip, use_remote_computer, message_type="visual servoing data" ) - if self._publish_observations: - self.pub_obs_socket = self._create_pub_obs_socket(pub_obs_port) - else: - self.pub_obs_socket = None - - # SEnd actions back to the robot for execution + # Send actions back to the robot for execution self.send_socket = self.context.socket(zmq.PUB) self.send_socket.setsockopt(zmq.SNDHWM, 1) self.send_socket.setsockopt(zmq.RCVHWM, 1) @@ -1091,8 +1073,6 @@ def _update_obs(self, obs): """Update observation internally with lock""" with self._obs_lock: self._obs = obs - if self._publish_observations: - self.pub_obs_socket.send_pyobj(obs) self._last_step = obs["step"] if self._iter <= 0: self._iter = max(self._last_step, self._iter) @@ -1626,8 +1606,6 @@ def stop(self): self.recv_state_socket.close() self.recv_servo_socket.close() self.send_socket.close() - if self.pub_obs_socket is not None: - self.pub_obs_socket.close() self.context.term() diff --git a/src/stretch/app/mapping.py b/src/stretch/app/mapping.py index 74440704b..2e15c2cd7 100644 --- a/src/stretch/app/mapping.py +++ b/src/stretch/app/mapping.py @@ -52,7 +52,7 @@ @click.option("--reset", is_flag=True, help="Reset the robot to origin before starting") @click.option( "--enable-realtime-updates", - "--enable-realtime-updates", + "--enable_realtime_updates", is_flag=True, help="Enable real-time updates so the robot will scan its environment and update the map as it moves around", ) diff --git a/src/stretch/config/default_planner.yaml b/src/stretch/config/default_planner.yaml index 386d1ed23..d88efd670 100644 --- a/src/stretch/config/default_planner.yaml +++ b/src/stretch/config/default_planner.yaml @@ -40,8 +40,8 @@ 
filters: # smooth_kernel_size: 4 # smooth_kernel_size: 0 use_median_filter: True - # median_filter_size: 4 - median_filter_size: 2 + median_filter_size: 4 + # median_filter_size: 2 median_filter_max_error: 0.01 use_derivative_filter: False derivative_filter_threshold: 0.1 From 94281692dcf00e8e13e9f86b7629db5221f5aacf Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 15:25:21 -0400 Subject: [PATCH 406/410] fixes --- src/stretch/agent/robot_agent.py | 62 ++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index e81c31e53..6659b2fe9 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -498,23 +498,31 @@ def get_observations_loop(self) -> None: """Threaded function that gets observations in real-time. This is useful for when we are processing real-time updates.""" while self.robot.running: + obs = None t0 = timeit.default_timer() self._obs_history_lock.acquire() - obs = self.robot.get_observation() - if obs is None: - time.sleep(0.1) - continue + while obs is None: + obs = self.robot.get_observation() + # obs = self.sub_socket.recv_pyobj() + # print(obs) + if obs is None: + continue - if (len(self.obs_history) > 0) and ( - obs.lidar_timestamp == self.obs_history[-1].lidar_timestamp - ): - obs = None - t1 = timeit.default_timer() - if t1 - t0 > 10: - logger.error("Failed to get observation") - break + # obs = Observations.from_dict(obs) + + if (len(self.obs_history) > 0) and ( + obs.lidar_timestamp == self.obs_history[-1].lidar_timestamp + ): + obs = None + t1 = timeit.default_timer() + if t1 - t0 > 10: + logger.error("Failed to get observation") + break + + # Add a delay to make sure we don't get too many observations + time.sleep(0.05) # t1 = timeit.default_timer() self.obs_history.append(obs) @@ -522,7 +530,7 @@ def get_observations_loop(self) -> None: self.obs_count += 1 time.sleep(0.1) - def update_map_with_pose_graph(self): + def update_map_with_pose_graph(self, verbose: bool = True): """Update our voxel map using a pose graph""" t0 = timeit.default_timer() @@ -539,7 +547,8 @@ def update_map_with_pose_graph(self): for vertex in self.pose_graph: if abs(vertex[0] - lidar_timestamp) < self._realtime_temporal_threshold: - # print(f"Exact match found! {vertex[0]} and obs {idx}: {lidar_timestamp}") + if verbose: + print(f"Exact match found! {vertex[0]} and obs {idx}: {lidar_timestamp}") self.obs_history[idx].is_pose_graph_node = True self.obs_history[idx].gps = np.array([vertex[1], vertex[2]]) @@ -549,9 +558,10 @@ def update_map_with_pose_graph(self): ] ) - # print( - # f"obs gps: {self.obs_history[idx].gps}, compass: {self.obs_history[idx].compass}" - # ) + if verbose: + print( + f"obs gps: {self.obs_history[idx].gps}, compass: {self.obs_history[idx].compass}" + ) if ( self.obs_history[idx].task_observations is None @@ -564,7 +574,8 @@ def update_map_with_pose_graph(self): < self._realtime_matching_distance and self.obs_history[idx].pose_graph_timestamp is None ): - # print(f"Close match found! {vertex[0]} and obs {idx}: {lidar_timestamp}") + if verbose: + print(f"Close match found! {vertex[0]} and obs {idx}: {lidar_timestamp}") self.obs_history[idx].is_pose_graph_node = True self.obs_history[idx].pose_graph_timestamp = vertex[0] @@ -595,14 +606,21 @@ def update_map_with_pose_graph(self): self.obs_history[idx].compass = new_compass t2 = timeit.default_timer() - # print(f"Done updating past observations. 
Time: {t2- t1}") + if verbose: + print(f"Done updating past observations. Time: {t2- t1}") + + print("Updating voxel map") - # print("Updating voxel map") t3 = timeit.default_timer() self.voxel_map.reset() + added = 0 for obs in self.obs_history: if obs.is_pose_graph_node: self.voxel_map.add_obs(obs) + added += 1 + if verbose: + print("----") + print(f"Added {added} observations to voxel map") self._obs_history_lock.release() @@ -1912,10 +1930,10 @@ def execute(self, plan: str) -> bool: def get_object_centric_world_representation( self, instance_memory, - max_context_length: int, - sample_strategy: str, task: str = None, text_features=None, + max_context_length=20, + sample_strategy="clip", scene_graph=None, ) -> ObjectCentricObservations: """Get version that LLM can handle - convert images into torch if not already in that format. This will also clip the number of instances to the max context length. From b50c13e1645f23116c04b01ee235b805bb28765b Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 15:27:12 -0400 Subject: [PATCH 407/410] update --- src/stretch/agent/robot_agent.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 6659b2fe9..042c98489 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -621,6 +621,7 @@ def update_map_with_pose_graph(self, verbose: bool = True): if verbose: print("----") print(f"Added {added} observations to voxel map") + print() self._obs_history_lock.release() From 8532365636e7616d88906768b87912f132fdcec8 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 16:46:22 -0400 Subject: [PATCH 408/410] update --- src/stretch/agent/robot_agent.py | 297 ++++++++++++++++--------------- 1 file changed, 156 insertions(+), 141 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index 042c98489..de93d6604 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -152,6 +152,9 @@ def __init__( # Previously sampled goal during exploration self._previous_goal = None + # Track if we are still running + self._running = True + # Create a simple motion planner self.planner: Planner = RRTConnect(self.space, self.space.is_valid) if parameters["motion_planner"]["shortcut_plans"]: @@ -165,12 +168,14 @@ def __init__( min_angle=parameters["motion_planner"]["simplify"]["min_angle"], ) + # Locks + self._robot_lock = Lock() + self._obs_history_lock = Lock() + self._map_lock = Lock() + if self._realtime_updates: logger.alert("Using real-time updates!") - # Locks - self._robot_lock = Lock() - self._obs_history_lock = Lock() # Map updates self._update_map_thread = Thread(target=self.update_map_loop) self._update_map_thread.start() @@ -197,8 +202,9 @@ def get_voxel_map(self) -> SparseVoxelMap: return self.voxel_map def __del__(self): - if self._update_map_thread is not None and self._update_map_thread.is_alive(): - self._update_map_thread.join() + self._running = False + # if self._update_map_thread is not None and self._update_map_thread.is_alive(): + # self._update_map_thread.join() def _create_voxel_map(self, parameters: Parameters) -> SparseVoxelMap: """Create a voxel map from parameters. @@ -497,7 +503,7 @@ def show_map(self) -> None: def get_observations_loop(self) -> None: """Threaded function that gets observations in real-time. 
This is useful for when we are processing real-time updates.""" - while self.robot.running: + while self.robot.running and self._running: obs = None t0 = timeit.default_timer() @@ -530,8 +536,13 @@ def get_observations_loop(self) -> None: self.obs_count += 1 time.sleep(0.1) - def update_map_with_pose_graph(self, verbose: bool = True): - """Update our voxel map using a pose graph""" + def stop_updates(self): + """Stop the update threads.""" + self._running = False + + def update_map_with_pose_graph(self, verbose: bool = False): + """Update our voxel map using a pose graph. + Updates the map from pose graph. This is useful for when we are processing real-time updates.""" t0 = timeit.default_timer() self.pose_graph = self.robot.get_pose_graph() @@ -612,12 +623,13 @@ def update_map_with_pose_graph(self, verbose: bool = True): print("Updating voxel map") t3 = timeit.default_timer() - self.voxel_map.reset() - added = 0 - for obs in self.obs_history: - if obs.is_pose_graph_node: - self.voxel_map.add_obs(obs) - added += 1 + with self._map_lock: + self.voxel_map.reset() + added = 0 + for obs in self.obs_history: + if obs.is_pose_graph_node: + self.voxel_map.add_obs(obs) + added += 1 if verbose: print("----") print(f"Added {added} observations to voxel map") @@ -669,7 +681,7 @@ def update_map_with_pose_graph(self, verbose: bool = True): def update_map_loop(self): """Threaded function that updates our voxel map in real-time.""" - while self.robot.running: + while self.robot.running and self._running: with self._robot_lock: self.update_map_with_pose_graph() time.sleep(0.5) @@ -888,39 +900,40 @@ def plan_to_instance( rotation_offset(float): added to rotation of goal to change final relative orientation """ - res = None - if verbose: - for j, view in enumerate(instance.instance_views): - print(f"- instance {instance_id} view {j} at {view.cam_to_world}") + with self._map_lock: + res = None + if verbose: + for j, view in enumerate(instance.instance_views): + print(f"- instance {instance_id} view {j} at {view.cam_to_world}") - start_is_valid = self.space.is_valid(start, verbose=False) - if not start_is_valid: - return PlanResult(success=False, reason="invalid start state") + start_is_valid = self.space.is_valid(start, verbose=False) + if not start_is_valid: + return PlanResult(success=False, reason="invalid start state") - # plan to the sampled goal - has_plan = False - if use_cache and instance_id >= 0 and instance_id in self._cached_plans: - res = self._cached_plans[instance_id] - has_plan = res.success - if verbose: - print(f"- try retrieving cached plan for {instance_id}: {has_plan=}") - - # Plan to the instance - if not has_plan: - # Call planner - res = self.plan_to_bounds( - instance.bounds, - start, - verbose, - max_tries, - radius_m, - rotation_offset=rotation_offset, - ) - if instance_id >= 0: - self._cached_plans[instance_id] = res + # plan to the sampled goal + has_plan = False + if use_cache and instance_id >= 0 and instance_id in self._cached_plans: + res = self._cached_plans[instance_id] + has_plan = res.success + if verbose: + print(f"- try retrieving cached plan for {instance_id}: {has_plan=}") + + # Plan to the instance + if not has_plan: + # Call planner + res = self.plan_to_bounds( + instance.bounds, + start, + verbose, + max_tries, + radius_m, + rotation_offset=rotation_offset, + ) + if instance_id >= 0: + self._cached_plans[instance_id] = res - # Finally, return plan result - return res + # Finally, return plan result + return res def plan_to_bounds( self, @@ -943,45 +956,46 @@ def 
plan_to_bounds( PlanResult: the result of the motion planner """ - mask = self.voxel_map.mask_from_bounds(bounds) - try_count = 0 - res = None - start_is_valid = self.space.is_valid(start, verbose=False) - - conservative_sampling = True - # We will sample conservatively, staying away from obstacles and the edges of explored space -- at least at first. - for goal in self.space.sample_near_mask( - mask, - radius_m=radius_m, - conservative=conservative_sampling, - rotation_offset=rotation_offset, - ): - goal = goal.cpu().numpy() - if verbose: - print(" Start:", start) - print("Sampled Goal:", goal) - show_goal = np.zeros(3) - show_goal[:2] = goal[:2] - goal_is_valid = self.space.is_valid(goal, verbose=False) - if verbose: - print("Start is valid:", start_is_valid) - print(" Goal is valid:", goal_is_valid) - if not goal_is_valid: + with self._map_lock: + mask = self.voxel_map.mask_from_bounds(bounds) + try_count = 0 + res = None + start_is_valid = self.space.is_valid(start, verbose=False) + + conservative_sampling = True + # We will sample conservatively, staying away from obstacles and the edges of explored space -- at least at first. + for goal in self.space.sample_near_mask( + mask, + radius_m=radius_m, + conservative=conservative_sampling, + rotation_offset=rotation_offset, + ): + goal = goal.cpu().numpy() + if verbose: + print(" Start:", start) + print("Sampled Goal:", goal) + show_goal = np.zeros(3) + show_goal[:2] = goal[:2] + goal_is_valid = self.space.is_valid(goal, verbose=False) if verbose: - print(" -> resample goal.") - continue + print("Start is valid:", start_is_valid) + print(" Goal is valid:", goal_is_valid) + if not goal_is_valid: + if verbose: + print(" -> resample goal.") + continue - res = self.planner.plan(start, goal, verbose=False) - if verbose: - print("Found plan:", res.success) - try_count += 1 - if res.success or try_count > max_tries: - break + res = self.planner.plan(start, goal, verbose=False) + if verbose: + print("Found plan:", res.success) + try_count += 1 + if res.success or try_count > max_tries: + break - # Planning failed - if res is None: - return PlanResult(success=False, reason="no valid plans found") - return res + # Planning failed + if res is None: + return PlanResult(success=False, reason="no valid plans found") + return res def move_to_instance(self, instance: Instance, max_try_per_instance=10) -> bool: """Move to a specific instance. Goes until a motion plan is found. @@ -1446,71 +1460,72 @@ def plan_to_frontier( Returns: PlanResult: the result of the motion planner """ - start_is_valid = self.space.is_valid(start, verbose=True) - # if start is not valid move backwards a bit - if not start_is_valid: - if verbose: - print("Start not valid. 
Try to recover.") - ok = self.recover_from_invalid_start() - if not ok: - return PlanResult(False, reason="invalid start state") - else: - print("Planning to frontier...") - - # sample a goal - if random_goals: - goal = next(self.space.sample_random_frontier()).cpu().numpy() - else: - # Get frontier sampler - sampler = self.space.sample_closest_frontier( - start, - verbose=False, - expand_size=self._default_expand_frontier_size, - min_dist=self._frontier_min_dist, - step_dist=self._frontier_step_dist, - ) - for i, goal in enumerate(sampler): - if goal is None: - # No more positions to sample - if verbose: - print("No more frontiers to sample.") - break + with self._map_lock: + start_is_valid = self.space.is_valid(start, verbose=True) + # if start is not valid move backwards a bit + if not start_is_valid: + if verbose: + print("Start not valid. Try to recover.") + ok = self.recover_from_invalid_start() + if not ok: + return PlanResult(False, reason="invalid start state") + else: + print("Planning to frontier...") - # print("Sampled Goal is:", goal.cpu().numpy()) - if self._previous_goal is not None: - if np.linalg.norm(goal.cpu().numpy() - self._previous_goal) < 0.1: + # sample a goal + if random_goals: + goal = next(self.space.sample_random_frontier()).cpu().numpy() + else: + # Get frontier sampler + sampler = self.space.sample_closest_frontier( + start, + verbose=False, + expand_size=self._default_expand_frontier_size, + min_dist=self._frontier_min_dist, + step_dist=self._frontier_step_dist, + ) + for i, goal in enumerate(sampler): + if goal is None: + # No more positions to sample if verbose: - print("Same goal as last time. Skipping.") - self._previous_goal = goal.cpu().numpy() - continue - self._previous_goal = goal.cpu().numpy() + print("No more frontiers to sample.") + break - if self.space.is_valid(goal.cpu().numpy(), verbose=True): - if verbose: - print(" Start:", start) - print("Sampled Goal:", goal.cpu().numpy()) - else: - continue + # print("Sampled Goal is:", goal.cpu().numpy()) + if self._previous_goal is not None: + if np.linalg.norm(goal.cpu().numpy() - self._previous_goal) < 0.1: + if verbose: + print("Same goal as last time. Skipping.") + self._previous_goal = goal.cpu().numpy() + continue + self._previous_goal = goal.cpu().numpy() - # For debugging, we can set the random seed to 0 - if fix_random_seed: - np.random.seed(0) - random.seed(0) + if self.space.is_valid(goal.cpu().numpy(), verbose=True): + if verbose: + print(" Start:", start) + print("Sampled Goal:", goal.cpu().numpy()) + else: + continue - if push_locations_to_stack: - print("Pushing visited locations to stack...") - self.planner.space.push_locations_to_stack(self.get_history(reversed=True)) - # print("Planning to goal...") - res = self.planner.plan(start, goal.cpu().numpy()) - if res.success: - if verbose: - print("Plan successful!") - return res - else: - # print("Plan failed. Reason:", res.reason) - if verbose: - print("Plan failed. Reason:", res.reason) - return PlanResult(False, reason="no valid plans found") + # For debugging, we can set the random seed to 0 + if fix_random_seed: + np.random.seed(0) + random.seed(0) + + if push_locations_to_stack: + print("Pushing visited locations to stack...") + self.planner.space.push_locations_to_stack(self.get_history(reversed=True)) + # print("Planning to goal...") + res = self.planner.plan(start, goal.cpu().numpy()) + if res.success: + if verbose: + print("Plan successful!") + return res + else: + # print("Plan failed. 
Reason:", res.reason) + if verbose: + print("Plan failed. Reason:", res.reason) + return PlanResult(False, reason="no valid plans found") def get_history(self, reversed: bool = False) -> List[np.ndarray]: """Get the history of the robot's positions.""" From 924a1993ea0b58bc65b3b54eff38835c753db330 Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 16:47:43 -0400 Subject: [PATCH 409/410] update --- src/stretch/agent/robot_agent.py | 2 +- src/stretch/app/mapping.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index de93d6604..fc9b84610 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -536,7 +536,7 @@ def get_observations_loop(self) -> None: self.obs_count += 1 time.sleep(0.1) - def stop_updates(self): + def stop_realtime_updates(self): """Stop the update threads.""" self._running = False diff --git a/src/stretch/app/mapping.py b/src/stretch/app/mapping.py index 2e15c2cd7..ba5cf485f 100644 --- a/src/stretch/app/mapping.py +++ b/src/stretch/app/mapping.py @@ -52,7 +52,7 @@ @click.option("--reset", is_flag=True, help="Reset the robot to origin before starting") @click.option( "--enable-realtime-updates", - "--enable_realtime_updates", + "--enable_realtime-updates", is_flag=True, help="Enable real-time updates so the robot will scan its environment and update the map as it moves around", ) @@ -219,6 +219,10 @@ def demo_main( except Exception as e: raise (e) finally: + + # Stop updating the map + demo.stop_realtime_updates() + if show_final_map: pc_xyz, pc_rgb = demo.voxel_map.show() else: From 9051fe0c773afbb4292da923b3fc5c41263807db Mon Sep 17 00:00:00 2001 From: Chris Paxton Date: Thu, 10 Oct 2024 16:48:25 -0400 Subject: [PATCH 410/410] updates to robot agent --- src/stretch/agent/robot_agent.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py index fc9b84610..e5729373a 100644 --- a/src/stretch/agent/robot_agent.py +++ b/src/stretch/agent/robot_agent.py @@ -2057,5 +2057,6 @@ def save_map(self, filename: Optional[str] = None) -> None: # Backup the saved map memory.backup_saved_map() - # Write the new map to the file - self.voxel_map.write_to_pickle(filename) + with self._map_lock: + # Write the new map to the file + self.voxel_map.write_to_pickle(filename)