From 35a252862a9d33ec02a8be066e3ca888bd8dc798 Mon Sep 17 00:00:00 2001 From: Kavit Shah Date: Wed, 1 May 2024 18:37:01 -0700 Subject: [PATCH 1/7] Move Spot to Quest's location --- .../episodic_memory_robotic_fetch_quest3_c.py | 272 ++++++++++++++++++ .../perception_and_utils/utils/math_utils.py | 2 +- .../configs/ros_frame_names.yaml | 3 +- 3 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py diff --git a/aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py b/aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py new file mode 100644 index 00000000..182beb0d --- /dev/null +++ b/aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py @@ -0,0 +1,272 @@ +# Copyright (c) Meta Platforms, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import logging +from typing import Any, Dict, List, Optional, Tuple + +import click +import numpy as np +import rospy +import sophus as sp +from geometry_msgs.msg import PoseStamped +from perception_and_utils.utils.conversions import ( + ros_PoseStamped_to_sophus_SE3, + ros_TransformStamped_to_sophus_SE3, + sophus_SE3_to_ros_PoseStamped, + sophus_SE3_to_ros_TransformStamped, + xyt_to_sophus_SE3, +) +from spot_rl.envs.skill_manager import SpotSkillManager +from spot_rl.utils.utils import ros_frames as rf +from spot_wrapper.spot import Spot, SpotCamIds +from spot_wrapper.spot_qr_detector import SpotQRDetector +from std_msgs.msg import Bool + +# from spot_rl.envs.skill_manager import SpotSkillManager +# from spot_rl.utils.utils import ros_frames as rf +# from spot_wrapper.spot import Spot, SpotCamIds +# from spot_wrapper.spot_qr_detector import SpotQRDetector +# from std_msgs.msg import Bool +from tf2_ros import ( + ConnectivityException, + ExtrapolationException, + LookupException, + StaticTransformBroadcaster, +) +from tf2_ros.buffer import Buffer +from tf2_ros.transform_listener import TransformListener + +# For information on useful reference frame, please check https://github.com/facebookresearch/spot-sim2real?tab=readme-ov-file#eyeglasses-run-spot-aria-project-code + +# Waypoints on left of the dock to observe the QR code clearly +SPOT_DOCK_OBSERVER_WAYPOINT_LEFT = [ + 0.3840912007597151, + -0.5816728569741766, + 149.58030524832756, +] +# Waypoints on right of the dock to observe the QR code clearly +SPOT_DOCK_OBSERVER_WAYPOINT_RIGHT = [ + 0.5419185005639034, + 0.5319243891865243, + -154.1506754378722, +] + + +class EpisodicMemoryRoboticFetchQuest3: + """ + This class handles the core logic of the Episodic Memory Robotic Fetch + It begines with Spot looking at the marker and detecting it which results + in a static tf of marker w.r.t spotWorld frame being published. + + Then it waits for Aria to detect marker and publish ariaWorld w.r.t marker + as a static tf. This is done by AriaLiveReader node. This step ensures both + Spot & Aria worlds are now mutually transformable which is necessary for the + rest of the logic. + + It constantly listens to Aria's current pose w.r.t ariaWorld frame. + It also waits for Aria to detect the object and publish that pose of interest + (i.e. aria's cpf frame when it saw the object) w.r.t ariaWorld frame. + + Once both the poses are received, it waits for the spot_fetch trigger to be True + which will be set by the user via `rostopic pub` on cli. 
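+
+    A minimal trigger sketch from the CLI (assuming the topic wired up in
+    `__init__` below; the callback ignores the payload and simply latches the flag):
+    `rostopic pub /spot_fetch_trigger std_msgs/Bool "data: true" -1`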
+ + Once the trigger is True, it will run the following steps: + 1. Nav to Pose of interest (Aria's cpf frame when it saw the object) + 2. Pick the object (object is hardcoded to be bottle) + 3. Nav to Pose of interest (Aria wearer's last location before triggering spot_fetch) + 4. Place the object (object is hardcoded to be bottle) + + Args: + spot: Spot object + verbose: bool indicating whether to print debug logs + + Warning: Do not create objects of this class directly. It is a standalone node + """ + + def __init__(self, spot: Spot, verbose: bool = True, use_policies: bool = True): + # @FIXME: This is initializing a ros-node silently (at core of inheritence in + # SpotRobotSubscriberMixin). Make it explicit + self.skill_manager = SpotSkillManager( + spot=spot, + verbose=verbose, + use_policies=use_policies, + ) + + # Navigate Spot to SPOT_DOCK_OBSERVER_WAYPOINT_LEFT so that it can see the marker + status, msg = self.skill_manager.nav( + SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[0], + SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[1], + np.deg2rad(SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[2]), + ) + if not status: + rospy.logerr( + f"Failed to navigate to spot dock observer waypoint. Error: {msg}. Exiting..." + ) + return + + # Sit Spot down + self.skill_manager.sit() + + # Instantiate static transform broadcaster for publishing marker w.r.t spotWorld transforms + self.static_tf_broadcaster = StaticTransformBroadcaster() + + self._spot_fetch_trigger_sub = rospy.Subscriber( + "/spot_fetch_trigger", Bool, self.spot_fetch_trigger_callback + ) + self._spot_fetch_flag = False + + # Detect the marker and get the average pose of marker w.r.t spotWorld frame + cam_id = SpotCamIds.HAND_COLOR + spot_qr = SpotQRDetector(spot=spot, cam_ids=[cam_id]) + avg_spotWorld_T_marker = spot_qr.get_avg_spotWorld_T_marker(cam_id=cam_id) + + # Publish marker w.r.t spotWorld transforms for 5 seconds so it can be seen in rviz + start_time = rospy.Time.now() + while rospy.Time.now() - start_time < rospy.Duration(2.0): + self.static_tf_broadcaster.sendTransform( + sophus_SE3_to_ros_TransformStamped( + sp_se3=avg_spotWorld_T_marker, + parent_frame=rf.SPOT_WORLD, + child_frame=rf.MARKER, + ) + ) + rospy.sleep(0.5) + + # Instantiate publisher for publishing go-to pose for handoff + self._nav_PoseStamped_pub_for_place = rospy.Publisher( + "/nav_pose_for_place_viz", PoseStamped, queue_size=10 + ) + self.nav_xyt_for_handoff = None + + # Initialize static transform subscriber for listening to ariaWorld w.r.t marker transform + self._tf_buffer = Buffer() + self._tf_listener = TransformListener(self._tf_buffer) + + # Wait for transform from camera to spotWorld to be available + while not rospy.is_shutdown() and not self._spot_fetch_flag: + while not rospy.is_shutdown() and not self._tf_buffer.can_transform( + target_frame=rf.SPOT_WORLD, source_frame="camera", time=rospy.Time() + ): + rospy.logwarn_throttle( + 5.0, "Waiting for transform from camera to spotWorld" + ) + rospy.sleep(0.5) + try: + transform_stamped_spotWorld_T_camera = self._tf_buffer.lookup_transform( + target_frame=rf.SPOT_WORLD, + source_frame="camera", + time=rospy.Time(0), + ) + except (LookupException, ConnectivityException, ExtrapolationException): + raise RuntimeError( + "Unable to lookup transform from ariaWorld to spotWorld" + ) + spotWorld_T_camera = ros_TransformStamped_to_sophus_SE3( + ros_trf_stamped=transform_stamped_spotWorld_T_camera + ) + + self.nav_xyt_for_handoff = self.get_nav_xyt_to_wearer(spotWorld_T_camera) + # Publish place pose in spotWorld frame so it can be seen 
in rviz until spot_fetch is triggered + self._nav_PoseStamped_pub_for_place.publish( + sophus_SE3_to_ros_PoseStamped( + sp_se3=xyt_to_sophus_SE3( + np.asarray(self.nav_xyt_for_handoff, dtype=np.float32) + ), + parent_frame=rf.SPOT_WORLD, + ) + ) + + rospy.logwarn_throttle( + 5.0, "Waiting for demo trigger on /spot_fetch_trigger topic" + ) + rospy.sleep(1.0) + + # Set run status to true in the beginning + run_status = True + + if run_status: + # Nav to x,y,theta of interest (Aria's cpf frame when it saw the object) + nav_xyt_for_handoff = self.nav_xyt_for_handoff + rospy.loginfo(f"Nav 2D location for pick: {nav_xyt_for_handoff}") + + self.skill_manager.nav( + nav_xyt_for_handoff[0], nav_xyt_for_handoff[1], nav_xyt_for_handoff[2] + ) + + self.skill_manager.sit() + + def spot_fetch_trigger_callback(self, msg): + """ + Save _spot_fetch_flag as True when spot fetch is to be triggered + + Args: + msg (Bool): Message received on /spot_fetch_trigger topic + + Updates: + self._spot_fetch_flag (bool): True when spot fetch is to be triggered + """ + self._spot_fetch_flag = True + + def get_nav_xyt_to_wearer( + self, aria_pose: sp.SE3, shift_offset: float = 0.6 + ) -> Tuple[float, float, float]: + """ + Converts Sophus SE3 aria current pose to Tuple of x, y, theta, then flips it by 180 degrees and shifts it by a given offset. + At the end, the final x, y, theta is such that the robot will face the wearer. + + Args: + aria_pose (sp.SE3): Sophus SE3 object for aria's current pose (as cpf frame) w.r.t spotWorld frame + + Returns: + Tuple[float, float, float]: Tuple of x,y,theta as floats representing nav target for robot + """ + # ARROW STARTS FROM ORIGIN + + # get position and rotation as x, y, theta + position = aria_pose.translation() + # Find the angle made by CPF's z axis with spotWorld's x axis + # as robot should orient to the CPF's z axis. 
First 3 elements of + # column 3 from spotWorld_T_cpf represents cpf's z axis in spotWorld frame + cpf_z_axis_in_spotWorld = aria_pose.matrix()[:3, 2] + x_component_of_z_axis = cpf_z_axis_in_spotWorld[0] + y_component_of_z_axis = cpf_z_axis_in_spotWorld[1] + rotation = float( + np.arctan2( + y_component_of_z_axis, + x_component_of_z_axis, + ) + ) # tan^-1(y/x) + x, y, theta = position[0], position[1], rotation + + # push fwd this point along theta + x += shift_offset * np.cos(theta) + y += shift_offset * np.sin(theta) + # rotate theta by pi + theta += np.pi + + return (x, y, theta) + + +@click.command() +@click.option("--verbose", type=bool, default=True) +@click.option("--use-policies", type=bool, default=False) +def main(verbose: bool, use_policies: bool): + # rospy.init_node("episode") + # rospy.logwarn("Starting up ROS node") + + spot = Spot("EpisodicMemoryRoboticFetchQuest3Node") + with spot.get_lease(hijack=True) as lease: + if lease is None: + raise RuntimeError("Could not get lease") + else: + rospy.loginfo("Acquired lease") + + _ = EpisodicMemoryRoboticFetchQuest3( + spot=spot, verbose=verbose, use_policies=use_policies + ) + + +if __name__ == "__main__": + main() diff --git a/perception_and_utils_root/perception_and_utils/utils/math_utils.py b/perception_and_utils_root/perception_and_utils/utils/math_utils.py index 740e38d2..61be50c4 100644 --- a/perception_and_utils_root/perception_and_utils/utils/math_utils.py +++ b/perception_and_utils_root/perception_and_utils/utils/math_utils.py @@ -100,7 +100,7 @@ def get_running_avg_a_T_b( a_T_b_position_list.append(b_position) - # Ensure quaternion's w is always positive for effective averaging as multiple quaternions can represent the same rotation + # Ensure quaternion's w is always negative for effective averaging as multiple quaternions can represent the same rotation quat = Rotation.from_matrix(a_T_b.rotationMatrix()).as_quat() if quat[3] > 0: quat = -1.0 * quat diff --git a/spot_rl_experiments/configs/ros_frame_names.yaml b/spot_rl_experiments/configs/ros_frame_names.yaml index 3461933f..7aebaec2 100644 --- a/spot_rl_experiments/configs/ros_frame_names.yaml +++ b/spot_rl_experiments/configs/ros_frame_names.yaml @@ -4,7 +4,8 @@ SPOT: "spot" ARIA_WORLD: "ariaWorld" ARIA: "aria" # aria in cpf frame ARIA_DEVICE: "ariaDevice" # aria in device frame - +QUEST3_WORLD: "quest3World" +QUEST3_CAMERA: "camera" MARKER: "marker" From 8624e02bc4cfd4c6cf083c2119a60f206a9f013a Mon Sep 17 00:00:00 2001 From: Kavit Shah Date: Tue, 11 Jun 2024 15:54:32 -0700 Subject: [PATCH 2/7] LSC-Quest3 MVP --- .../aria_data_utils/rviz/spot-quest3.rviz | 147 ++++++++++ spot_rl_experiments/spot_rl/envs/lsc_hitl.py | 267 ++++++++++++++++++ 2 files changed, 414 insertions(+) create mode 100644 aria_data_loaders/aria_data_utils/rviz/spot-quest3.rviz create mode 100644 spot_rl_experiments/spot_rl/envs/lsc_hitl.py diff --git a/aria_data_loaders/aria_data_utils/rviz/spot-quest3.rviz b/aria_data_loaders/aria_data_utils/rviz/spot-quest3.rviz new file mode 100644 index 00000000..1960ddfb --- /dev/null +++ b/aria_data_loaders/aria_data_utils/rviz/spot-quest3.rviz @@ -0,0 +1,147 @@ +Panels: + - Class: rviz/Displays + Help Height: 78 + Name: Displays + Property Tree Widget: + Expanded: + - /Global Options1 + - /Status1 + - /TF1 + - /TF1/Frames1 + Splitter Ratio: 0.5 + Tree Height: 719 + - Class: rviz/Selection + Name: Selection + - Class: rviz/Tool Properties + Expanded: + - /2D Pose Estimate1 + - /2D Nav Goal1 + - /Publish Point1 + Name: Tool Properties + Splitter Ratio: 
0.5886790156364441 + - Class: rviz/Views + Expanded: + - /Current View1 + Name: Views + Splitter Ratio: 0.5 + - Class: rviz/Time + Name: Time + SyncMode: 0 + SyncSource: "" +Preferences: + PromptSaveOnExit: true +Toolbars: + toolButtonStyle: 2 +Visualization Manager: + Class: "" + Displays: + - Alpha: 0.5 + Cell Size: 1 + Class: rviz/Grid + Color: 160; 160; 164 + Enabled: true + Line Style: + Line Width: 0.029999999329447746 + Value: Lines + Name: Grid + Normal Cell Count: 0 + Offset: + X: 0 + Y: 0 + Z: 0 + Plane: XY + Plane Cell Count: 10 + Reference Frame: + Value: true + - Class: rviz/TF + Enabled: true + Filter (blacklist): "" + Filter (whitelist): "" + Frame Timeout: 15 + Frames: + All Enabled: false + map: + Value: false + spot: + Value: true + spotWorld: + Value: true + Marker Alpha: 1 + Marker Scale: 1 + Name: TF + Show Arrows: true + Show Axes: true + Show Names: true + Tree: + map: + spotWorld: + spot: + {} + Update Interval: 0 + Value: true + Enabled: true + Global Options: + Background Color: 48; 48; 48 + Default Light: true + Fixed Frame: map + Frame Rate: 30 + Name: root + Tools: + - Class: rviz/Interact + Hide Inactive Objects: true + - Class: rviz/MoveCamera + - Class: rviz/Select + - Class: rviz/FocusCamera + - Class: rviz/Measure + - Class: rviz/SetInitialPose + Theta std deviation: 0.2617993950843811 + Topic: /initialpose + X std deviation: 0.5 + Y std deviation: 0.5 + - Class: rviz/SetGoal + Topic: /move_base_simple/goal + - Class: rviz/PublishPoint + Single click: true + Topic: /clicked_point + Value: true + Views: + Current: + Class: rviz/Orbit + Distance: 2.778170108795166 + Enable Stereo Rendering: + Stereo Eye Separation: 0.05999999865889549 + Stereo Focal Distance: 1 + Swap Stereo Eyes: false + Value: false + Field of View: 0.7853981852531433 + Focal Point: + X: 0.4939827620983124 + Y: 0.11082470417022705 + Z: -0.30550795793533325 + Focal Shape Fixed Size: true + Focal Shape Size: 0.05000000074505806 + Invert Z Axis: false + Name: Current View + Near Clip Distance: 0.009999999776482582 + Pitch: 1.1453975439071655 + Target Frame: + Yaw: 6.2535834312438965 + Saved: ~ +Window Geometry: + Displays: + collapsed: false + Height: 1016 + Hide Left Dock: false + Hide Right Dock: true + QMainWindow State: 
000000ff00000000fd0000000400000000000001560000035afc0200000008fb0000001200530065006c0065006300740069006f006e00000001e10000009b0000005c00fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000a3fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000003d0000035a000000c900fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261000000010000010f000002b0fc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003d000002b0000000a400fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e10000019700000003000007800000003efc0100000002fb0000000800540069006d00650100000000000007800000033700fffffffb0000000800540069006d00650100000000000004500000000000000000000006240000035a00000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000 + Selection: + collapsed: false + Time: + collapsed: false + Tool Properties: + collapsed: false + Views: + collapsed: true + Width: 1920 + X: 0 + Y: 27 diff --git a/spot_rl_experiments/spot_rl/envs/lsc_hitl.py b/spot_rl_experiments/spot_rl/envs/lsc_hitl.py new file mode 100644 index 00000000..d2890ec3 --- /dev/null +++ b/spot_rl_experiments/spot_rl/envs/lsc_hitl.py @@ -0,0 +1,267 @@ +# Copyright (c) Meta Platforms, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
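+
+# Hedged sketch of the flow implemented below (names are the ones used in this file):
+#   1. Nav to a dock-observer waypoint, detect the QR marker with SpotQRDetector and
+#      broadcast marker w.r.t spotWorld so Quest3 camera poses become transformable.
+#   2. WhisperTranslator records and transcribes speech, RearrangeEasyChain parses it
+#      into (nav_1, pick, nav_2), and SentenceSimilarity snaps the nav targets onto
+#      the known waypoints from waypoints.yaml.
+#   3. Skills run as nav(nav_1) -> pick -> either nav(nav_2) + place, or, when
+#      nav_2 == "human", nav to the wearer via get_nav_xyt_to_wearer() and place at a
+#      fixed offset in Spot's local frame via get_place_xyz_to_wearer().
+#
+# Worked example for get_nav_xyt_to_wearer() with assumed numbers: if the wearer
+# stands at (2.0, 1.0) and the CPF z-axis projects onto spotWorld's XY plane as
+# (0, 1), then theta = arctan2(1, 0) = pi/2; the target is pushed shift_offset = 0.6 m
+# forward to (2.0, 1.6) and theta is flipped by pi, leaving Spot 0.6 m in front of the
+# wearer, facing back toward them.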
+ + +import os +import subprocess +import time +from collections import Counter +from typing import Any, Dict, Tuple + +import numpy as np +import rospy +import sophus as sp +from hydra import compose, initialize +from perception_and_utils.utils.conversions import ( + ros_PoseStamped_to_sophus_SE3, + ros_TransformStamped_to_sophus_SE3, + sophus_SE3_to_ros_PoseStamped, + sophus_SE3_to_ros_TransformStamped, + xyt_to_sophus_SE3, +) +from spot_rl.envs.skill_manager import SpotSkillManager +from spot_rl.llm.src.rearrange_llm import RearrangeEasyChain +from spot_rl.models.sentence_similarity import SentenceSimilarity +from spot_rl.utils.utils import construct_config, get_default_parser, get_waypoint_yaml +from spot_rl.utils.utils import ros_frames as rf +from spot_rl.utils.whisper_translator import WhisperTranslator +from spot_wrapper.spot import Spot, SpotCamIds +from spot_wrapper.spot_qr_detector import SpotQRDetector +from tf2_ros import ( + ConnectivityException, + ExtrapolationException, + LookupException, + StaticTransformBroadcaster, +) +from tf2_ros.buffer import Buffer +from tf2_ros.transform_listener import TransformListener + +DOCK_ID = int(os.environ.get("SPOT_DOCK_ID", 549)) + +# Waypoints on left of the dock to observe the QR code clearly +SPOT_DOCK_OBSERVER_WAYPOINT_LEFT = [ + 0.3840912007597151, + -0.5816728569741766, + 149.58030524832756, +] +# Waypoints on right of the dock to observe the QR code clearly +SPOT_DOCK_OBSERVER_WAYPOINT_RIGHT = [ + 0.5419185005639034, + 0.5319243891865243, + -154.1506754378722, +] + + +def get_nav_xyt_to_wearer( + pose: sp.SE3, shift_offset: float = 0.6 +) -> Tuple[float, float, float]: + """ + Converts Sophus SE3 aria current pose to Tuple of x, y, theta, then flips it by 180 degrees and shifts it by a given offset. + At the end, the final x, y, theta is such that the robot will face the wearer. + + Args: + aria_pose (sp.SE3): Sophus SE3 object for aria's current pose (as cpf frame) w.r.t spotWorld frame + + Returns: + Tuple[float, float, float]: Tuple of x,y,theta as floats representing nav target for robot + """ + # ARROW STARTS FROM ORIGIN + + # get position and rotation as x, y, theta + position = pose.translation() + # Find the angle made by CPF's z axis with spotWorld's x axis + # as robot should orient to the CPF's z axis. First 3 elements of + # column 3 from spotWorld_T_cpf represents cpf's z axis in spotWorld frame + cpf_z_axis_in_spotWorld = pose.matrix()[:3, 2] + x_component_of_z_axis = cpf_z_axis_in_spotWorld[0] + y_component_of_z_axis = cpf_z_axis_in_spotWorld[1] + rotation = float( + np.arctan2( + y_component_of_z_axis, + x_component_of_z_axis, + ) + ) # tan^-1(y/x) + x, y, theta = position[0], position[1], rotation + + # push fwd this point along theta + x += shift_offset * np.cos(theta) + y += shift_offset * np.sin(theta) + # rotate theta by pi + theta += np.pi + + return (x, y, theta) + + +def get_place_xyz_to_wearer(pose: sp.SE3) -> Tuple[float, float, float]: + """ + Returns the xyz coordinates of the place pose in spot's local frame. The x,y,z coordinates + are present in the local frame of the robot. 
Make sure to pass the "is_local" flag as true to skill_manager.place() + + Args: + aria_pose (sp.SE3): Sophus SE3 object for aria's current pose (as cpf frame) w.r.t spotWorld frame + + Returns: + Tuple[float, float, float]: Tuple of x,y,z as floats representing place pose in spot's local frame + """ + return ( + 0.7, + 0.0, + 0.2, + ) + + +def get_handoff_to_human_pose(tf_buffer: Buffer, source: str, target: str) -> sp.SE3: + while not rospy.is_shutdown() and not tf_buffer.can_transform( + target_frame=target, source_frame=source, time=rospy.Time() + ): + rospy.logwarn_throttle(5.0, f"Waiting for transform from {source} to {target}") + rospy.sleep(0.5) + try: + transform_stamped_spotWorld_T_camera = tf_buffer.lookup_transform( + target_frame=target, + source_frame=source, + time=rospy.Time(0), + ) + except (LookupException, ConnectivityException, ExtrapolationException): + raise RuntimeError(f"Unable to lookup transform from {source} to {target}") + target_T_source = ros_TransformStamped_to_sophus_SE3( + ros_trf_stamped=transform_stamped_spotWorld_T_camera + ) + return target_T_source + + +def main(spot, config): + + spot = SpotSkillManager(use_mobile_pick=True) + + vp = SPOT_DOCK_OBSERVER_WAYPOINT_RIGHT + status, msg = spot.nav( + vp[0], + vp[1], + np.deg2rad(vp[2]), + ) + if not status: + rospy.logerr( + f"Failed to navigate to spot dock observer waypoint. Error: {msg}. Exiting..." + ) + return + # Sit Spot down + spot.sit() + + cam_id = SpotCamIds.HAND_COLOR + spot_qr = SpotQRDetector(spot=spot.spot, cam_ids=[cam_id]) + avg_spotWorld_T_marker = spot_qr.get_avg_spotWorld_T_marker(cam_id=cam_id) + + # Publish marker w.r.t spotWorld transforms for 5 seconds so it can be seen in rviz + # Instantiate static transform broadcaster for publishing marker w.r.t spotWorld transforms + static_tf_broadcaster = StaticTransformBroadcaster() + tf_buffer = Buffer() + _ = TransformListener(tf_buffer) + start_time = rospy.Time.now() + while rospy.Time.now() - start_time < rospy.Duration(2.0): + static_tf_broadcaster.sendTransform( + sophus_SE3_to_ros_TransformStamped( + sp_se3=avg_spotWorld_T_marker, + parent_frame=rf.SPOT_WORLD, + child_frame=rf.MARKER, + ) + ) + rospy.sleep(0.5) + + # Reset the viz params + rospy.set_param("/viz_pick", "None") + rospy.set_param("/viz_object", "None") + rospy.set_param("/viz_place", "None") + + # Check if robot should return to base + return_to_base = config.RETURN_TO_BASE + + # Get the waypoints from waypoints.yaml + waypoints_yaml_dict = get_waypoint_yaml() + + audio_to_text = WhisperTranslator() + sentence_similarity = SentenceSimilarity() + with initialize(config_path="../llm/src/conf"): + llm_config = compose(config_name="config") + llm = RearrangeEasyChain(llm_config) + + print( + "I am ready to take instructions!\n Sample Instructions : take the rubik cube from the dining table to the hamper" + ) + print("-" * 100) + pre_in_dock = False + while True: + audio_transcription_success = False + while not audio_transcription_success: + # try: + input("Are you ready?") + audio_to_text.record() + instruction = audio_to_text.translate() + print("Transcribed instructions : ", instruction) + + # Use LLM to convert user input to an instructions set + # Eg: nav_1, pick, nav_2 = 'bowl_counter', "container", 'coffee_counter' + t1 = time.time() + nav_1, pick, nav_2, _ = llm.parse_instructions(instruction) + print("PARSED", nav_1, pick, nav_2, f"Time {time.time()- t1} secs") + + # Find closest nav_targets to the ones robot knows locations of + nav_1 = 
sentence_similarity.get_most_similar_in_list( + nav_1, list(waypoints_yaml_dict["nav_targets"].keys()) + ) + nav_2 = sentence_similarity.get_most_similar_in_list( + nav_2, list(waypoints_yaml_dict["nav_targets"].keys()) + ) + print("MOST SIMILAR: ", nav_1, pick, nav_2) + audio_transcription_success = True + # except Exception as e: + # print(f"Exception encountered in Speech to text : {e} \n\n Retrying...") + # Used for Owlvit + rospy.set_param("object_target", pick) + + # Used for Visualizations + rospy.set_param("viz_pick", nav_1) + rospy.set_param("viz_object", pick) + rospy.set_param("viz_place", nav_2) + + if pre_in_dock: + spot.spot.power_robot() + + # Do LSC by calling skills + slow_down = 0.5 # slow down time to increase the skill execution stability + spot.nav(nav_1) + time.sleep(slow_down) + spot.pick(pick) + time.sleep(slow_down) + if nav_2 == "human": + human_pose = get_handoff_to_human_pose( + tf_buffer, source="camera", target="spotWorld" + ) + handoff_xyt = get_nav_xyt_to_wearer(human_pose) + spot.nav(handoff_xyt[0], handoff_xyt[1], handoff_xyt[2]) + time.sleep(slow_down) + + handoff_xyz = get_place_xyz_to_wearer(human_pose) + spot.place(handoff_xyz[0], handoff_xyz[1], handoff_xyz[2], is_local=True) + # spot.place( is_local=True, visualization=True) + time.sleep(slow_down) + else: + spot.nav(nav_2) + time.sleep(slow_down) + spot.place(nav_2) + time.sleep(slow_down) + if return_to_base: + spot.dock() + pre_in_dock = True + + +if __name__ == "__main__": + parser = get_default_parser() + parser.add_argument("-m", "--use-mixer", action="store_true") + parser.add_argument("--output") + args = parser.parse_args() + config = construct_config(opts=args.opts) + + spot = Spot("LSC_HITLNode") + main(None, config) From c16acc15864d1aa1f889e86546b52291413a9003 Mon Sep 17 00:00:00 2001 From: Kavit Shah Date: Tue, 2 Jul 2024 18:04:54 -0700 Subject: [PATCH 3/7] Review these changes --- .../episodic_memory_robotic_fetch_quest3_c.py | 138 +++++++++--------- spot_rl_experiments/spot_rl/envs/lsc_hitl.py | 5 + 2 files changed, 73 insertions(+), 70 deletions(-) diff --git a/aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py b/aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py index 182beb0d..4bb37c1e 100644 --- a/aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py +++ b/aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py @@ -85,54 +85,54 @@ class EpisodicMemoryRoboticFetchQuest3: Warning: Do not create objects of this class directly. It is a standalone node """ - def __init__(self, spot: Spot, verbose: bool = True, use_policies: bool = True): - # @FIXME: This is initializing a ros-node silently (at core of inheritence in - # SpotRobotSubscriberMixin). Make it explicit - self.skill_manager = SpotSkillManager( - spot=spot, - verbose=verbose, - use_policies=use_policies, - ) + def __init__(self): + # # @FIXME: This is initializing a ros-node silently (at core of inheritence in + # # SpotRobotSubscriberMixin). Make it explicit + # self.skill_manager = SpotSkillManager( + # spot=spot, + # verbose=verbose, + # use_policies=use_policies, + # ) # Navigate Spot to SPOT_DOCK_OBSERVER_WAYPOINT_LEFT so that it can see the marker - status, msg = self.skill_manager.nav( - SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[0], - SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[1], - np.deg2rad(SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[2]), - ) - if not status: - rospy.logerr( - f"Failed to navigate to spot dock observer waypoint. Error: {msg}. Exiting..." 
- ) - return - - # Sit Spot down - self.skill_manager.sit() - - # Instantiate static transform broadcaster for publishing marker w.r.t spotWorld transforms - self.static_tf_broadcaster = StaticTransformBroadcaster() - - self._spot_fetch_trigger_sub = rospy.Subscriber( - "/spot_fetch_trigger", Bool, self.spot_fetch_trigger_callback - ) - self._spot_fetch_flag = False + # status, msg = self.skill_manager.nav( + # SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[0], + # SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[1], + # np.deg2rad(SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[2]), + # ) + # if not status: + # rospy.logerr( + # f"Failed to navigate to spot dock observer waypoint. Error: {msg}. Exiting..." + # ) + # return + + # # Sit Spot down + # self.skill_manager.sit() + + # # Instantiate static transform broadcaster for publishing marker w.r.t spotWorld transforms + # self.static_tf_broadcaster = StaticTransformBroadcaster() + + # self._spot_fetch_trigger_sub = rospy.Subscriber( + # "/spot_fetch_trigger", Bool, self.spot_fetch_trigger_callback + # ) + # self._spot_fetch_flag = False # Detect the marker and get the average pose of marker w.r.t spotWorld frame - cam_id = SpotCamIds.HAND_COLOR - spot_qr = SpotQRDetector(spot=spot, cam_ids=[cam_id]) - avg_spotWorld_T_marker = spot_qr.get_avg_spotWorld_T_marker(cam_id=cam_id) - - # Publish marker w.r.t spotWorld transforms for 5 seconds so it can be seen in rviz - start_time = rospy.Time.now() - while rospy.Time.now() - start_time < rospy.Duration(2.0): - self.static_tf_broadcaster.sendTransform( - sophus_SE3_to_ros_TransformStamped( - sp_se3=avg_spotWorld_T_marker, - parent_frame=rf.SPOT_WORLD, - child_frame=rf.MARKER, - ) - ) - rospy.sleep(0.5) + # cam_id = SpotCamIds.HAND_COLOR + # spot_qr = SpotQRDetector(spot=spot, cam_ids=[cam_id]) + # avg_spotWorld_T_marker = spot_qr.get_avg_spotWorld_T_marker(cam_id=cam_id) + + # # Publish marker w.r.t spotWorld transforms for 5 seconds so it can be seen in rviz + # start_time = rospy.Time.now() + # while rospy.Time.now() - start_time < rospy.Duration(2.0): + # self.static_tf_broadcaster.sendTransform( + # sophus_SE3_to_ros_TransformStamped( + # sp_se3=avg_spotWorld_T_marker, + # parent_frame=rf.SPOT_WORLD, + # child_frame=rf.MARKER, + # ) + # ) + # rospy.sleep(0.5) # Instantiate publisher for publishing go-to pose for handoff self._nav_PoseStamped_pub_for_place = rospy.Publisher( @@ -145,7 +145,7 @@ def __init__(self, spot: Spot, verbose: bool = True, use_policies: bool = True): self._tf_listener = TransformListener(self._tf_buffer) # Wait for transform from camera to spotWorld to be available - while not rospy.is_shutdown() and not self._spot_fetch_flag: + while not rospy.is_shutdown(): while not rospy.is_shutdown() and not self._tf_buffer.can_transform( target_frame=rf.SPOT_WORLD, source_frame="camera", time=rospy.Time() ): @@ -178,24 +178,24 @@ def __init__(self, spot: Spot, verbose: bool = True, use_policies: bool = True): ) ) - rospy.logwarn_throttle( - 5.0, "Waiting for demo trigger on /spot_fetch_trigger topic" - ) + # rospy.logwarn_throttle( + # 5.0, "Waiting for demo trigger on /spot_fetch_trigger topic" + # ) rospy.sleep(1.0) # Set run status to true in the beginning - run_status = True + # run_status = True - if run_status: - # Nav to x,y,theta of interest (Aria's cpf frame when it saw the object) - nav_xyt_for_handoff = self.nav_xyt_for_handoff - rospy.loginfo(f"Nav 2D location for pick: {nav_xyt_for_handoff}") + # if run_status: + # # Nav to x,y,theta of interest (Aria's cpf frame when it saw the object) + # nav_xyt_for_handoff 
= self.nav_xyt_for_handoff + # rospy.loginfo(f"Nav 2D location for pick: {nav_xyt_for_handoff}") - self.skill_manager.nav( - nav_xyt_for_handoff[0], nav_xyt_for_handoff[1], nav_xyt_for_handoff[2] - ) + # self.skill_manager.nav( + # nav_xyt_for_handoff[0], nav_xyt_for_handoff[1], nav_xyt_for_handoff[2] + # ) - self.skill_manager.sit() + # self.skill_manager.sit() def spot_fetch_trigger_callback(self, msg): """ @@ -253,19 +253,17 @@ def get_nav_xyt_to_wearer( @click.option("--verbose", type=bool, default=True) @click.option("--use-policies", type=bool, default=False) def main(verbose: bool, use_policies: bool): - # rospy.init_node("episode") - # rospy.logwarn("Starting up ROS node") - - spot = Spot("EpisodicMemoryRoboticFetchQuest3Node") - with spot.get_lease(hijack=True) as lease: - if lease is None: - raise RuntimeError("Could not get lease") - else: - rospy.loginfo("Acquired lease") - - _ = EpisodicMemoryRoboticFetchQuest3( - spot=spot, verbose=verbose, use_policies=use_policies - ) + rospy.init_node("episode") + rospy.logwarn("Starting up ROS node") + + # spot = Spot("EpisodicMemoryRoboticFetchQuest3Node") + # with spot.get_lease(hijack=True) as lease: + # if lease is None: + # raise RuntimeError("Could not get lease") + # else: + # rospy.loginfo("Acquired lease") + + _ = EpisodicMemoryRoboticFetchQuest3() if __name__ == "__main__": diff --git a/spot_rl_experiments/spot_rl/envs/lsc_hitl.py b/spot_rl_experiments/spot_rl/envs/lsc_hitl.py index d2890ec3..1ae7e961 100644 --- a/spot_rl_experiments/spot_rl/envs/lsc_hitl.py +++ b/spot_rl_experiments/spot_rl/envs/lsc_hitl.py @@ -12,6 +12,7 @@ import numpy as np import rospy import sophus as sp +from geometry_msgs.msg import PoseStamped from hydra import compose, initialize from perception_and_utils.utils.conversions import ( ros_PoseStamped_to_sophus_SE3, @@ -153,6 +154,9 @@ def main(spot, config): spot_qr = SpotQRDetector(spot=spot.spot, cam_ids=[cam_id]) avg_spotWorld_T_marker = spot_qr.get_avg_spotWorld_T_marker(cam_id=cam_id) + nav_PoseStamped_pub_for_place = rospy.Publisher( + "/nav_pose_for_place_viz", PoseStamped, queue_size=10 + ) # Publish marker w.r.t spotWorld transforms for 5 seconds so it can be seen in rviz # Instantiate static transform broadcaster for publishing marker w.r.t spotWorld transforms static_tf_broadcaster = StaticTransformBroadcaster() @@ -228,6 +232,7 @@ def main(spot, config): if pre_in_dock: spot.spot.power_robot() + input("Start LSC-HITL?") # Do LSC by calling skills slow_down = 0.5 # slow down time to increase the skill execution stability spot.nav(nav_1) From c1929ccddfc40ff176de8368021f4ab13d83c9b4 Mon Sep 17 00:00:00 2001 From: Kavit Shah Date: Thu, 25 Jul 2024 15:52:34 -0700 Subject: [PATCH 4/7] Q3Streamer - Laptop : WIP --- .gitignore | 2 + .../aria_data_utils/aria_live_streamer.py | 5 +- .../aria_data_utils/aria_sdk_utils.py | 17 +- .../human_sensor_data_streamer_interface.py | 225 ++++++++++++ .../aria_data_utils/quest3_data_streamer.py | 332 ++++++++++++++++++ .../unified_quest_camera_interface.py | 59 ++++ .../detector_wrappers/april_tag_detector.py | 19 +- .../generic_detector_interface.py | 4 +- .../human_motion_detector.py | 95 +++++ .../detector_wrappers/object_detector.py | 11 +- .../perception_and_utils/utils/conversions.py | 5 + .../perception_and_utils/utils/data_frame.py | 20 ++ .../perception_and_utils/utils/math_utils.py | 16 +- perception_and_utils_root/setup.py | 2 +- .../configs/ros_frame_names.yaml | 1 + 15 files changed, 789 insertions(+), 24 deletions(-) create mode 100644 
aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py create mode 100644 aria_data_loaders/aria_data_utils/quest3_data_streamer.py create mode 100644 aria_data_loaders/aria_data_utils/unified_quest_camera_interface.py create mode 100644 perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py create mode 100644 perception_and_utils_root/perception_and_utils/utils/data_frame.py diff --git a/.gitignore b/.gitignore index 40c00829..14974780 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ intel_realsense_payload_for_spotsim2real/dist/* intel_realsense_payload_for_spotsim2real/build/* +aria_data_loaders/aria_data_utils/quest3_streamer/** + bd_spot_wrapper/spot_wrapper.egg-info/** bd_spot_wrapper/spot_wrapper/home.txt spot_rl_experiments/spot_rl.egg-info/** diff --git a/aria_data_loaders/aria_data_utils/aria_live_streamer.py b/aria_data_loaders/aria_data_utils/aria_live_streamer.py index 5d6355ce..cad2f9c8 100644 --- a/aria_data_loaders/aria_data_utils/aria_live_streamer.py +++ b/aria_data_loaders/aria_data_utils/aria_live_streamer.py @@ -19,7 +19,7 @@ except Exception as e: print(f"Cannot import sophuspy due to {e}. Import sophus instead") import sophus as sp -from aria_data_utils.aria_sdk_utils import update_iptables +from aria_data_utils.aria_sdk_utils import update_iptables_aria from geometry_msgs.msg import Pose, PoseStamped from nav_msgs.msg import Odometry from perception_and_utils.perception.detector_wrappers.april_tag_detector import ( @@ -85,7 +85,6 @@ class AriaLiveReader: """ def __init__(self, verbose: bool = False) -> None: - super().__init__() self.verbose = verbose self._in_index = 0 self._out_index = 0 @@ -576,7 +575,7 @@ def main(do_update_iptables: bool, read_only: bool, debug: bool, hz: int): rospy.init_node("aria_live_reader", log_level=_log_level) rospy.logwarn("Starting up ROS node") if do_update_iptables: - update_iptables() + update_iptables_aria() else: rospy.logwarn("Not updating iptables") object_name = "bottle" diff --git a/aria_data_loaders/aria_data_utils/aria_sdk_utils.py b/aria_data_loaders/aria_data_utils/aria_sdk_utils.py index ab7f2ef1..a58fd5ba 100644 --- a/aria_data_loaders/aria_data_utils/aria_sdk_utils.py +++ b/aria_data_loaders/aria_data_utils/aria_sdk_utils.py @@ -15,19 +15,19 @@ import subprocess -def update_iptables() -> None: +def update_iptables(protocol:str="tcp", direction:str="INPUT") -> None: """ - Update firewall to permit incoming UDP connections for DDS + Update firewall to permit incoming tcp / udp connections for DDS """ update_iptables_cmd = [ "sudo", "iptables", "-A", - "INPUT", + direction, "-p", - "udp", + protocol, "-m", - "udp", + protocol, "--dport", "7000:8000", "-j", @@ -36,3 +36,10 @@ def update_iptables() -> None: print("Running the following command to update iptables:") print(update_iptables_cmd) subprocess.run(update_iptables_cmd) + +def update_iptables_aria(): + update_iptables(protocol="udp", direction="INPUT") + +def update_iptables_quest3(): + update_iptables(protocol="tcp", direction="INPUT") + update_iptables(protocol="tcp", direction="OUTPUT") diff --git a/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py b/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py new file mode 100644 index 00000000..d6642499 --- /dev/null +++ b/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py @@ -0,0 +1,225 @@ +# Copyright (c) Meta Platforms, Inc. and its affiliates. 
+# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import os +from typing import Any, Dict, List, Optional, Tuple + +import click +import cv2 +import numpy as np +import rospy +import sophus as sp +from perception_and_utils.perception.detector_wrappers.april_tag_detector import ( + AprilTagDetectorWrapper, +) +from perception_and_utils.perception.detector_wrappers.object_detector import ( + ObjectDetectorWrapper, +) +from perception_and_utils.perception.detector_wrappers.human_motion_detector import ( + HumanMotionDetector, +) + +from spot_rl.utils.utils import ros_frames as rf + +# ROS imports +from geometry_msgs.msg import Pose, PoseStamped +from nav_msgs.msg import Odometry +from tf2_ros import StaticTransformBroadcaster +from visualization_msgs.msg import Marker + +from perception_and_utils.utils.data_frame import DataFrame + +class CameraParams: + """ + CameraParams class to store camera parameters like focal length, principal point etc. + It stores the image, timestamp and frame number of each received image + """ + def __init__(self, focal_lengths: Tuple[float, float], principal_point: Tuple[float, float]): + self._focal_lengths = focal_lengths # type: Optional[Tuple[float, float]] + self._principal_point = principal_point # type: Optional[Tuple[float, float]] + +class HumanSensorDataStreamerInterface: + def __init__(self, verbose: bool = False) -> None: + self.verbose = verbose + + self._is_connected = False # TODO : Maybe not needed + + # TODO: Update this later with PerceptionPipeline + # Create all detectors + self.april_tag_detector = AprilTagDetectorWrapper() + self.object_detector = ObjectDetectorWrapper() + self.human_motion_detector = HumanMotionDetector() + + # ROS publishers & broadcaster + self.static_tf_broadcaster = StaticTransformBroadcaster() + + # self.pose_of_interest_publisher = rospy.Publisher( + # "/pose_of_interest", PoseStamped, queue_size=10 + # ) # pose of device when object was detected from rgb image + # self.pose_of_interest_marker_publisher = rospy.Publisher( + # "/pose_of_interest_marker", Marker, queue_size=10 + # ) + # self.odom_publisher = rospy.Publisher( + # "/odom", Odometry, queue_size=10 + # ) + # self.human_pose_publisher = rospy.Publisher( + # "/current_pose", PoseStamped, queue_size=10 + # ) + # self.human_handoff_pose_publisher = rospy.Publisher( + # "/human_handoff_pose", PoseStamped, queue_size=10 + # ) + + # TODO: Define DEVICE FRAME as a visual frame of reference i.e. 
front facing frame of reference + self.device_T_camera: Optional[sp.SE3] = None # TODO: Should be initialized by implementations of this class + + # Maintain a list of all poses where qr code is detected (w.r.t deviceWorld) + self.marker_positions_list = ( + [] + ) # type: List[np.ndarray] # List of position as np.ndarray (x, y, z) + self.marker_quaternion_list = ( + [] + ) # type: List[np.ndarray] # List of quaternions as np.ndarray (x, y, z, w) + self.avg_deviceWorld_T_marker = None # type: Optional[sp.SE3] + + def connect(self): + """ + Connect to Device + """ + raise NotImplementedError + + def disconnect(self): + """ + Disconnects from Device cleanly + """ + raise NotImplementedError + + def _setup_device(self) -> Any: + """ + Setup device with appropriate configurations for live streaming + + Updates: + - self.device_T_camera - Sophus SE3 transform from device frame (left-slam camera) to rgb-camera frame + + Returns: + Any : TODO: Decide what to return + """ + raise NotImplementedError + + def get_latest_data_frame(self) -> Optional[DataFrame]: + raise NotImplementedError + + def process_frame( + self, + frame: dict, + outputs: dict, + detect_qr: bool = False, + detect_objects: bool = False, + object_label="milk bottle", + ) -> Tuple[np.ndarray, dict]: + raise NotImplementedError + + def initialize_april_tag_detector(self, outputs: dict = {}): + """ + Initialize the april tag detector + + Args: + outputs (dict, optional): Dictionary of outputs from the april tag detector. Defaults to {}. + + Updates: + - self.april_tag_detector: AprilTagDetectorWrapper object + + Returns: + outputs (dict): Dictionary of outputs from the april tag detector with following keys: + - "tag_image_list" - List of np.ndarrays of images with detections + - "tag_image_metadata_list" - List of image metadata + - "tag_base_T_marker_list" - List of Sophus SE3 transforms from base frame to marker + where base is "device" frame for aria + """ + raise NotImplementedError + + def initialize_object_detector( + self, outputs: dict = {}, object_labels: list = [], meta_objects: List[str] = [] + ): + """ + Initialize the object detector + + Args: + outputs (dict, optional): Dictionary of outputs from the object detector. Defaults to {}. + object_labels (list, optional): List of object labels to detect. Defaults to []. 
+ meta_objects (List[str], optional): List of other objects to be detected in the image + apart from the object_labels; meta_objects are then + used internally to detect intersection of bbox with + objects of interest (eg: hand with bottle) + + Updates: + - self.object_detector: ObjectDetectorWrapper object + + Returns: + outputs (dict): Dictionary of outputs from the object detector with following keys: + - "object_image_list" - List of np.ndarrays of images with detections + - "object_image_metadata_list" - List of image metadata + - "object_image_segment" - List of Int signifying which segment the image + belongs to; smaller number means latter the segment time-wise + - "object_score_list" - List of Float signifying the detection score + """ + raise NotImplementedError + + def initialize_human_motion_detector(self, outputs: dict = {}): + """ + """ + raise NotImplementedError + + ##### ROS Publishers ##### + def publish_human_pose(self, data_frame: DataFrame): + """ + Publishes current pose of Device as a pose of interest for Spot + + Args: + data_frame (DataFrame): DataFrame object containing data packet with rgb, depth + and all necessary transformations + + Publishes: + - /human_pose_publisher: ROS PoseStamped message for quest3World_T_rgb # TODO: Update this from rgb to VIZ frame + """ + # Publish as pose for detail + pose_stamped = PoseStamped() + pose_stamped.header.stamp = rospy.Time().now() + pose_stamped.header.seq = data_frame._frame_number + pose_stamped.header.frame_id = rf.QUEST3_CAMERA + pose_stamped.pose = data_frame._deviceWorld_T_camera_rgb # TODO: Use a VIZ frame which is forward facing + self.human_pose_publisher.publish(pose_stamped) + + def publish_marker(self, data_frame: DataFrame, marker_scale: float=0.1): + """ + Publishes marker at pose of interest + + Args: + data_frame (DataFrame): DataFrame object containing data packet with rgb, depth + and all necessary transformations + + Publishes: + - /pose_of_interest_marker_publisher: ROS PoseStamped message for quest3World_T_rgb # TODO: Update this from rgb to VIZ frame + """ + # Publish as marker for interpretability + marker = Marker() + marker.header.stamp = rospy.Time().now() + marker.header.frame_id = rf.rf.QUEST3_CAMERA + marker.id = self._frame_number + + marker.type = Marker.SPHERE + marker.action = Marker.ADD + + marker.pose = data_frame._deviceWorld_T_camera_rgb + + marker.scale.x = marker_scale + marker.scale.y = marker_scale + marker.scale.z = marker_scale + + marker.color.a = 1.0 + marker.color.r = 0.45 + marker.color.g = 0.95 + marker.color.b = 0.2 + + self.pose_of_interest_marker_publisher.publish(marker) diff --git a/aria_data_loaders/aria_data_utils/quest3_data_streamer.py b/aria_data_loaders/aria_data_utils/quest3_data_streamer.py new file mode 100644 index 00000000..d7d8c122 --- /dev/null +++ b/aria_data_loaders/aria_data_utils/quest3_data_streamer.py @@ -0,0 +1,332 @@ +from aria_data_utils.human_sensor_data_streamer_interface import CameraParams, DataFrame, HumanSensorDataStreamerInterface +from aria_data_utils.aria_sdk_utils import update_iptables_quest3 +from perception_and_utils.utils.math_utils import get_running_avg_a_T_b +from quest3_streamer.unified_quest_camera import UnifiedQuestCamera # From internal repo + +import rospy +import numpy as np +import sophus as sp +import cv2 +from typing import Any, Dict, Optional, Tuple, List +import click +import logging +import time +from perception_and_utils.utils.conversions import ( + sophus_SE3_to_ros_PoseStamped, + 
sophus_SE3_to_ros_TransformStamped, + xyt_to_sophus_SE3, +) +from spot_rl.utils.utils import ros_frames as rf + +class Quest3DataStreamer(HumanSensorDataStreamerInterface): + def __init__(self, verbose: bool = False) -> None: + super().__init__(verbose) + + # Init Logging + self.logger = logging.getLogger("Quest3DataStreamer") + self.logger.setLevel(logging.INFO) + handler = logging.StreamHandler() + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + handler.setFormatter(formatter) + self.logger.addHandler(handler) + + self.unified_quest3_camera = UnifiedQuestCamera() + self.rgb_cam_params : CameraParams = None + self.depth_cam_params : CameraParams = None + self._frame_number = -1 + self._setup_device() + + def connect(self): + """ + Connect to Device + """ + self._is_connected = True + self.logger("Quest3DataStreamer :: Connected") + + def disconnect(self): + """ + Disconnects from Device cleanly + """ + self._is_connected = False + self.logger("Quest3DataStreamer :: Disconnected") + + def is_connected(self) -> bool: + return self._is_connected # TODO: Return device connection status from Bruno's APIs + + def _setup_device(self) -> Any: + # Setup RGB camera params + while self.rgb_cam_params is None: + rgb_fl = self.unified_quest3_camera.get_rbg_focal_lengths() + rgb_pp = self.unified_quest3_camera.get_rgb_principal_point() + + if rgb_fl is not None and rgb_pp is not None: + rospy.logwarn("RGB Camera Params: focal lengths = {}, principal point = {}".format(rgb_fl, rgb_pp)) + self.rgb_cam_params = CameraParams(focal_lengths=rgb_fl, principal_point=rgb_pp) + else: + self.logger.info("Waiting for RGB camera params to be set...") + rospy.logwarn("Waiting for RGB camera params to be set...") + time.sleep(0.1) + rospy.loginfo("RGB Camera Params: focal lengths = {}, principal point = {}".format(self.rgb_cam_params._focal_lengths, self.rgb_cam_params._principal_point)) + + # Setup Depth camera params + while self.depth_cam_params is None: + depth_fl = self.unified_quest3_camera.get_depth_focal_lengths() + depth_pp = self.unified_quest3_camera.get_depth_principal_point() + + if depth_fl is not None and depth_pp is not None: + rospy.logwarn("Depth Camera Params: focal lengths = {}, principal point = {}".format(depth_fl, depth_pp)) + self.depth_cam_params = CameraParams(focal_lengths=depth_fl, principal_point=depth_pp) + else: + self.logger.info("Waiting for Depth camera params to be set...") + rospy.logwarn("Waiting for Depth camera params to be set...") + time.sleep(0.1) + print("Depth Camera Params: focal lengths = {}, principal point = {}".format(self.depth_cam_params._focal_lengths, self.depth_cam_params._principal_point)) + + def get_latest_data_frame(self) -> Optional[DataFrame]: + # Create DataFrame object + data_frame = DataFrame() + + # Populate rgb frame data iff all the required data is available; else return None + rgb_frame = self.unified_quest3_camera.get_rgb() + deviceWorld_T_rgbCam = self.unified_quest3_camera.get_deviceWorld_T_rgbCamera() + device_T_rgbCam = self.unified_quest3_camera.get_device_T_rgbCamera() + if rgb_frame is None or deviceWorld_T_rgbCam is None or device_T_rgbCam is None: + rospy.logwarn("Returning None, as rgb_frame or deviceWorld_T_rgbCam or device_T_rgbCam is None.") + return None + else: + data_frame._rgb_frame = rgb_frame + data_frame._deviceWorld_T_camera_rgb = deviceWorld_T_rgbCam + data_frame._device_T_camera_rgb = device_T_rgbCam + + # Populate depth frame data iff all the required data is available; else return None + 
depth_frame = self.unified_quest3_camera.get_depth() + deviceWorld_T_depthCam = self.unified_quest3_camera.get_deviceWorld_T_depthCamera() + device_T_depthCam = self.unified_quest3_camera.get_device_T_depthCamera() + if depth_frame is None or deviceWorld_T_depthCam is None: + rospy.logwarn("Returning None as depth_frame or deviceWorld_T_depthCam or device_T_depthCam is None.") + return None + else: + data_frame._depth_frame = depth_frame + data_frame._deviceWorld_T_camera_depth = deviceWorld_T_depthCam + data_frame._device_T_camera_depth = device_T_depthCam + + # Update frame number and timestamp of DataFrame + self._frame_number += 1 + data_frame._frame_number = self._frame_number + data_frame._timestamp_s = time.time() + + data_frame._avg_rgb_fps = self.unified_quest3_camera.get_avg_fps_rgb() + data_frame._avg_depth_fps = self.unified_quest3_camera.get_avg_fps_depth() + + # Align depth frame with rgb frame + data_frame._aligned_depth_frame = None # TODO: Add this logic + return data_frame + + def process_frame( + self, + data_frame: DataFrame, + outputs: dict, + detect_qr: bool = False, + detect_objects: bool = False, + detect_human_motion: bool = False, + object_label="milk bottle", + ) -> Tuple[np.ndarray, dict]: + # Initialize object_scores to empty dict for current image frame + # object_scores = {} # type: Dict[str, Any] + + # Get rgb image from frame + viz_img = data_frame._rgb_frame + if viz_img is None: + rospy.logwarn("No image found in frame") + + if detect_qr: + (viz_img, camera_T_marker) = self.april_tag_detector.process_frame(frame=data_frame) # type: ignore + + deviceWorld_T_camera = data_frame._deviceWorld_T_camera_rgb + # Broadcast camera frame wrt deviceWorld + self.static_tf_broadcaster.sendTransform( + sophus_SE3_to_ros_TransformStamped( + sp_se3=deviceWorld_T_camera, + parent_frame=rf.QUEST3_WORLD, + child_frame=rf.QUEST3_CAMERA, + ) + ) + + if camera_T_marker is not None: + # Frame: a = deviceWorld (quest3World for Quest3 & ariaWorld for Aria) + # Frame: intermediate = camera (quest3's rgb camera frame & cpf frame for Aria) + # Frame: b = marker / qr code + ( + self.marker_positions_list, + self.marker_quaternion_list, + self.avg_deviceWorld_T_marker, + ) = get_running_avg_a_T_b( + current_avg_a_T_b=self.avg_deviceWorld_T_marker, + a_T_b_position_list=self.marker_positions_list, + a_T_b_quaternion_list=self.marker_quaternion_list, + a_T_intermediate=deviceWorld_T_camera, + intermediate_T_b=camera_T_marker, + ) + + # Publish marker pose in ariaWorld frame + if self.avg_deviceWorld_T_marker is not None: + avg_marker_T_deviceWorld = self.avg_deviceWorld_T_marker.inverse() + self.static_tf_broadcaster.sendTransform( + sophus_SE3_to_ros_TransformStamped( + sp_se3=avg_marker_T_deviceWorld, + parent_frame=rf.MARKER, + child_frame=rf.QUEST3_WORLD, + ) + ) + + viz_img, outputs = self.april_tag_detector.get_outputs( + frame=data_frame, + outputs=outputs, + base_T_marker=camera_T_marker, + timestamp=data_frame._timestamp_s, + img_metadata=None, + viz_img=viz_img, + ) + + if detect_human_motion: + _, activity_str = self.human_motion_detector.process_frame(frame=data_frame) + viz_img, _ = self.human_motion_detector.get_outputs(viz_img, {"activity": activity_str}) + print(f"Activity: {activity_str}") + rospy.set_param("human_activity", activity_str) + + # if detect_objects: + # viz_img, object_scores = self.object_detector.process_frame(viz_img) + # if object_label in object_scores.keys(): + # if self.verbose: + # plt.imsave(f"frame_{self.frame_number}.jpg", viz_img) + # # 
self.publish_pose_of_interest(frame.get("ros_pose")) + + return viz_img, outputs + + def initialize_april_tag_detector(self, outputs: dict = {}): + """ + Initialize the april tag detector + + Args: + outputs (dict, optional): Dictionary of outputs from the april tag detector. Defaults to {}. + + Updates: + - self.april_tag_detector: AprilTagDetectorWrapper object + + Returns: + outputs (dict): Dictionary of outputs from the april tag detector with following keys: + - "tag_image_list" - List of np.ndarrays of images with detections + - "tag_image_metadata_list" - List of image metadata + - "tag_base_T_marker_list" - List of Sophus SE3 transforms from base frame to marker + where base is "device" frame for aria + """ + + focal_lengths = self.rgb_cam_params._focal_lengths # type: ignore + principal_point = self.rgb_cam_params._principal_point # type: ignore + + outputs.update( + self.april_tag_detector._init_april_tag_detector( + focal_lengths=focal_lengths, principal_point=principal_point + ) + ) + return outputs + + def initialize_object_detector( + self, outputs: dict = {}, object_labels: list = [], meta_objects: List[str] = [] + ): + """ + Initialize the object detector + + Args: + outputs (dict, optional): Dictionary of outputs from the object detector. Defaults to {}. + object_labels (list, optional): List of object labels to detect. Defaults to []. + meta_objects (List[str], optional): List of other objects to be detected in the image + apart from the object_labels; meta_objects are then + used internally to detect intersection of bbox with + objects of interest (eg: hand with bottle) + + Updates: + - self.object_detector: ObjectDetectorWrapper object + + Returns: + outputs (dict): Dictionary of outputs from the object detector with following keys: + - "object_image_list" - List of np.ndarrays of images with detections + - "object_image_metadata_list" - List of image metadata + - "object_image_segment" - List of Int signifying which segment the image + belongs to; smaller number means latter the segment time-wise + - "object_score_list" - List of Float signifying the detection score + """ + outputs.update( + self.object_detector._init_object_detector( + object_labels + meta_objects, + verbose=self.verbose, + version=2, + ) + ) + self.object_detector._core_objects = object_labels + self.object_detector._meta_objects = meta_objects + + return outputs + + def initialize_human_motion_detector(self): + self.human_motion_detector._init_human_motion_detector() + + +@click.command() +@click.option("--do-update-iptables", is_flag=True, type=bool, default=False) +@click.option("--debug", is_flag=True, default=False) +@click.option("--hz", type=int, default=100) +def main(do_update_iptables:bool, debug:bool, hz:int): + if debug: + _log_level = rospy.DEBUG + else: + _log_level = rospy.INFO + + rospy.init_node("quest3_data_streamer", log_level=_log_level) + rospy.loginfo("Starting quest3_data_streamer node") + + if do_update_iptables: + update_iptables_quest3() + else: + rospy.logwarn("Not updating iptables") + + rate = rospy.Rate(hz) + outputs: Dict[str, Any] = {} + data_streamer = None + cv2.namedWindow("Image", cv2.WINDOW_NORMAL) + try: + data_streamer = Quest3DataStreamer() + + outputs = data_streamer.initialize_april_tag_detector(outputs=outputs) + outputs = data_streamer.initialize_object_detector(outputs=outputs, object_labels=["milk bottle"]) + data_streamer.initialize_human_motion_detector() + # data_streamer.connect() + while not rospy.is_shutdown(): + data_frame = 
data_streamer.get_latest_data_frame() + if data_frame is not None: + if debug: + rospy.loginfo("Received data frame") + viz_img, outputs = data_streamer.process_frame( + data_frame=data_frame, + outputs=outputs, + detect_qr=True, + detect_objects=False, + detect_human_motion=True, + ) + # data_streamer.publish_human_pose(data_frame=data_frame) + cv2.imshow("Image", viz_img) + cv2.waitKey(1) + else: + print("No data frame received.") + except Exception as e: + print("Ending script.") + rospy.logwarn(f"Exception: {e}") + if data_streamer is not None: + data_streamer.disconnect() + + cv2.destroyAllWindows() + rospy.logwarn("Disconnecting and shutting down the node") + + +if __name__ == "__main__": + main() diff --git a/aria_data_loaders/aria_data_utils/unified_quest_camera_interface.py b/aria_data_loaders/aria_data_utils/unified_quest_camera_interface.py new file mode 100644 index 00000000..a6108b00 --- /dev/null +++ b/aria_data_loaders/aria_data_utils/unified_quest_camera_interface.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional, Tuple +import sophus as sp +import numpy as np + +""" +This file defines the interface to for Quest3 streaming receiver API calls + +The implementation for this Interface is available for Meta employees only at the moment + +TODO: Define the steps to get implementation file +""" + + +class UnifiedQuest3CameraInterface: + def __init__(self): + pass + + def get_rgb(self) -> Optional[np.ndarray]: + raise NotImplementedError + + def get_depth(self) -> Optional[np.ndarray]: + raise NotImplementedError + + def get_rbg_focal_lengths(self) -> Optional[Tuple[float, float]]: + raise NotImplementedError + + def get_rgb_principal_point(self) -> Optional[Tuple[float, float]]: + raise NotImplementedError + + def get_depth_focal_lengths(self) -> Optional[Tuple[float, float]]: + raise NotImplementedError + + def get_depth_principal_point(self) -> Optional[Tuple[float, float]]: + raise NotImplementedError + + def get_deviceWorld_T_rgbCamera(self) -> Optional[sp.SE3]: + raise NotImplementedError + + def get_device_T_rgbCamera(self) -> Optional[sp.SE3]: + raise NotImplementedError + + def get_deviceWorld_T_depthCamera(self) -> Optional[sp.SE3]: + raise NotImplementedError + + def get_device_T_depthCamera(self) -> Optional[sp.SE3]: + raise NotImplementedError + + def get_avg_fps_rgb(self): + raise NotImplementedError + + def get_avg_fps_depth(self): + raise NotImplementedError + + def is_connected(): + raise NotImplementedError diff --git a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/april_tag_detector.py b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/april_tag_detector.py index 74eaecdd..1c427dcc 100644 --- a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/april_tag_detector.py +++ b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/april_tag_detector.py @@ -21,6 +21,8 @@ GenericDetector, ) from perception_and_utils.utils.image_utils import decorate_img_with_text_for_qr +from perception_and_utils.utils.data_frame import DataFrame + DOCK_ID = int(os.environ.get("SPOT_DOCK_ID", 520)) @@ -103,7 +105,7 @@ def _init_april_tag_detector( def process_frame( self, - img_frame: np.ndarray, + frame: DataFrame, ) -> Tuple[np.ndarray, sp.SE3]: """ Process image frame to detect QR code and estimate 
marker pose wrt camera @@ -115,6 +117,8 @@ def process_frame( updated_img_frame (np.ndarray) : Image frame with detections and text for visualization camera_T_marker (sp.SE3) : SE3 matrix representing marker frame as detected in camera frame """ + img_frame = frame._rgb_frame + # Do nothing if detector is not enabled if self.is_enabled is False: self._logger.warning( @@ -131,11 +135,12 @@ def process_frame( def get_outputs( self, - img_frame: np.ndarray, + frame: DataFrame, outputs: Dict, base_T_marker: sp.SE3, timestamp: int, img_metadata: Any, + viz_img: np.ndarray, ) -> Tuple[np.ndarray, Dict]: """ Update the outputs dictionary with the processed image frame and pose data @@ -156,18 +161,20 @@ def get_outputs( - tag_image_metadata_list (List[Any]) : List of image metadata """ + img_frame = frame._rgb_frame + # Decorate image with text for visualization - img = decorate_img_with_text_for_qr( - img=img_frame, + viz_img = decorate_img_with_text_for_qr( + img=viz_img, frame_name_str="device", qr_position=base_T_marker.translation(), ) self._logger.debug(f"Time stamp with AprilTag Detections- {timestamp}") # Append data to lists for return - outputs["tag_image_list"].append(img) + outputs["tag_image_list"].append(viz_img) outputs["tag_base_T_marker_list"].append(base_T_marker) if img_metadata is not None: outputs["tag_image_metadata_list"].append(img_metadata) - return img, outputs + return viz_img, outputs diff --git a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/generic_detector_interface.py b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/generic_detector_interface.py index f17aa8ea..1ee2236e 100644 --- a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/generic_detector_interface.py +++ b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/generic_detector_interface.py @@ -6,7 +6,7 @@ from typing import Any, Dict, Tuple import numpy as np - +from perception_and_utils.utils.data_frame import DataFrame class GenericDetector: def __init__(self): @@ -18,5 +18,5 @@ def enable_detector(self): def disable_detector(self): self.is_enabled = False - def process_frame(self, img_frame: np.ndarray) -> Tuple[np.ndarray, Dict[str, Any]]: + def process_frame(self, frame: DataFrame) -> Tuple[np.ndarray, Dict[str, Any]]: raise NotImplementedError diff --git a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py new file mode 100644 index 00000000..c39d0235 --- /dev/null +++ b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py @@ -0,0 +1,95 @@ +# Copyright (c) Meta Platforms, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
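Note on the two detector-interface hunks above: they switch process_frame from a raw np.ndarray to the shared DataFrame container, so every detector now pulls its image (and, when needed, pose and timestamp) from one object. A minimal sketch of that consuming pattern follows, assuming only the DataFrame and GenericDetector definitions from this series; the BrightnessDetector class and its logic are illustrative and not part of the codebase.

# Sketch only (not part of the patch): a toy detector built on the new DataFrame API.
from typing import Any, Dict, Tuple

import numpy as np

from perception_and_utils.perception.detector_wrappers.generic_detector_interface import (
    GenericDetector,
)
from perception_and_utils.utils.data_frame import DataFrame


class BrightnessDetector(GenericDetector):
    """Illustrative example; only DataFrame and GenericDetector come from this diff."""

    def __init__(self):
        super().__init__()
        self.enable_detector()  # sets self.is_enabled

    def process_frame(self, frame: DataFrame) -> Tuple[np.ndarray, Dict[str, Any]]:
        img = frame._rgb_frame
        if self.is_enabled is False or img is None:
            return img, {}
        # Pose and timing data ride along on the same DataFrame, so no extra
        # arguments are needed for transforms or timestamps.
        return img, {"mean_brightness": float(img.mean()), "t": frame._timestamp_s}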
+ + +import logging +import os +from typing import Any, Dict, Tuple + +import numpy as np +import sophus as sp +from perception_and_utils.perception.detector_wrappers.generic_detector_interface import ( + GenericDetector, +) +from perception_and_utils.utils.image_utils import label_img +from perception_and_utils.utils.data_frame import DataFrame + +class HumanMotionDetector(GenericDetector): + """ + """ + + def __init__(self): + super().__init__() + self._logger = logging.getLogger("HumanMotionDetector") + + def _init_human_motion_detector(self, velocity_threshold=0.75, time_horizon_ns=1000000000): + """ + Initialize the human motion detector + """ + self._logger.info("Initializing Human Motion Detector") + self.enable_detector() + self._velocity_threshold = velocity_threshold + + self._time_horizon_ns = time_horizon_ns # 1 second + self._previous_frames = [] + self._len_prev_frame_cache = 100 # Keep this value larger than fps + + def process_frame( + self, + frame: DataFrame, + ) -> Tuple[np.ndarray, str]: + # Do nothing if detector is not enabled + if self.is_enabled is False: + self._logger.warning( + "Human Motion detector is disabled... Skipping processing of current frame." + ) + return None, "Disabled" + + # Calculate the current position from the transformation matrix + current_position = frame._deviceWorld_T_camera_rgb.matrix()[:3, 3] + + # Add the current frame to the cache + self._previous_frames.append((current_position, frame._timestamp_s)) + + if len(self._previous_frames) < self._len_prev_frame_cache: + return None, "Standing" + + # If the cache is full, remove the oldest frame + if len(self._previous_frames) > self._len_prev_frame_cache: + self._previous_frames.pop(0) + + # Find an index in "_previous_positions" which is 1 sec before current value + index = len(self._previous_frames) - 60 + # If the index is out of range, return "Standing" + if index < 0: + return None, "Standing" + + # Calculate the Euclidean distance between now & then + print(f"Current position: {current_position} | Previous position: {self._previous_frames[index][0]}") + distance = np.linalg.norm(current_position - self._previous_frames[index][0]) + time = frame._timestamp_s - self._previous_frames[index][1] # as we are considering x frames per second so our window is 1 sec + print(f"Distance: {distance} m | Time: {time} sec") + avg_velocity = distance / time + + print(f"Avg velocity: {avg_velocity} m/s") + # Determine the activity based on the velocity threshold + if avg_velocity > self._velocity_threshold: + return None, "Walking" + + return None, "Standing" + + def get_outputs( + self, + img_frame: np.ndarray, + outputs: Dict, + ) -> Tuple[np.ndarray, Dict]: + """ + """ + # Decorate image with text for visualization + label_img( + img=img_frame, + text=f"Activity: {outputs['activity']}", + org=(50, 200), + ) + return img_frame, outputs diff --git a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/object_detector.py b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/object_detector.py index 8d6db717..27540a81 100644 --- a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/object_detector.py +++ b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/object_detector.py @@ -14,6 +14,7 @@ centered_object_detection_heuristic, check_bbox_intersection, ) +from perception_and_utils.utils.data_frame import DataFrame from spot_rl.models.owlvit import OwlVit @@ -198,7 +199,7 @@ def _aria_fetch_demo_heuristics( 
return valid, score, stop - def process_frame(self, img_frame: np.ndarray) -> Tuple[np.ndarray, Dict[str, Any]]: + def process_frame(self, frame: DataFrame) -> Tuple[np.ndarray, Dict[str, Any]]: """ Process image frame to detect object instances and score them based on heuristics @@ -211,6 +212,8 @@ def process_frame(self, img_frame: np.ndarray) -> Tuple[np.ndarray, Dict[str, An updated_img_frame (np.ndarray) : Image frame with detections and text for visualization object_scores (Dict[str, float]) : Dictionary of scores for each object in the image frame """ + img_frame = frame._rgb_frame + # Do nothing if detector is not enabled if self.is_enabled is False: self._logger.warning( @@ -223,7 +226,7 @@ def process_frame(self, img_frame: np.ndarray) -> Tuple[np.ndarray, Dict[str, An ) return updated_img_frame, object_scores - def process_frame_offline(self, img_frame: np.ndarray): + def process_frame_offline(self, frame: DataFrame) -> Tuple[np.ndarray, Dict[str, Any]]: """ Process image frame to detect object instances and score them based on heuristics @@ -287,7 +290,7 @@ def process_frame_offline(self, img_frame: np.ndarray): def get_outputs( self, - img_frame: np.ndarray, + frame: DataFrame, outputs: Dict, object_scores: Dict, img_metadata: Any, @@ -310,6 +313,8 @@ def get_outputs( - object_image_segment_list (List[int]) : List of image segment ids - object_score_list (List[float]) : List of scores for each image frame """ + img_frame = frame._rgb_frame + if object_scores is not {}: for object_name in object_scores.keys(): outputs["object_image_list"][object_name].append(img_frame) diff --git a/perception_and_utils_root/perception_and_utils/utils/conversions.py b/perception_and_utils_root/perception_and_utils/utils/conversions.py index c720b616..530f00ec 100644 --- a/perception_and_utils_root/perception_and_utils/utils/conversions.py +++ b/perception_and_utils_root/perception_and_utils/utils/conversions.py @@ -205,6 +205,11 @@ def sophus_SE3_to_ros_TransformStamped( ros_trf_stamped.transform.translation.z = float(trans[2]) quat = Rotation.from_matrix(sp_se3.rotationMatrix()).as_quat() + quat = quat/np.linalg.norm(quat) + # Ensure quaternion's w is always positive for effective averaging as multiple quaternions can represent the same rotation + if quat[3] > 0: + quat = -1.0 * quat + ros_trf_stamped.transform.rotation.x = float(quat[0]) ros_trf_stamped.transform.rotation.y = float(quat[1]) ros_trf_stamped.transform.rotation.z = float(quat[2]) diff --git a/perception_and_utils_root/perception_and_utils/utils/data_frame.py b/perception_and_utils_root/perception_and_utils/utils/data_frame.py new file mode 100644 index 00000000..6cccf06c --- /dev/null +++ b/perception_and_utils_root/perception_and_utils/utils/data_frame.py @@ -0,0 +1,20 @@ +import numpy as np +import sophus as sp + +class DataFrame: + """ + DataFrame class to store instantaneous data from Human Sensor stream. 
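Note on the conversions.py hunk above: normalizing and sign-canonicalizing quaternions is what makes component-wise averaging meaningful, because q and -q encode the same rotation and a mixed-sign batch can cancel toward zero. The new comment says w should be positive, yet the `if quat[3] > 0` test negates exactly those samples, so the code actually standardizes on a non-positive w; either hemisphere works if applied consistently, but the mismatch is worth double-checking. A small illustrative helper (not in the patch) that enforces the w >= 0 convention:

# Sketch only: normalize, then flip the whole quaternion if w is negative so that
# every sample used for averaging lies in the same hemisphere.
import numpy as np


def canonicalize_quat_xyzw(quat: np.ndarray) -> np.ndarray:
    quat = quat / np.linalg.norm(quat)
    return -quat if quat[3] < 0.0 else quat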
+ It stores the image, timestamp and frame number of each received image + """ + def __init__(self): + self._frame_number = -1 + self._timestamp_s = -1 + self._avg_rgb_fps = None # type: Optional[float] + self._avg_depth_fps = None # type: Optional[float] + self._rgb_frame = None # type: Optional[np.ndarray] + self._depth_frame = None # type: Optional[np.ndarray] + self._aligned_depth_frame = None # type: Optional[np.ndarray] + self._deviceWorld_T_camera_rgb = None # type: Optional[sp.SE3] + self._deviceWorld_T_camera_depth = None # type: Optional[sp.SE3] + self._device_T_camera_rgb = None # type: Optional[sp.SE3] + self._device_T_camera_depth = None # type: Optional[sp.SE3] diff --git a/perception_and_utils_root/perception_and_utils/utils/math_utils.py b/perception_and_utils_root/perception_and_utils/utils/math_utils.py index 61be50c4..1b9f0e66 100644 --- a/perception_and_utils_root/perception_and_utils/utils/math_utils.py +++ b/perception_and_utils_root/perception_and_utils/utils/math_utils.py @@ -15,7 +15,7 @@ from scipy.spatial.transform import Rotation FILTER_DIST = 2.4 # in meters (distance for valid detection) -FIXED_LIST_LENGTH = 10 # Fixed length of the a_T_b_{position,orientation} lists used for average computation +FIXED_LIST_LENGTH = 1 # Fixed length of the a_T_b_{position,orientation} lists used for average computation def compute_avg_sophus_SE3_from_nplist( @@ -40,9 +40,11 @@ def compute_avg_sophus_SE3_from_nplist( a_T_b_position_np = np.array(a_T_b_position_list) avg_a_T_b_position = np.mean(a_T_b_position_np, axis=0) - a_T_b_quaternion_np = np.array(a_T_b_quaternion_list) + a_T_b_quaternion_np = np.array(a_T_b_quaternion_list).reshape(-1, 4) avg_a_T_b_quaternion = np.mean(a_T_b_quaternion_np, axis=0) - + # avg_a_T_b_quaternion = average_quaternion_numpy( + # a_T_b_quaternion_np, W=None + # ) avg_sp_se3 = sp.SE3( Rotation.from_quat(avg_a_T_b_quaternion).as_matrix(), avg_a_T_b_position ) @@ -102,7 +104,7 @@ def get_running_avg_a_T_b( # Ensure quaternion's w is always negative for effective averaging as multiple quaternions can represent the same rotation quat = Rotation.from_matrix(a_T_b.rotationMatrix()).as_quat() - if quat[3] > 0: + if quat[3] < 0.0: quat = -1.0 * quat a_T_b_quaternion_list.append(quat) @@ -119,3 +121,9 @@ def get_running_avg_a_T_b( ) return a_T_b_position_list, a_T_b_quaternion_list, avg_a_T_b + +def average_quaternion_numpy(Q, W=None): + if W is not None: + Q *= W[:, None] + eigvals, eigvecs = np.linalg.eig(Q.T@Q) + return eigvecs[:, eigvals.argmax()] diff --git a/perception_and_utils_root/setup.py b/perception_and_utils_root/setup.py index 25790bad..88700029 100644 --- a/perception_and_utils_root/setup.py +++ b/perception_and_utils_root/setup.py @@ -10,7 +10,7 @@ version="0.0", author="Kavit Shah", author_email="kavits98@gmail.com", - description="SIRo friendly perception and utils for Aria and Spot", + description="SIRo friendly perception and utils for Aria+Quest3 and Spot", packages=setuptools.find_packages(), install_requires=[ "click", diff --git a/spot_rl_experiments/configs/ros_frame_names.yaml b/spot_rl_experiments/configs/ros_frame_names.yaml index 7aebaec2..a6bdbf95 100644 --- a/spot_rl_experiments/configs/ros_frame_names.yaml +++ b/spot_rl_experiments/configs/ros_frame_names.yaml @@ -6,6 +6,7 @@ ARIA: "aria" # aria in cpf frame ARIA_DEVICE: "ariaDevice" # aria in device frame QUEST3_WORLD: "quest3World" QUEST3_CAMERA: "camera" +QUEST3_DEVICE: "device" MARKER: "marker" From e30c0504e9026706b3be844324d414ffb5af630e Mon Sep 17 00:00:00 2001 From: 
Kavit Shah Date: Wed, 28 Aug 2024 19:44:17 -0700 Subject: [PATCH 5/7] Human motion recognition with fps display --- .../human_sensor_data_streamer_interface.py | 38 ++++++----- .../aria_data_utils/quest3_data_streamer.py | 30 +++++++-- .../human_motion_detector.py | 66 +++++++++++++------ .../perception_and_utils/utils/data_frame.py | 1 + .../perception_and_utils/utils/image_utils.py | 30 +++++++++ 5 files changed, 122 insertions(+), 43 deletions(-) diff --git a/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py b/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py index d6642499..06c565d9 100644 --- a/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py +++ b/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py @@ -26,6 +26,7 @@ # ROS imports from geometry_msgs.msg import Pose, PoseStamped from nav_msgs.msg import Odometry +from std_msgs.msg import String from tf2_ros import StaticTransformBroadcaster from visualization_msgs.msg import Marker @@ -55,21 +56,8 @@ def __init__(self, verbose: bool = False) -> None: # ROS publishers & broadcaster self.static_tf_broadcaster = StaticTransformBroadcaster() - # self.pose_of_interest_publisher = rospy.Publisher( - # "/pose_of_interest", PoseStamped, queue_size=10 - # ) # pose of device when object was detected from rgb image - # self.pose_of_interest_marker_publisher = rospy.Publisher( - # "/pose_of_interest_marker", Marker, queue_size=10 - # ) - # self.odom_publisher = rospy.Publisher( - # "/odom", Odometry, queue_size=10 - # ) - # self.human_pose_publisher = rospy.Publisher( - # "/current_pose", PoseStamped, queue_size=10 - # ) - # self.human_handoff_pose_publisher = rospy.Publisher( - # "/human_handoff_pose", PoseStamped, queue_size=10 - # ) + self.human_activity_history_pub = rospy.Publisher( + "/human_activity_history", String, queue_size=10) # TODO: Define DEVICE FRAME as a visual frame of reference i.e. 
front facing frame of reference self.device_T_camera: Optional[sp.SE3] = None # TODO: Should be initialized by implementations of this class @@ -223,3 +211,23 @@ def publish_marker(self, data_frame: DataFrame, marker_scale: float=0.1): marker.color.b = 0.2 self.pose_of_interest_marker_publisher.publish(marker) + + def publish_human_activity_history(self, human_history: List[Tuple[float,str]]): + """ + Publishes human activity history as a string + + Args: + data_frame (DataFrame): DataFrame object containing data packet with rgb, depth + and all necessary transformations + + Publishes: + - /human_activity_history: ROS String message for human activity history + """ + + # Iterate over human history and create a string + human_history_str = "" + for timestamp, activity in human_history: + human_history_str += f"{timestamp},{activity}|" # str = "time1,Standing|time2,Walking" + + # Publish human activity history string + self.human_activity_history_pub.publish(human_history_str) diff --git a/aria_data_loaders/aria_data_utils/quest3_data_streamer.py b/aria_data_loaders/aria_data_utils/quest3_data_streamer.py index d7d8c122..da03692c 100644 --- a/aria_data_loaders/aria_data_utils/quest3_data_streamer.py +++ b/aria_data_loaders/aria_data_utils/quest3_data_streamer.py @@ -1,7 +1,12 @@ from aria_data_utils.human_sensor_data_streamer_interface import CameraParams, DataFrame, HumanSensorDataStreamerInterface from aria_data_utils.aria_sdk_utils import update_iptables_quest3 from perception_and_utils.utils.math_utils import get_running_avg_a_T_b -from quest3_streamer.unified_quest_camera import UnifiedQuestCamera # From internal repo +from perception_and_utils.utils.frame_rate_counter import FrameRateCounter # Local Frame rate counter +try: + from quest3_streamer.unified_quest_camera import UnifiedQuestCamera # From internal repo +except ImportError: + print("Could not import Quest3 camera wrapper") + import rospy import numpy as np @@ -30,6 +35,8 @@ def __init__(self, verbose: bool = False) -> None: handler.setFormatter(formatter) self.logger.addHandler(handler) + self.frame_rate_counter = FrameRateCounter() + self.unified_quest3_camera = UnifiedQuestCamera() self.rgb_cam_params : CameraParams = None self.depth_cam_params : CameraParams = None @@ -115,11 +122,16 @@ def get_latest_data_frame(self) -> Optional[DataFrame]: data_frame._frame_number = self._frame_number data_frame._timestamp_s = time.time() - data_frame._avg_rgb_fps = self.unified_quest3_camera.get_avg_fps_rgb() - data_frame._avg_depth_fps = self.unified_quest3_camera.get_avg_fps_depth() # Align depth frame with rgb frame data_frame._aligned_depth_frame = None # TODO: Add this logic + + # Update frame rate counter + self.frame_rate_counter.update() + data_frame._avg_rgb_fps = self.unified_quest3_camera.get_avg_fps_rgb() + data_frame._avg_depth_fps = self.unified_quest3_camera.get_avg_fps_depth() + data_frame._avg_data_frame_fps = self.frame_rate_counter.avg_value() + return data_frame def process_frame( @@ -189,10 +201,14 @@ def process_frame( ) if detect_human_motion: - _, activity_str = self.human_motion_detector.process_frame(frame=data_frame) - viz_img, _ = self.human_motion_detector.get_outputs(viz_img, {"activity": activity_str}) - print(f"Activity: {activity_str}") - rospy.set_param("human_activity", activity_str) + activity_str, avg_velocity = self.human_motion_detector.process_frame(frame=data_frame) + ops = { + "activity": activity_str, + "velocity": avg_velocity, + "data_frame_fps": data_frame._avg_data_frame_fps, + } + viz_img, 
_ = self.human_motion_detector.get_outputs(viz_img, ops) + self.publish_human_activity_history(self.human_motion_detector.get_human_motion_history()) # if detect_objects: # viz_img, object_scores = self.object_detector.process_frame(viz_img) diff --git a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py index c39d0235..3a83a9a9 100644 --- a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py +++ b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py @@ -12,7 +12,7 @@ from perception_and_utils.perception.detector_wrappers.generic_detector_interface import ( GenericDetector, ) -from perception_and_utils.utils.image_utils import label_img +from perception_and_utils.utils.image_utils import label_img, decorate_img_with_fps from perception_and_utils.utils.data_frame import DataFrame class HumanMotionDetector(GenericDetector): @@ -23,28 +23,39 @@ def __init__(self): super().__init__() self._logger = logging.getLogger("HumanMotionDetector") - def _init_human_motion_detector(self, velocity_threshold=0.75, time_horizon_ns=1000000000): + def _init_human_motion_detector( + self, + # velocity_threshold=1.2, + velocity_threshold=0.4, # hacking this + time_horizon_ns=1000000000): """ Initialize the human motion detector + + Args: + velocity_threshold (float, optional): Velocity threshold for determining walking vs standing. Defaults to 1.2 metres/sec + time_horizon_ns (int, optional): Time horizon for calculating velocity. Defaults to 1 second (in nanosec) """ self._logger.info("Initializing Human Motion Detector") self.enable_detector() self._velocity_threshold = velocity_threshold self._time_horizon_ns = time_horizon_ns # 1 second - self._previous_frames = [] + self._previous_frames = [] # type: List[Tuple[np.ndarray, float]] self._len_prev_frame_cache = 100 # Keep this value larger than fps + + self._human_motion_history = [] # type: List[Tuple[float, str]] + def process_frame( self, - frame: DataFrame, - ) -> Tuple[np.ndarray, str]: + frame: DataFrame, # new data frame + ) -> Tuple[str, float]: # Do nothing if detector is not enabled if self.is_enabled is False: self._logger.warning( "Human Motion detector is disabled... Skipping processing of current frame." 
) - return None, "Disabled" + return "Disabled", None # Calculate the current position from the transformation matrix current_position = frame._deviceWorld_T_camera_rgb.matrix()[:3, 3] @@ -52,32 +63,30 @@ def process_frame( # Add the current frame to the cache self._previous_frames.append((current_position, frame._timestamp_s)) - if len(self._previous_frames) < self._len_prev_frame_cache: - return None, "Standing" + # If the cache is just 1 element or less, return "Standing" + lookback_index=10 + if len(self._previous_frames) < lookback_index+1: + return "Standing", None # If the cache is full, remove the oldest frame if len(self._previous_frames) > self._len_prev_frame_cache: self._previous_frames.pop(0) - # Find an index in "_previous_positions" which is 1 sec before current value - index = len(self._previous_frames) - 60 - # If the index is out of range, return "Standing" - if index < 0: - return None, "Standing" - # Calculate the Euclidean distance between now & then - print(f"Current position: {current_position} | Previous position: {self._previous_frames[index][0]}") - distance = np.linalg.norm(current_position - self._previous_frames[index][0]) - time = frame._timestamp_s - self._previous_frames[index][1] # as we are considering x frames per second so our window is 1 sec + print(f"Current position: {current_position} | Previous position: {self._previous_frames[lookback_index][0]}") + distance = np.linalg.norm(current_position - self._previous_frames[lookback_index][0]) + time = frame._timestamp_s - self._previous_frames[lookback_index][1] # as we are considering x frames per second so our window is 1 sec print(f"Distance: {distance} m | Time: {time} sec") avg_velocity = distance / time print(f"Avg velocity: {avg_velocity} m/s") + state = "Standing" # Determine the activity based on the velocity threshold if avg_velocity > self._velocity_threshold: - return None, "Walking" + state = "Walking" - return None, "Standing" + self.update_human_motion_history(state, frame._timestamp_s) + return state, avg_velocity def get_outputs( self, @@ -86,10 +95,25 @@ def get_outputs( ) -> Tuple[np.ndarray, Dict]: """ """ + viz_img = img_frame.copy() # Decorate image with text for visualization label_img( - img=img_frame, + img=viz_img, text=f"Activity: {outputs['activity']}", org=(50, 200), ) - return img_frame, outputs + label_img( + img=viz_img, + text=f"Velocity: {outputs['velocity']}", + org=(50, 250), + ) + decorate_img_with_fps(viz_img, outputs["data_frame_fps"],pos=(50, 400)) + return viz_img, outputs + + def update_human_motion_history(self, state, timestamp): + # If current state of human is different from last state in history list, only then update history + if len(self._human_motion_history) == 0 or self._human_motion_history[-1][1] != state: + self._human_motion_history.append((timestamp, state)) + + def get_human_motion_history(self): + return self._human_motion_history diff --git a/perception_and_utils_root/perception_and_utils/utils/data_frame.py b/perception_and_utils_root/perception_and_utils/utils/data_frame.py index 6cccf06c..116bd16a 100644 --- a/perception_and_utils_root/perception_and_utils/utils/data_frame.py +++ b/perception_and_utils_root/perception_and_utils/utils/data_frame.py @@ -11,6 +11,7 @@ def __init__(self): self._timestamp_s = -1 self._avg_rgb_fps = None # type: Optional[float] self._avg_depth_fps = None # type: Optional[float] + self._avg_data_frame_fps = None # type: Optional[float] self._rgb_frame = None # type: Optional[np.ndarray] self._depth_frame = None # 
type: Optional[np.ndarray] self._aligned_depth_frame = None # type: Optional[np.ndarray] diff --git a/perception_and_utils_root/perception_and_utils/utils/image_utils.py b/perception_and_utils_root/perception_and_utils/utils/image_utils.py index cfd39819..a1037744 100644 --- a/perception_and_utils_root/perception_and_utils/utils/image_utils.py +++ b/perception_and_utils_root/perception_and_utils/utils/image_utils.py @@ -127,6 +127,36 @@ def decorate_img_with_text_for_qr( return img +def decorate_img_with_fps(frame: np.ndarray, fps_value: float, pos=(50, 50)): + text = f"{fps_value:.2f} FPS" + + font = cv2.FONT_HERSHEY_PLAIN + font_scale = 2 + text_color = (0, 0, 225) + text_color_bg = (0, 0, 0) + font_thickness = 2 + (text_w, text_h), baseline = cv2.getTextSize( + text, font, font_scale, font_thickness + ) + cv2.rectangle( + frame, + pos, + (pos[0] + text_w, pos[1] - text_h - font_scale - 1), + text_color_bg, + -1, + cv2.LINE_8, + ) + cv2.putText( + frame, + text, + pos, + font, + font_scale, + text_color, + font_thickness, + cv2.LINE_8, + ) + def label_img( img: np.ndarray, text: str, From 9bb8b3bdbe2b48e26c1c387ce2db2201fb922811 Mon Sep 17 00:00:00 2001 From: Kavit Shah Date: Thu, 29 Aug 2024 00:45:29 -0700 Subject: [PATCH 6/7] Uncomment object detector and remove episodic memory script for quest3 --- .../episodic_memory_robotic_fetch_quest3_c.py | 270 ------------------ .../aria_data_utils/quest3_data_streamer.py | 114 +++++--- 2 files changed, 80 insertions(+), 304 deletions(-) delete mode 100644 aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py diff --git a/aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py b/aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py deleted file mode 100644 index 4bb37c1e..00000000 --- a/aria_data_loaders/aria_data_utils/episodic_memory_robotic_fetch_quest3_c.py +++ /dev/null @@ -1,270 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
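For reference, the image_utils.py hunk above adds decorate_img_with_fps, which (like label_img) draws on the BGR frame in place. A short usage sketch, assuming a synthetic frame in place of a real camera image:

# Sketch only: a black 640x480 frame stands in for a real camera image.
import cv2
import numpy as np

from perception_and_utils.utils.image_utils import decorate_img_with_fps, label_img

viz_img = np.zeros((480, 640, 3), dtype=np.uint8)
label_img(img=viz_img, text="Activity: Walking", org=(50, 200))
decorate_img_with_fps(viz_img, 27.3, pos=(50, 400))  # renders "27.30 FPS" in place
cv2.imshow("Image", viz_img)
cv2.waitKey(0)
cv2.destroyAllWindows()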
- - -import logging -from typing import Any, Dict, List, Optional, Tuple - -import click -import numpy as np -import rospy -import sophus as sp -from geometry_msgs.msg import PoseStamped -from perception_and_utils.utils.conversions import ( - ros_PoseStamped_to_sophus_SE3, - ros_TransformStamped_to_sophus_SE3, - sophus_SE3_to_ros_PoseStamped, - sophus_SE3_to_ros_TransformStamped, - xyt_to_sophus_SE3, -) -from spot_rl.envs.skill_manager import SpotSkillManager -from spot_rl.utils.utils import ros_frames as rf -from spot_wrapper.spot import Spot, SpotCamIds -from spot_wrapper.spot_qr_detector import SpotQRDetector -from std_msgs.msg import Bool - -# from spot_rl.envs.skill_manager import SpotSkillManager -# from spot_rl.utils.utils import ros_frames as rf -# from spot_wrapper.spot import Spot, SpotCamIds -# from spot_wrapper.spot_qr_detector import SpotQRDetector -# from std_msgs.msg import Bool -from tf2_ros import ( - ConnectivityException, - ExtrapolationException, - LookupException, - StaticTransformBroadcaster, -) -from tf2_ros.buffer import Buffer -from tf2_ros.transform_listener import TransformListener - -# For information on useful reference frame, please check https://github.com/facebookresearch/spot-sim2real?tab=readme-ov-file#eyeglasses-run-spot-aria-project-code - -# Waypoints on left of the dock to observe the QR code clearly -SPOT_DOCK_OBSERVER_WAYPOINT_LEFT = [ - 0.3840912007597151, - -0.5816728569741766, - 149.58030524832756, -] -# Waypoints on right of the dock to observe the QR code clearly -SPOT_DOCK_OBSERVER_WAYPOINT_RIGHT = [ - 0.5419185005639034, - 0.5319243891865243, - -154.1506754378722, -] - - -class EpisodicMemoryRoboticFetchQuest3: - """ - This class handles the core logic of the Episodic Memory Robotic Fetch - It begines with Spot looking at the marker and detecting it which results - in a static tf of marker w.r.t spotWorld frame being published. - - Then it waits for Aria to detect marker and publish ariaWorld w.r.t marker - as a static tf. This is done by AriaLiveReader node. This step ensures both - Spot & Aria worlds are now mutually transformable which is necessary for the - rest of the logic. - - It constantly listens to Aria's current pose w.r.t ariaWorld frame. - It also waits for Aria to detect the object and publish that pose of interest - (i.e. aria's cpf frame when it saw the object) w.r.t ariaWorld frame. - - Once both the poses are received, it waits for the spot_fetch trigger to be True - which will be set by the user via `rostopic pub` on cli. - - Once the trigger is True, it will run the following steps: - 1. Nav to Pose of interest (Aria's cpf frame when it saw the object) - 2. Pick the object (object is hardcoded to be bottle) - 3. Nav to Pose of interest (Aria wearer's last location before triggering spot_fetch) - 4. Place the object (object is hardcoded to be bottle) - - Args: - spot: Spot object - verbose: bool indicating whether to print debug logs - - Warning: Do not create objects of this class directly. It is a standalone node - """ - - def __init__(self): - # # @FIXME: This is initializing a ros-node silently (at core of inheritence in - # # SpotRobotSubscriberMixin). 
Make it explicit - # self.skill_manager = SpotSkillManager( - # spot=spot, - # verbose=verbose, - # use_policies=use_policies, - # ) - - # Navigate Spot to SPOT_DOCK_OBSERVER_WAYPOINT_LEFT so that it can see the marker - # status, msg = self.skill_manager.nav( - # SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[0], - # SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[1], - # np.deg2rad(SPOT_DOCK_OBSERVER_WAYPOINT_LEFT[2]), - # ) - # if not status: - # rospy.logerr( - # f"Failed to navigate to spot dock observer waypoint. Error: {msg}. Exiting..." - # ) - # return - - # # Sit Spot down - # self.skill_manager.sit() - - # # Instantiate static transform broadcaster for publishing marker w.r.t spotWorld transforms - # self.static_tf_broadcaster = StaticTransformBroadcaster() - - # self._spot_fetch_trigger_sub = rospy.Subscriber( - # "/spot_fetch_trigger", Bool, self.spot_fetch_trigger_callback - # ) - # self._spot_fetch_flag = False - - # Detect the marker and get the average pose of marker w.r.t spotWorld frame - # cam_id = SpotCamIds.HAND_COLOR - # spot_qr = SpotQRDetector(spot=spot, cam_ids=[cam_id]) - # avg_spotWorld_T_marker = spot_qr.get_avg_spotWorld_T_marker(cam_id=cam_id) - - # # Publish marker w.r.t spotWorld transforms for 5 seconds so it can be seen in rviz - # start_time = rospy.Time.now() - # while rospy.Time.now() - start_time < rospy.Duration(2.0): - # self.static_tf_broadcaster.sendTransform( - # sophus_SE3_to_ros_TransformStamped( - # sp_se3=avg_spotWorld_T_marker, - # parent_frame=rf.SPOT_WORLD, - # child_frame=rf.MARKER, - # ) - # ) - # rospy.sleep(0.5) - - # Instantiate publisher for publishing go-to pose for handoff - self._nav_PoseStamped_pub_for_place = rospy.Publisher( - "/nav_pose_for_place_viz", PoseStamped, queue_size=10 - ) - self.nav_xyt_for_handoff = None - - # Initialize static transform subscriber for listening to ariaWorld w.r.t marker transform - self._tf_buffer = Buffer() - self._tf_listener = TransformListener(self._tf_buffer) - - # Wait for transform from camera to spotWorld to be available - while not rospy.is_shutdown(): - while not rospy.is_shutdown() and not self._tf_buffer.can_transform( - target_frame=rf.SPOT_WORLD, source_frame="camera", time=rospy.Time() - ): - rospy.logwarn_throttle( - 5.0, "Waiting for transform from camera to spotWorld" - ) - rospy.sleep(0.5) - try: - transform_stamped_spotWorld_T_camera = self._tf_buffer.lookup_transform( - target_frame=rf.SPOT_WORLD, - source_frame="camera", - time=rospy.Time(0), - ) - except (LookupException, ConnectivityException, ExtrapolationException): - raise RuntimeError( - "Unable to lookup transform from ariaWorld to spotWorld" - ) - spotWorld_T_camera = ros_TransformStamped_to_sophus_SE3( - ros_trf_stamped=transform_stamped_spotWorld_T_camera - ) - - self.nav_xyt_for_handoff = self.get_nav_xyt_to_wearer(spotWorld_T_camera) - # Publish place pose in spotWorld frame so it can be seen in rviz until spot_fetch is triggered - self._nav_PoseStamped_pub_for_place.publish( - sophus_SE3_to_ros_PoseStamped( - sp_se3=xyt_to_sophus_SE3( - np.asarray(self.nav_xyt_for_handoff, dtype=np.float32) - ), - parent_frame=rf.SPOT_WORLD, - ) - ) - - # rospy.logwarn_throttle( - # 5.0, "Waiting for demo trigger on /spot_fetch_trigger topic" - # ) - rospy.sleep(1.0) - - # Set run status to true in the beginning - # run_status = True - - # if run_status: - # # Nav to x,y,theta of interest (Aria's cpf frame when it saw the object) - # nav_xyt_for_handoff = self.nav_xyt_for_handoff - # rospy.loginfo(f"Nav 2D location for pick: {nav_xyt_for_handoff}") - - # 
self.skill_manager.nav( - # nav_xyt_for_handoff[0], nav_xyt_for_handoff[1], nav_xyt_for_handoff[2] - # ) - - # self.skill_manager.sit() - - def spot_fetch_trigger_callback(self, msg): - """ - Save _spot_fetch_flag as True when spot fetch is to be triggered - - Args: - msg (Bool): Message received on /spot_fetch_trigger topic - - Updates: - self._spot_fetch_flag (bool): True when spot fetch is to be triggered - """ - self._spot_fetch_flag = True - - def get_nav_xyt_to_wearer( - self, aria_pose: sp.SE3, shift_offset: float = 0.6 - ) -> Tuple[float, float, float]: - """ - Converts Sophus SE3 aria current pose to Tuple of x, y, theta, then flips it by 180 degrees and shifts it by a given offset. - At the end, the final x, y, theta is such that the robot will face the wearer. - - Args: - aria_pose (sp.SE3): Sophus SE3 object for aria's current pose (as cpf frame) w.r.t spotWorld frame - - Returns: - Tuple[float, float, float]: Tuple of x,y,theta as floats representing nav target for robot - """ - # ARROW STARTS FROM ORIGIN - - # get position and rotation as x, y, theta - position = aria_pose.translation() - # Find the angle made by CPF's z axis with spotWorld's x axis - # as robot should orient to the CPF's z axis. First 3 elements of - # column 3 from spotWorld_T_cpf represents cpf's z axis in spotWorld frame - cpf_z_axis_in_spotWorld = aria_pose.matrix()[:3, 2] - x_component_of_z_axis = cpf_z_axis_in_spotWorld[0] - y_component_of_z_axis = cpf_z_axis_in_spotWorld[1] - rotation = float( - np.arctan2( - y_component_of_z_axis, - x_component_of_z_axis, - ) - ) # tan^-1(y/x) - x, y, theta = position[0], position[1], rotation - - # push fwd this point along theta - x += shift_offset * np.cos(theta) - y += shift_offset * np.sin(theta) - # rotate theta by pi - theta += np.pi - - return (x, y, theta) - - -@click.command() -@click.option("--verbose", type=bool, default=True) -@click.option("--use-policies", type=bool, default=False) -def main(verbose: bool, use_policies: bool): - rospy.init_node("episode") - rospy.logwarn("Starting up ROS node") - - # spot = Spot("EpisodicMemoryRoboticFetchQuest3Node") - # with spot.get_lease(hijack=True) as lease: - # if lease is None: - # raise RuntimeError("Could not get lease") - # else: - # rospy.loginfo("Acquired lease") - - _ = EpisodicMemoryRoboticFetchQuest3() - - -if __name__ == "__main__": - main() diff --git a/aria_data_loaders/aria_data_utils/quest3_data_streamer.py b/aria_data_loaders/aria_data_utils/quest3_data_streamer.py index da03692c..37b90722 100644 --- a/aria_data_loaders/aria_data_utils/quest3_data_streamer.py +++ b/aria_data_loaders/aria_data_utils/quest3_data_streamer.py @@ -1,21 +1,32 @@ -from aria_data_utils.human_sensor_data_streamer_interface import CameraParams, DataFrame, HumanSensorDataStreamerInterface from aria_data_utils.aria_sdk_utils import update_iptables_quest3 +from aria_data_utils.human_sensor_data_streamer_interface import ( + CameraParams, + DataFrame, + HumanSensorDataStreamerInterface, +) +from perception_and_utils.utils.frame_rate_counter import ( # Local Frame rate counter + FrameRateCounter, +) from perception_and_utils.utils.math_utils import get_running_avg_a_T_b -from perception_and_utils.utils.frame_rate_counter import FrameRateCounter # Local Frame rate counter + try: - from quest3_streamer.unified_quest_camera import UnifiedQuestCamera # From internal repo + from quest3_streamer.unified_quest_camera import ( # From internal repo + UnifiedQuestCamera, + ) except ImportError: print("Could not import Quest3 camera 
wrapper") -import rospy -import numpy as np -import sophus as sp -import cv2 -from typing import Any, Dict, Optional, Tuple, List -import click import logging import time +from typing import Any, Dict, List, Optional, Tuple + +import click +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import rospy +import sophus as sp from perception_and_utils.utils.conversions import ( sophus_SE3_to_ros_PoseStamped, sophus_SE3_to_ros_TransformStamped, @@ -23,6 +34,7 @@ ) from spot_rl.utils.utils import ros_frames as rf + class Quest3DataStreamer(HumanSensorDataStreamerInterface): def __init__(self, verbose: bool = False) -> None: super().__init__(verbose) @@ -38,8 +50,8 @@ def __init__(self, verbose: bool = False) -> None: self.frame_rate_counter = FrameRateCounter() self.unified_quest3_camera = UnifiedQuestCamera() - self.rgb_cam_params : CameraParams = None - self.depth_cam_params : CameraParams = None + self.rgb_cam_params: CameraParams = None + self.depth_cam_params: CameraParams = None self._frame_number = -1 self._setup_device() @@ -58,7 +70,9 @@ def disconnect(self): self.logger("Quest3DataStreamer :: Disconnected") def is_connected(self) -> bool: - return self._is_connected # TODO: Return device connection status from Bruno's APIs + return ( + self._is_connected + ) # TODO: Return device connection status from Bruno's APIs def _setup_device(self) -> Any: # Setup RGB camera params @@ -67,13 +81,23 @@ def _setup_device(self) -> Any: rgb_pp = self.unified_quest3_camera.get_rgb_principal_point() if rgb_fl is not None and rgb_pp is not None: - rospy.logwarn("RGB Camera Params: focal lengths = {}, principal point = {}".format(rgb_fl, rgb_pp)) - self.rgb_cam_params = CameraParams(focal_lengths=rgb_fl, principal_point=rgb_pp) + rospy.logwarn( + "RGB Camera Params: focal lengths = {}, principal point = {}".format( + rgb_fl, rgb_pp + ) + ) + self.rgb_cam_params = CameraParams( + focal_lengths=rgb_fl, principal_point=rgb_pp + ) else: self.logger.info("Waiting for RGB camera params to be set...") rospy.logwarn("Waiting for RGB camera params to be set...") time.sleep(0.1) - rospy.loginfo("RGB Camera Params: focal lengths = {}, principal point = {}".format(self.rgb_cam_params._focal_lengths, self.rgb_cam_params._principal_point)) + rospy.loginfo( + "RGB Camera Params: focal lengths = {}, principal point = {}".format( + self.rgb_cam_params._focal_lengths, self.rgb_cam_params._principal_point + ) + ) # Setup Depth camera params while self.depth_cam_params is None: @@ -81,13 +105,24 @@ def _setup_device(self) -> Any: depth_pp = self.unified_quest3_camera.get_depth_principal_point() if depth_fl is not None and depth_pp is not None: - rospy.logwarn("Depth Camera Params: focal lengths = {}, principal point = {}".format(depth_fl, depth_pp)) - self.depth_cam_params = CameraParams(focal_lengths=depth_fl, principal_point=depth_pp) + rospy.logwarn( + "Depth Camera Params: focal lengths = {}, principal point = {}".format( + depth_fl, depth_pp + ) + ) + self.depth_cam_params = CameraParams( + focal_lengths=depth_fl, principal_point=depth_pp + ) else: self.logger.info("Waiting for Depth camera params to be set...") rospy.logwarn("Waiting for Depth camera params to be set...") time.sleep(0.1) - print("Depth Camera Params: focal lengths = {}, principal point = {}".format(self.depth_cam_params._focal_lengths, self.depth_cam_params._principal_point)) + print( + "Depth Camera Params: focal lengths = {}, principal point = {}".format( + self.depth_cam_params._focal_lengths, + 
self.depth_cam_params._principal_point, + ) + ) def get_latest_data_frame(self) -> Optional[DataFrame]: # Create DataFrame object @@ -98,7 +133,9 @@ def get_latest_data_frame(self) -> Optional[DataFrame]: deviceWorld_T_rgbCam = self.unified_quest3_camera.get_deviceWorld_T_rgbCamera() device_T_rgbCam = self.unified_quest3_camera.get_device_T_rgbCamera() if rgb_frame is None or deviceWorld_T_rgbCam is None or device_T_rgbCam is None: - rospy.logwarn("Returning None, as rgb_frame or deviceWorld_T_rgbCam or device_T_rgbCam is None.") + rospy.logwarn( + "Returning None, as rgb_frame or deviceWorld_T_rgbCam or device_T_rgbCam is None." + ) return None else: data_frame._rgb_frame = rgb_frame @@ -107,10 +144,14 @@ def get_latest_data_frame(self) -> Optional[DataFrame]: # Populate depth frame data iff all the required data is available; else return None depth_frame = self.unified_quest3_camera.get_depth() - deviceWorld_T_depthCam = self.unified_quest3_camera.get_deviceWorld_T_depthCamera() + deviceWorld_T_depthCam = ( + self.unified_quest3_camera.get_deviceWorld_T_depthCamera() + ) device_T_depthCam = self.unified_quest3_camera.get_device_T_depthCamera() if depth_frame is None or deviceWorld_T_depthCam is None: - rospy.logwarn("Returning None as depth_frame or deviceWorld_T_depthCam or device_T_depthCam is None.") + rospy.logwarn( + "Returning None as depth_frame or deviceWorld_T_depthCam or device_T_depthCam is None." + ) return None else: data_frame._depth_frame = depth_frame @@ -122,9 +163,8 @@ def get_latest_data_frame(self) -> Optional[DataFrame]: data_frame._frame_number = self._frame_number data_frame._timestamp_s = time.time() - # Align depth frame with rgb frame - data_frame._aligned_depth_frame = None # TODO: Add this logic + data_frame._aligned_depth_frame = None # TODO: Add this logic # Update frame rate counter self.frame_rate_counter.update() @@ -201,21 +241,25 @@ def process_frame( ) if detect_human_motion: - activity_str, avg_velocity = self.human_motion_detector.process_frame(frame=data_frame) + activity_str, avg_velocity = self.human_motion_detector.process_frame( + frame=data_frame + ) ops = { "activity": activity_str, "velocity": avg_velocity, "data_frame_fps": data_frame._avg_data_frame_fps, } viz_img, _ = self.human_motion_detector.get_outputs(viz_img, ops) - self.publish_human_activity_history(self.human_motion_detector.get_human_motion_history()) + self.publish_human_activity_history( + self.human_motion_detector.get_human_motion_history() + ) - # if detect_objects: - # viz_img, object_scores = self.object_detector.process_frame(viz_img) - # if object_label in object_scores.keys(): - # if self.verbose: - # plt.imsave(f"frame_{self.frame_number}.jpg", viz_img) - # # self.publish_pose_of_interest(frame.get("ros_pose")) + if detect_objects: + viz_img, object_scores = self.object_detector.process_frame(viz_img) + if object_label in object_scores.keys(): + if self.verbose: + plt.imsave(f"frame_{self.frame_number}.jpg", viz_img) + # self.publish_pose_of_interest(frame.get("ros_pose")) return viz_img, outputs @@ -292,7 +336,7 @@ def initialize_human_motion_detector(self): @click.option("--do-update-iptables", is_flag=True, type=bool, default=False) @click.option("--debug", is_flag=True, default=False) @click.option("--hz", type=int, default=100) -def main(do_update_iptables:bool, debug:bool, hz:int): +def main(do_update_iptables: bool, debug: bool, hz: int): if debug: _log_level = rospy.DEBUG else: @@ -306,7 +350,7 @@ def main(do_update_iptables:bool, debug:bool, hz:int): else: 
rospy.logwarn("Not updating iptables") - rate = rospy.Rate(hz) + # rate = rospy.Rate(hz) outputs: Dict[str, Any] = {} data_streamer = None cv2.namedWindow("Image", cv2.WINDOW_NORMAL) @@ -314,7 +358,9 @@ def main(do_update_iptables:bool, debug:bool, hz:int): data_streamer = Quest3DataStreamer() outputs = data_streamer.initialize_april_tag_detector(outputs=outputs) - outputs = data_streamer.initialize_object_detector(outputs=outputs, object_labels=["milk bottle"]) + outputs = data_streamer.initialize_object_detector( + outputs=outputs, object_labels=["milk bottle"] + ) data_streamer.initialize_human_motion_detector() # data_streamer.connect() while not rospy.is_shutdown(): From e4688860fc12f6c022b535b33f0646d05daa485f Mon Sep 17 00:00:00 2001 From: Kavit Shah Date: Wed, 23 Oct 2024 18:00:11 -0700 Subject: [PATCH 7/7] Add Profiling code --- .../human_sensor_data_streamer_interface.py | 168 ++++++++++-------- .../aria_data_utils/quest3_data_streamer.py | 80 +++++++-- .../unified_quest_camera_interface.py | 11 +- .../human_motion_detector.py | 58 +++--- .../detector_wrappers/object_detector.py | 8 +- .../perception_and_utils/utils/data_frame.py | 31 ++-- 6 files changed, 229 insertions(+), 127 deletions(-) diff --git a/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py b/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py index 06c565d9..63e30577 100644 --- a/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py +++ b/aria_data_loaders/aria_data_utils/human_sensor_data_streamer_interface.py @@ -10,42 +10,50 @@ import cv2 import numpy as np import rospy -import sophus as sp + +try: + import sophuspy as sp +except Exception as e: + print(f"Cannot import sophuspy due to {e}. Import sophus instead") + import sophus as sp + +# ROS imports +from geometry_msgs.msg import Pose, PoseStamped +from nav_msgs.msg import Odometry from perception_and_utils.perception.detector_wrappers.april_tag_detector import ( AprilTagDetectorWrapper, ) -from perception_and_utils.perception.detector_wrappers.object_detector import ( - ObjectDetectorWrapper, -) from perception_and_utils.perception.detector_wrappers.human_motion_detector import ( HumanMotionDetector, ) - +from perception_and_utils.perception.detector_wrappers.object_detector import ( + ObjectDetectorWrapper, +) +from perception_and_utils.utils.data_frame import DataFrame from spot_rl.utils.utils import ros_frames as rf - -# ROS imports -from geometry_msgs.msg import Pose, PoseStamped -from nav_msgs.msg import Odometry from std_msgs.msg import String from tf2_ros import StaticTransformBroadcaster from visualization_msgs.msg import Marker -from perception_and_utils.utils.data_frame import DataFrame class CameraParams: """ CameraParams class to store camera parameters like focal length, principal point etc. 
It stores the image, timestamp and frame number of each received image """ - def __init__(self, focal_lengths: Tuple[float, float], principal_point: Tuple[float, float]): + + def __init__( + self, focal_lengths: Tuple[float, float], principal_point: Tuple[float, float] + ): self._focal_lengths = focal_lengths # type: Optional[Tuple[float, float]] self._principal_point = principal_point # type: Optional[Tuple[float, float]] + class HumanSensorDataStreamerInterface: def __init__(self, verbose: bool = False) -> None: self.verbose = verbose - self._is_connected = False # TODO : Maybe not needed + self._is_connected = False # TODO : Maybe not needed # TODO: Update this later with PerceptionPipeline # Create all detectors @@ -57,19 +65,22 @@ def __init__(self, verbose: bool = False) -> None: self.static_tf_broadcaster = StaticTransformBroadcaster() self.human_activity_history_pub = rospy.Publisher( - "/human_activity_history", String, queue_size=10) + "/human_activity_history", String, queue_size=10 + ) # TODO: Define DEVICE FRAME as a visual frame of reference i.e. front facing frame of reference - self.device_T_camera: Optional[sp.SE3] = None # TODO: Should be initialized by implementations of this class + self.device_T_camera: Optional[ + sp.SE3 + ] = None # TODO: Should be initialized by implementations of this class # Maintain a list of all poses where qr code is detected (w.r.t deviceWorld) - self.marker_positions_list = ( - [] - ) # type: List[np.ndarray] # List of position as np.ndarray (x, y, z) - self.marker_quaternion_list = ( - [] - ) # type: List[np.ndarray] # List of quaternions as np.ndarray (x, y, z, w) - self.avg_deviceWorld_T_marker = None # type: Optional[sp.SE3] + self.marker_positions_list: List[ + np.ndarray + ] = [] # List of position as np.ndarray (x, y, z) + self.marker_quaternion_list: List[ + np.ndarray + ] = [] # List of quaternions as np.ndarray (x, y, z, w) + self.avg_deviceWorld_T_marker: Optional[sp.SE3] = None def connect(self): """ @@ -155,64 +166,65 @@ def initialize_object_detector( raise NotImplementedError def initialize_human_motion_detector(self, outputs: dict = {}): - """ - """ + """ """ raise NotImplementedError ##### ROS Publishers ##### - def publish_human_pose(self, data_frame: DataFrame): - """ - Publishes current pose of Device as a pose of interest for Spot - - Args: - data_frame (DataFrame): DataFrame object containing data packet with rgb, depth - and all necessary transformations - - Publishes: - - /human_pose_publisher: ROS PoseStamped message for quest3World_T_rgb # TODO: Update this from rgb to VIZ frame - """ - # Publish as pose for detail - pose_stamped = PoseStamped() - pose_stamped.header.stamp = rospy.Time().now() - pose_stamped.header.seq = data_frame._frame_number - pose_stamped.header.frame_id = rf.QUEST3_CAMERA - pose_stamped.pose = data_frame._deviceWorld_T_camera_rgb # TODO: Use a VIZ frame which is forward facing - self.human_pose_publisher.publish(pose_stamped) - - def publish_marker(self, data_frame: DataFrame, marker_scale: float=0.1): - """ - Publishes marker at pose of interest - - Args: - data_frame (DataFrame): DataFrame object containing data packet with rgb, depth - and all necessary transformations - - Publishes: - - /pose_of_interest_marker_publisher: ROS PoseStamped message for quest3World_T_rgb # TODO: Update this from rgb to VIZ frame - """ - # Publish as marker for interpretability - marker = Marker() - marker.header.stamp = rospy.Time().now() - marker.header.frame_id = rf.rf.QUEST3_CAMERA - marker.id = 
self._frame_number - - marker.type = Marker.SPHERE - marker.action = Marker.ADD - - marker.pose = data_frame._deviceWorld_T_camera_rgb - - marker.scale.x = marker_scale - marker.scale.y = marker_scale - marker.scale.z = marker_scale - - marker.color.a = 1.0 - marker.color.r = 0.45 - marker.color.g = 0.95 - marker.color.b = 0.2 - - self.pose_of_interest_marker_publisher.publish(marker) - - def publish_human_activity_history(self, human_history: List[Tuple[float,str]]): + # def publish_human_pose(self, data_frame: DataFrame): + # """ + # Publishes current pose of Device as a pose of interest for Spot + + # Args: + # data_frame (DataFrame): DataFrame object containing data packet with rgb, depth + # and all necessary transformations + + # Publishes: + # - /human_pose_publisher: ROS PoseStamped message for quest3World_T_rgb # TODO: Update this from rgb to VIZ frame + # """ + # # Publish as pose for detail + # pose_stamped = PoseStamped() + # pose_stamped.header.stamp = rospy.Time().now() + # pose_stamped.header.seq = data_frame._frame_number + # pose_stamped.header.frame_id = rf.QUEST3_CAMERA + # pose_stamped.pose = ( + # data_frame._deviceWorld_T_camera_rgb + # ) # TODO: Use a VIZ frame which is forward facing + # self.human_pose_publisher.publish(pose_stamped) + + # def publish_marker(self, data_frame: DataFrame, marker_scale: float = 0.1): + # """ + # Publishes marker at pose of interest + + # Args: + # data_frame (DataFrame): DataFrame object containing data packet with rgb, depth + # and all necessary transformations + + # Publishes: + # - /pose_of_interest_marker_publisher: ROS PoseStamped message for quest3World_T_rgb # TODO: Update this from rgb to VIZ frame + # """ + # # Publish as marker for interpretability + # marker = Marker() + # marker.header.stamp = rospy.Time().now() + # marker.header.frame_id = rf.rf.QUEST3_CAMERA + # marker.id = self._frame_number + + # marker.type = Marker.SPHERE + # marker.action = Marker.ADD + + # marker.pose = data_frame._deviceWorld_T_camera_rgb + + # marker.scale.x = marker_scale + # marker.scale.y = marker_scale + # marker.scale.z = marker_scale + + # marker.color.a = 1.0 + # marker.color.r = 0.45 + # marker.color.g = 0.95 + # marker.color.b = 0.2 + + # self.pose_of_interest_marker_publisher.publish(marker) + + def publish_human_activity_history(self, human_history: List[Tuple[float, str]]): """ Publishes human activity history as a string @@ -227,7 +239,9 @@ def publish_human_activity_history(self, human_history: List[Tuple[float,str]]): # Iterate over human history and create a string human_history_str = "" for timestamp, activity in human_history: - human_history_str += f"{timestamp},{activity}|" # str = "time1,Standing|time2,Walking" + human_history_str += ( + f"{timestamp},{activity}|" # str = "time1,Standing|time2,Walking" + ) # Publish human activity history string self.human_activity_history_pub.publish(human_history_str) diff --git a/aria_data_loaders/aria_data_utils/quest3_data_streamer.py b/aria_data_loaders/aria_data_utils/quest3_data_streamer.py index 37b90722..ae7d1a0b 100644 --- a/aria_data_loaders/aria_data_utils/quest3_data_streamer.py +++ b/aria_data_loaders/aria_data_utils/quest3_data_streamer.py @@ -19,6 +19,7 @@ import logging import time +import traceback from typing import Any, Dict, List, Optional, Tuple import click @@ -26,7 +27,12 @@ import matplotlib.pyplot as plt import numpy as np import rospy -import sophus as sp + +try: + import sophuspy as sp +except Exception as e: + print(f"Cannot import sophuspy due to {e}. 
Import sophus instead") + import sophus as sp from perception_and_utils.utils.conversions import ( sophus_SE3_to_ros_PoseStamped, sophus_SE3_to_ros_TransformStamped, @@ -47,7 +53,10 @@ def __init__(self, verbose: bool = False) -> None: handler.setFormatter(formatter) self.logger.addHandler(handler) - self.frame_rate_counter = FrameRateCounter() + self.frc_all = FrameRateCounter() + self.frc_qrd = FrameRateCounter() + self.frc_hmd = FrameRateCounter() + self.frc_od = FrameRateCounter() self.unified_quest3_camera = UnifiedQuestCamera() self.rgb_cam_params: CameraParams = None @@ -60,14 +69,14 @@ def connect(self): Connect to Device """ self._is_connected = True - self.logger("Quest3DataStreamer :: Connected") + self.logger.info("Quest3DataStreamer :: Connected") def disconnect(self): """ Disconnects from Device cleanly """ self._is_connected = False - self.logger("Quest3DataStreamer :: Disconnected") + self.logger.info("Quest3DataStreamer :: Disconnected") def is_connected(self) -> bool: return ( @@ -167,10 +176,11 @@ def get_latest_data_frame(self) -> Optional[DataFrame]: data_frame._aligned_depth_frame = None # TODO: Add this logic # Update frame rate counter - self.frame_rate_counter.update() + # self.frame_rate_counter.update() data_frame._avg_rgb_fps = self.unified_quest3_camera.get_avg_fps_rgb() data_frame._avg_depth_fps = self.unified_quest3_camera.get_avg_fps_depth() - data_frame._avg_data_frame_fps = self.frame_rate_counter.avg_value() + # data_frame._avg_data_frame_fps = self.frame_rate_counter.avg_value() + data_frame._avg_data_frame_fps = 0.0 return data_frame @@ -183,15 +193,32 @@ def process_frame( detect_human_motion: bool = False, object_label="milk bottle", ) -> Tuple[np.ndarray, dict]: + rate_all = 0.0 + rate_qrd = 0.0 + rate_hmd = 0.0 + rate_od = 0.0 + + self.frc_all.start() + # Initialize object_scores to empty dict for current image frame # object_scores = {} # type: Dict[str, Any] # Get rgb image from frame viz_img = data_frame._rgb_frame if viz_img is None: - rospy.logwarn("No image found in frame") + rospy.logwarn("No image found in frame") # This gets over-written.. WASTE! 
+ + if detect_objects: + self.frc_od.start() + viz_img, object_scores = self.object_detector.process_frame(data_frame) + # if object_label in object_scores.keys(): + # if self.verbose: + # plt.imsave(f"frame_{self.frame_number}.jpg", viz_img) + # self.publish_pose_of_interest(frame.get("ros_pose")) + rate_od = self.frc_od.stop() if detect_qr: + self.frc_qrd.start() (viz_img, camera_T_marker) = self.april_tag_detector.process_frame(frame=data_frame) # type: ignore deviceWorld_T_camera = data_frame._deviceWorld_T_camera_rgb @@ -213,9 +240,9 @@ def process_frame( self.marker_quaternion_list, self.avg_deviceWorld_T_marker, ) = get_running_avg_a_T_b( - current_avg_a_T_b=self.avg_deviceWorld_T_marker, - a_T_b_position_list=self.marker_positions_list, - a_T_b_quaternion_list=self.marker_quaternion_list, + current_avg_a_T_b=self.avg_deviceWorld_T_marker, # type: ignore + a_T_b_position_list=self.marker_positions_list, # type: ignore + a_T_b_quaternion_list=self.marker_quaternion_list, # type: ignore a_T_intermediate=deviceWorld_T_camera, intermediate_T_b=camera_T_marker, ) @@ -240,7 +267,10 @@ def process_frame( viz_img=viz_img, ) + rate_qrd = self.frc_qrd.stop() + if detect_human_motion: + self.frc_hmd.start() activity_str, avg_velocity = self.human_motion_detector.process_frame( frame=data_frame ) @@ -253,14 +283,24 @@ def process_frame( self.publish_human_activity_history( self.human_motion_detector.get_human_motion_history() ) + rate_hmd = self.frc_hmd.stop() if detect_objects: + self.frc_od.start() viz_img, object_scores = self.object_detector.process_frame(viz_img) if object_label in object_scores.keys(): if self.verbose: plt.imsave(f"frame_{self.frame_number}.jpg", viz_img) # self.publish_pose_of_interest(frame.get("ros_pose")) + rate_od = self.frc_od.stop() + + rate_all = self.frc_all.stop() + + outputs["process_all"] = rate_all + outputs["process_qr"] = rate_qrd + outputs["process_od"] = rate_od + outputs["process_hmd"] = rate_hmd return viz_img, outputs def initialize_april_tag_detector(self, outputs: dict = {}): @@ -345,11 +385,20 @@ def main(do_update_iptables: bool, debug: bool, hz: int): rospy.init_node("quest3_data_streamer", log_level=_log_level) rospy.loginfo("Starting quest3_data_streamer node") + # Logger for profiling + loggerp = logging.getLogger("Profiler-Quest3ProcessFrame") + loggerp.setLevel(logging.INFO) + csv_handlerp = logging.FileHandler("quest3_time_profiles_sec.txt") + formatterp = logging.Formatter("%(asctime)s, %(message)s") + csv_handlerp.setFormatter(formatterp) + loggerp.addHandler(csv_handlerp) + if do_update_iptables: update_iptables_quest3() else: rospy.logwarn("Not updating iptables") + outer_frc = FrameRateCounter() # rate = rospy.Rate(hz) outputs: Dict[str, Any] = {} data_streamer = None @@ -364,6 +413,7 @@ def main(do_update_iptables: bool, debug: bool, hz: int): data_streamer.initialize_human_motion_detector() # data_streamer.connect() while not rospy.is_shutdown(): + outer_frc.start() data_frame = data_streamer.get_latest_data_frame() if data_frame is not None: if debug: @@ -380,9 +430,15 @@ def main(do_update_iptables: bool, debug: bool, hz: int): cv2.waitKey(1) else: print("No data frame received.") - except Exception as e: + + outer_rate = outer_frc.stop() + msg_str = f"Fetch+AllDetection:{outer_rate},All_detectors={outputs.get('process_all',0.0)},QR={outputs.get('process_qr',0.0)},OD={outputs.get('process_od',0.0)},HMD={outputs.get('process_hmd',0.0)}" + loggerp.info(msg_str) + print(msg_str) + + except Exception: print("Ending script.") - 
rospy.logwarn(f"Exception: {e}") + rospy.logwarn(f"Exception: {traceback.format_exc()}") if data_streamer is not None: data_streamer.disconnect() diff --git a/aria_data_loaders/aria_data_utils/unified_quest_camera_interface.py b/aria_data_loaders/aria_data_utils/unified_quest_camera_interface.py index a6108b00..503f41a1 100644 --- a/aria_data_loaders/aria_data_utils/unified_quest_camera_interface.py +++ b/aria_data_loaders/aria_data_utils/unified_quest_camera_interface.py @@ -3,7 +3,12 @@ # LICENSE file in the root directory of this source tree. from typing import Optional, Tuple -import sophus as sp + +try: + import sophuspy as sp +except Exception as e: + print(f"Cannot import sophuspy due to {e}. Import sophus instead") + import sophus as sp import numpy as np """ @@ -51,9 +56,9 @@ def get_device_T_depthCamera(self) -> Optional[sp.SE3]: def get_avg_fps_rgb(self): raise NotImplementedError - + def get_avg_fps_depth(self): raise NotImplementedError - def is_connected(): + def is_connected(self): raise NotImplementedError diff --git a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py index 3a83a9a9..6a46803c 100644 --- a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py +++ b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/human_motion_detector.py @@ -5,19 +5,25 @@ import logging import os -from typing import Any, Dict, Tuple +from typing import Any, Dict, List, Tuple import numpy as np -import sophus as sp + +try: + import sophuspy as sp +except Exception as e: + print(f"Cannot import sophuspy due to {e}. Import sophus instead") + import sophus as sp + from perception_and_utils.perception.detector_wrappers.generic_detector_interface import ( GenericDetector, ) -from perception_and_utils.utils.image_utils import label_img, decorate_img_with_fps from perception_and_utils.utils.data_frame import DataFrame +from perception_and_utils.utils.image_utils import decorate_img_with_fps, label_img + class HumanMotionDetector(GenericDetector): - """ - """ + """ """ def __init__(self): super().__init__() @@ -26,8 +32,9 @@ def __init__(self): def _init_human_motion_detector( self, # velocity_threshold=1.2, - velocity_threshold=0.4, # hacking this - time_horizon_ns=1000000000): + velocity_threshold=0.4, # hacking this + time_horizon_ns=1000000000, + ): """ Initialize the human motion detector @@ -39,16 +46,15 @@ def _init_human_motion_detector( self.enable_detector() self._velocity_threshold = velocity_threshold - self._time_horizon_ns = time_horizon_ns # 1 second - self._previous_frames = [] # type: List[Tuple[np.ndarray, float]] - self._len_prev_frame_cache = 100 # Keep this value larger than fps - + self._time_horizon_ns = time_horizon_ns # 1 second + self._previous_frames = [] # type: List[Tuple[np.ndarray, float]] + self._len_prev_frame_cache = 100 # Keep this value larger than fps - self._human_motion_history = [] # type: List[Tuple[float, str]] + self._human_motion_history = [] # type: List[Tuple[float, str]] def process_frame( self, - frame: DataFrame, # new data frame + frame: DataFrame, # new data frame ) -> Tuple[str, float]: # Do nothing if detector is not enabled if self.is_enabled is False: @@ -64,8 +70,8 @@ def process_frame( self._previous_frames.append((current_position, frame._timestamp_s)) # If the cache is just 1 element or less, return "Standing" - 
lookback_index=10 - if len(self._previous_frames) < lookback_index+1: + lookback_index = 10 + if len(self._previous_frames) < lookback_index + 1: return "Standing", None # If the cache is full, remove the oldest frame @@ -73,9 +79,15 @@ def process_frame( self._previous_frames.pop(0) # Calculate the Euclidean distance between now & then - print(f"Current position: {current_position} | Previous position: {self._previous_frames[lookback_index][0]}") - distance = np.linalg.norm(current_position - self._previous_frames[lookback_index][0]) - time = frame._timestamp_s - self._previous_frames[lookback_index][1] # as we are considering x frames per second so our window is 1 sec + print( + f"Current position: {current_position} | Previous position: {self._previous_frames[lookback_index][0]}" + ) + distance = np.linalg.norm( + current_position - self._previous_frames[lookback_index][0] + ) + time = ( + frame._timestamp_s - self._previous_frames[lookback_index][1] + ) # as we are considering x frames per second so our window is 1 sec print(f"Distance: {distance} m | Time: {time} sec") avg_velocity = distance / time @@ -93,8 +105,7 @@ def get_outputs( img_frame: np.ndarray, outputs: Dict, ) -> Tuple[np.ndarray, Dict]: - """ - """ + """ """ viz_img = img_frame.copy() # Decorate image with text for visualization label_img( @@ -107,12 +118,15 @@ def get_outputs( text=f"Velocity: {outputs['velocity']}", org=(50, 250), ) - decorate_img_with_fps(viz_img, outputs["data_frame_fps"],pos=(50, 400)) + decorate_img_with_fps(viz_img, outputs["data_frame_fps"], pos=(50, 400)) return viz_img, outputs def update_human_motion_history(self, state, timestamp): # If current state of human is different from last state in history list, only then update history - if len(self._human_motion_history) == 0 or self._human_motion_history[-1][1] != state: + if ( + len(self._human_motion_history) == 0 + or self._human_motion_history[-1][1] != state + ): self._human_motion_history.append((timestamp, state)) def get_human_motion_history(self): diff --git a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/object_detector.py b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/object_detector.py index 27540a81..b5ebf0e0 100644 --- a/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/object_detector.py +++ b/perception_and_utils_root/perception_and_utils/perception/detector_wrappers/object_detector.py @@ -10,11 +10,11 @@ from perception_and_utils.perception.detector_wrappers.generic_detector_interface import ( GenericDetector, ) +from perception_and_utils.utils.data_frame import DataFrame from perception_and_utils.utils.image_utils import ( centered_object_detection_heuristic, check_bbox_intersection, ) -from perception_and_utils.utils.data_frame import DataFrame from spot_rl.models.owlvit import OwlVit @@ -226,7 +226,9 @@ def process_frame(self, frame: DataFrame) -> Tuple[np.ndarray, Dict[str, Any]]: ) return updated_img_frame, object_scores - def process_frame_offline(self, frame: DataFrame) -> Tuple[np.ndarray, Dict[str, Any]]: + def process_frame_offline( + self, frame: DataFrame + ) -> Tuple[np.ndarray, Dict[str, Any]]: """ Process image frame to detect object instances and score them based on heuristics @@ -239,6 +241,8 @@ def process_frame_offline(self, frame: DataFrame) -> Tuple[np.ndarray, Dict[str, updated_img_frame (np.ndarray) : Image frame with detections and text for visualization object_scores (Dict[str, float]) : Dictionary of scores for each 
object in the image frame """ + img_frame = frame._rgb_frame + # Do nothing if detector is not enabled if self.is_enabled is False: self._logger.warning( diff --git a/perception_and_utils_root/perception_and_utils/utils/data_frame.py b/perception_and_utils_root/perception_and_utils/utils/data_frame.py index 116bd16a..54b06dea 100644 --- a/perception_and_utils_root/perception_and_utils/utils/data_frame.py +++ b/perception_and_utils_root/perception_and_utils/utils/data_frame.py @@ -1,21 +1,30 @@ +from typing import Optional + import numpy as np -import sophus as sp + +try: + import sophuspy as sp +except Exception as e: + print(f"Cannot import sophuspy due to {e}. Import sophus instead") + import sophus as sp + class DataFrame: """ DataFrame class to store instantaneous data from Human Sensor stream. It stores the image, timestamp and frame number of each received image """ + def __init__(self): self._frame_number = -1 self._timestamp_s = -1 - self._avg_rgb_fps = None # type: Optional[float] - self._avg_depth_fps = None # type: Optional[float] - self._avg_data_frame_fps = None # type: Optional[float] - self._rgb_frame = None # type: Optional[np.ndarray] - self._depth_frame = None # type: Optional[np.ndarray] - self._aligned_depth_frame = None # type: Optional[np.ndarray] - self._deviceWorld_T_camera_rgb = None # type: Optional[sp.SE3] - self._deviceWorld_T_camera_depth = None # type: Optional[sp.SE3] - self._device_T_camera_rgb = None # type: Optional[sp.SE3] - self._device_T_camera_depth = None # type: Optional[sp.SE3] + self._avg_rgb_fps = None # type: Optional[float] + self._avg_depth_fps = None # type: Optional[float] + self._avg_data_frame_fps = None # type: Optional[float] + self._rgb_frame = None # type: Optional[np.ndarray] + self._depth_frame = None # type: Optional[np.ndarray] + self._aligned_depth_frame = None # type: Optional[np.ndarray] + self._deviceWorld_T_camera_rgb = None # type: Optional[sp.SE3] + self._deviceWorld_T_camera_depth = None # type: Optional[sp.SE3] + self._device_T_camera_rgb = None # type: Optional[sp.SE3] + self._device_T_camera_depth = None # type: Optional[sp.SE3]
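Usage note for the DataFrame container defined in this last hunk: it is a plain data holder that the detector wrappers read from (for example, process_frame_offline() above only needs _rgb_frame, while the QR path in the streamer also reads _deviceWorld_T_camera_rgb). The sketch below fills in those fields by hand; the zero image, the identity poses, and the commented-out detector call are placeholders, not part of this patch.

import time

import numpy as np

try:
    import sophuspy as sp
except Exception as e:  # same fallback used throughout this patch
    print(f"Cannot import sophuspy due to {e}. Import sophus instead")
    import sophus as sp

from perception_and_utils.utils.data_frame import DataFrame

frame = DataFrame()
frame._frame_number = 0
frame._timestamp_s = time.time()
frame._rgb_frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in RGB image
frame._deviceWorld_T_camera_rgb = sp.SE3()  # identity pose as a placeholder
frame._device_T_camera_rgb = sp.SE3()

# A detector wrapper constructed and enabled as in Quest3DataStreamer could then
# consume it, e.g.:
# viz_img, object_scores = object_detector.process_frame_offline(frame)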