diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 51b5c54c9..6b994f26b 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -26,23 +26,25 @@ class speech_model_params:
Must be a valid Whisper model name.
device (str, optional): Device to run the model on. Defaults to "cuda" if available, otherwise "cpu".
start_timeout (float): Max number of seconds of silence when starting listening before stopping. Defaults to 5.0.
- end_timeout (Optional[float]): Max number of seconds of silence after starting listening before stopping. Defaults to None.
+ phrase_duration (Optional[float]): Maximum duration of a phrase in seconds. Defaults to 10 seconds.
sample_rate (int): Sample rate of the microphone. Defaults to 16000Hz.
mic_device (Optional[str]): Microphone device index or name. Defaults to None.
timer_duration (Optional[int]): Duration of the timer for adjusting the microphone for ambient noise. Defaults to 20 seconds.
warmup (bool): Whether to warmup the model by running inference on a test file. Defaults to True.
energy_threshold (Optional[int]): Energy threshold for silence detection. Using this disables automatic adjustment. Defaults to None.
+ pause_threshold (Optional[float]): Seconds of non-speaking audio before a phrase is considered complete. Defaults to 0.8 seconds.
"""
model_name: str = "medium.en"
device: str = "cuda" if torch.cuda.is_available() else "cpu"
start_timeout: float = 5.0
- end_timeout: Optional[float] = None
+ phrase_duration: Optional[float] = 10
sample_rate: int = 16000
mic_device: Optional[str] = None
timer_duration: Optional[int] = 20
warmup: bool = True
- energy_threshold: Optional[int] = None
+ energy_threshold: Optional[int] = 600
+ pause_threshold: Optional[float] = 0.8
class TranscribeSpeechAction(object):
@@ -122,6 +124,9 @@ class TranscribeSpeechAction(object):
self._listening = True
recogniser = sr.Recognizer()
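+ # Apply the configured pause threshold so a short silence ends the current phrase.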
+ if self._model_params.pause_threshold:
+ recogniser.pause_threshold = self._model_params.pause_threshold
+
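+ # A fixed energy threshold disables the recogniser's dynamic ambient-noise adjustment.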
if self._model_params.energy_threshold:
recogniser.dynamic_energy_threshold = False
recogniser.energy_threshold = self._model_params.energy_threshold
@@ -160,7 +165,7 @@ class TranscribeSpeechAction(object):
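+ # timeout bounds the wait for speech to start; phrase_time_limit caps how long the captured phrase may run.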
wav_data = self.recogniser.listen(
src,
timeout=self._model_params.start_timeout,
- phrase_time_limit=self._model_params.end_timeout,
+ phrase_time_limit=self._model_params.phrase_duration,
).get_wav_data()
# Magic number 32768.0 is the maximum value of a 16-bit signed integer
float_data = (
@@ -233,10 +238,10 @@ def parse_args() -> dict:
help="Timeout for listening for the start of a phrase.",
)
parser.add_argument(
- "--end_timeout",
+ "--phrase_duration",
type=float,
- default=None,
- help="Timeout for listening for the end of a phrase.",
+ default=10,
+ help="Maximum phrase duration after starting listening in seconds.",
)
parser.add_argument(
"--sample_rate",
@@ -259,10 +264,17 @@ def parse_args() -> dict:
parser.add_argument(
"--energy_threshold",
type=int,
- default=None,
+ default=600,
help="Energy threshold for silence detection. Using this disables automatic adjustment",
)
+ parser.add_argument(
+ "--pause_threshold",
+ type=float,
+ default=2.0,
+ help="Seconds of non-speaking audio before a phrase is considered complete.",
+ )
+
args, unknown = parser.parse_known_args()
return vars(args)
@@ -284,14 +296,18 @@ def configure_model_params(config: dict) -> speech_model_params:
model_params.device = config["device"]
if config["start_timeout"]:
model_params.start_timeout = config["start_timeout"]
- if config["end_timeout"]:
- model_params.end_timeout = config["end_timeout"]
+ if config["phrase_duration"]:
+ model_params.phrase_duration = config["phrase_duration"]
if config["sample_rate"]:
model_params.sample_rate = config["sample_rate"]
if config["mic_device"]:
model_params.mic_device = config["mic_device"]
if config["no_warmup"]:
model_params.warmup = False
+ if config["energy_threshold"]:
+ model_params.energy_threshold = config["energy_threshold"]
+ if config["pause_threshold"]:
+ model_params.pause_threshold = config["pause_threshold"]
return model_params
diff --git a/skills/src/lasr_skills/describe_people.py b/skills/src/lasr_skills/describe_people.py
index 8b09ed186..770fdf582 100755
--- a/skills/src/lasr_skills/describe_people.py
+++ b/skills/src/lasr_skills/describe_people.py
@@ -39,7 +39,10 @@ def __init__(self):
smach.StateMachine.add(
"GET_IMAGE",
GetCroppedImage(
- object_name="person", crop_method="closest", use_mask=True
+ object_name="person",
+ crop_method=crop_method,
+ rgb_topic=rgb_topic,
+ use_mask=True,
),
transitions={
"succeeded": "CONVERT_IMAGE",
@@ -232,9 +235,14 @@ def execute(self, userdata):
rospy.logdebug(f"|> Person does not have {part} visible")
continue
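+ # Merge torso parts into the torso mask and face parts into the head mask.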
- if part.name == "torso_front" or part.name == "torso_back":
+ if (
+ part.part_name == "torso_front"
+ or part.part_name == "torso_back"
+ ):
torso_mask = np.logical_or(torso_mask, part_mask)
- elif part.name == "left_face" or part.name == "right_face":
+ elif (
+ part.part_name == "left_face" or part.part_name == "right_face"
+ ):
head_mask = np.logical_or(head_mask, part_mask)
torso_mask_data, torso_mask_shape, torso_mask_dtype = numpy2message(
diff --git a/skills/src/lasr_skills/look_at_person.py b/skills/src/lasr_skills/look_at_person.py
index e41bcef48..fb9675c80 100755
--- a/skills/src/lasr_skills/look_at_person.py
+++ b/skills/src/lasr_skills/look_at_person.py
@@ -2,8 +2,10 @@
from geometry_msgs.msg import PointStamped
import smach
from .vision import GetPointCloud
-from lasr_vision_msgs.srv import BodyPixDetection, BodyPixDetectionRequest
-from lasr_vision_msgs.msg import BodyPixMaskRequest
+from lasr_vision_msgs.srv import (
+ BodyPixKeypointDetection,
+ BodyPixKeypointDetectionRequest,
+)
from lasr_skills import LookToPoint, DetectFaces
import cv2
import rospy
@@ -155,38 +157,39 @@ def execute(self, userdata):
return "failed"
- @smach.cb_interface(input_keys=["poses", "detections"], output_keys=["bbox_eyes"])
+ @smach.cb_interface(
+ input_keys=["keypoints", "detections"], output_keys=["bbox_eyes"]
+ )
def match_poses_and_detections(ud):
bbox_eyes = []
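+ # Pair eye keypoints with the person detections whose bounding boxes contain them.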
- for pose in ud.poses:
+ for keypoint in ud.keypoints:
for detection in ud.detections.detections:
temp = {
"bbox": detection.xywh,
}
- for keypoint in pose.keypoints:
- if (
- keypoint.part == "leftEye"
- and detection.xywh[0]
- < keypoint.xy[0]
- < detection.xywh[0] + detection.xywh[2]
- and detection.xywh[1]
- < keypoint.xy[1]
- < detection.xywh[1] + detection.xywh[3]
- ):
- temp["left_eye"] = keypoint.xy
- if (
- keypoint.part == "rightEye"
- and detection.xywh[0]
- < keypoint.xy[0]
- < detection.xywh[0] + detection.xywh[2]
- and detection.xywh[1]
- < keypoint.xy[1]
- < detection.xywh[1] + detection.xywh[3]
- ):
- temp["right_eye"] = keypoint.xy
+ if (
+ keypoint.keypoint_name == "leftEye"
+ and detection.xywh[0]
+ < keypoint.x
+ < detection.xywh[0] + detection.xywh[2]
+ and detection.xywh[1]
+ < keypoint.y
+ < detection.xywh[1] + detection.xywh[3]
+ ):
+ temp["left_eye"] = keypoint.x, keypoint.y
+ if (
+ keypoint.keypoint_name == "rightEye"
+ and detection.xywh[0]
+ < keypoint.x
+ < detection.xywh[0] + detection.xywh[2]
+ and detection.xywh[1]
+ < keypoint.y
+ < detection.xywh[1] + detection.xywh[3]
+ ):
+ temp["right_eye"] = keypoint.x, keypoint.y
- if "left_eye" in temp and "right_eye" in temp:
- bbox_eyes.append(temp)
+ if "left_eye" in temp and "right_eye" in temp:
+ bbox_eyes.append(temp)
ud.bbox_eyes = bbox_eyes
@@ -226,21 +229,17 @@ def __init__(self, filter=False):
},
)
- eyes = BodyPixMaskRequest()
- eyes.parts = ["left_eye", "right_eye"]
- masks = [eyes]
-
smach.StateMachine.add(
"SEGMENT_PERSON",
smach_ros.ServiceState(
- "/bodypix/detect",
- BodyPixDetection,
- request_cb=lambda ud, _: BodyPixDetectionRequest(
- pcl_to_img_msg(ud.pcl_msg), "resnet50", 0.7, masks
+ "/bodypix/keypoint_detection",
+ BodyPixKeypointDetection,
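+ # 0.2 below is presumably the keypoint confidence threshold, taking the position the 0.7 mask confidence held in the old request.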
+ request_cb=lambda ud, _: BodyPixKeypointDetectionRequest(
+ pcl_to_img_msg(ud.pcl_msg), "resnet50", 0.2
),
- response_slots=["masks", "poses"],
+ response_slots=["keypoints"],
input_keys=["pcl_msg"],
- output_keys=["masks", "poses"],
+ output_keys=["keypoints"],
),
transitions={
"succeeded": "DETECT_FACES",
@@ -262,8 +261,8 @@ def __init__(self, filter=False):
"MATCH_POSES_AND_DETECTIONS",
CBState(
self.match_poses_and_detections,
- input_keys=["poses", "detections"],
- output_keys=["poses"],
+ input_keys=["keypoints", "detections"],
+ output_keys=["keypoints"],
outcomes=["succeeded", "failed"],
),
transitions={"succeeded": "CHECK_EYES", "failed": "failed"},
diff --git a/tasks/carry_my_luggage/launch/setup.launch b/tasks/carry_my_luggage/launch/setup.launch
index d4b3a2775..a75310f9d 100644
--- a/tasks/carry_my_luggage/launch/setup.launch
+++ b/tasks/carry_my_luggage/launch/setup.launch
@@ -15,5 +15,5 @@
-
+
diff --git a/tasks/receptionist/launch/setup.launch b/tasks/receptionist/launch/setup.launch
index 530dda7d7..bbcf45192 100644
--- a/tasks/receptionist/launch/setup.launch
+++ b/tasks/receptionist/launch/setup.launch
@@ -4,7 +4,7 @@
-
+
@@ -21,7 +21,7 @@
-
+
diff --git a/tasks/receptionist/scripts/main.py b/tasks/receptionist/scripts/main.py
index 4c4cbeeea..f93e2bc24 100755
--- a/tasks/receptionist/scripts/main.py
+++ b/tasks/receptionist/scripts/main.py
@@ -10,25 +10,21 @@
if __name__ == "__main__":
rospy.init_node("receptionist_robocup")
wait_pose_param = rospy.get_param("/receptionist/wait_pose")
- # wait_pose_param = rospy.get_param("/wait_pose/")
wait_pose = Pose(
position=Point(**wait_pose_param["position"]),
orientation=Quaternion(**wait_pose_param["orientation"]),
)
- # wait_area_param = rospy.get_param("/wait_area")
wait_area_param = rospy.get_param("/receptionist/wait_area")
wait_area = Polygon(wait_area_param)
- # seat_pose_param = rospy.get_param("/seat_pose")
seat_pose_param = rospy.get_param("/receptionist/seat_pose")
seat_pose = Pose(
position=Point(**seat_pose_param["position"]),
orientation=Quaternion(**seat_pose_param["orientation"]),
)
- # seat_area_param = rospy.get_param("/seat_area")
seat_area_param = rospy.get_param("/receptionist/seat_area")
seat_area = Polygon(seat_area_param)
diff --git a/tasks/receptionist/src/receptionist/states/introduce.py b/tasks/receptionist/src/receptionist/states/introduce.py
index 7650737ff..7efff5482 100644
--- a/tasks/receptionist/src/receptionist/states/introduce.py
+++ b/tasks/receptionist/src/receptionist/states/introduce.py
@@ -46,7 +46,7 @@ def stringify_guest_data(guest_data: Dict[str, Any], guest_id: str) -> str:
known_attributes = {}
for attribute, value in relevant_guest_data["attributes"].items():
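+ # Keep any attribute with a definite value; only "unknown" detections are skipped.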
- if value != "unknown" and value != False and value != "No_Beard":
+ if value != "unknown":
known_attributes[attribute] = value
print("These are the known attributes")
print(known_attributes)
@@ -98,11 +98,26 @@ def stringify_guest_data(guest_data: Dict[str, Any], guest_id: str) -> str:
)
detection = True
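+ # Boolean-style attributes (facial hair, glasses, hat) get an explicit positive or negative sentence.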
if attribute == "facial_hair":
- guest_str += f"They have facial hair."
detection = True
+ if relevant_guest_data["attributes"][attribute] != "No_Beard":
+ guest_str += f"They have facial hair."
+ else:
+ guest_str += f"They do not have facial hair."
+ if attribute == "glasses":
+ detection = True
+ if relevant_guest_data["attributes"][attribute]:
+ guest_str += f"They are wearing glasses."
+ else:
+ guest_str += f"They are not wearing glasses."
+ if attribute == "hat":
+ detection = True
+ if relevant_guest_data["attributes"][attribute]:
+ guest_str += f"They are wearing a hat."
+ else:
+ guest_str += f"They are not wearing a hat."
if not detection:
if isSingular(attribute):
- guest_str += f"They are a wearing {attribute}."
+ guest_str += f"They are wearing a {attribute}."
else:
guest_str += f"They are wearing {attribute}."