diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 51b5c54c9..6b994f26b 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -26,23 +26,25 @@ class speech_model_params:
             Must be a valid Whisper model name.
         device (str, optional): Device to run the model on. Defaults to "cuda" if available, otherwise "cpu".
         start_timeout (float): Max number of seconds of silence when starting listening before stopping. Defaults to 5.0.
-        end_timeout (Optional[float]): Max number of seconds of silence after starting listening before stopping. Defaults to None.
+        phrase_duration (Optional[float]): Max number of seconds of the phrase. Defaults to 10 seconds.
         sample_rate (int): Sample rate of the microphone. Defaults to 16000Hz.
         mic_device (Optional[str]): Microphone device index or name. Defaults to None.
         timer_duration (Optional[int]): Duration of the timer for adjusting the microphone for ambient noise. Defaults to 20 seconds.
         warmup (bool): Whether to warmup the model by running inference on a test file. Defaults to True.
         energy_threshold (Optional[int]): Energy threshold for silence detection. Using this disables automatic adjustment. Defaults to None.
+        pause_threshold (Optional[float]): Seconds of non-speaking audio before a phrase is considered complete. Defaults to 0.8 seconds.
     """

     model_name: str = "medium.en"
     device: str = "cuda" if torch.cuda.is_available() else "cpu"
     start_timeout: float = 5.0
-    end_timeout: Optional[float] = None
+    phrase_duration: Optional[float] = 10
     sample_rate: int = 16000
     mic_device: Optional[str] = None
     timer_duration: Optional[int] = 20
     warmup: bool = True
-    energy_threshold: Optional[int] = None
+    energy_threshold: Optional[int] = 600
+    pause_threshold: Optional[float] = 0.8


 class TranscribeSpeechAction(object):
@@ -122,6 +124,9 @@ class TranscribeSpeechAction(object):
         self._listening = True

         recogniser = sr.Recognizer()
+        if self._model_params.pause_threshold:
+            recogniser.pause_threshold = self._model_params.pause_threshold
+
         if self._model_params.energy_threshold:
             recogniser.dynamic_energy_threshold = False
             recogniser.energy_threshold = self._model_params.energy_threshold
@@ -160,7 +165,7 @@ class TranscribeSpeechAction(object):
             wav_data = self.recogniser.listen(
                 src,
                 timeout=self._model_params.start_timeout,
-                phrase_time_limit=self._model_params.end_timeout,
+                phrase_time_limit=self._model_params.phrase_duration,
             ).get_wav_data()
         # Magic number 32768.0 is the maximum value of a 16-bit signed integer
         float_data = (
@@ -233,10 +238,10 @@ def parse_args() -> dict:
         help="Timeout for listening for the start of a phrase.",
     )
     parser.add_argument(
-        "--end_timeout",
+        "--phrase_duration",
         type=float,
-        default=None,
-        help="Timeout for listening for the end of a phrase.",
+        default=10,
+        help="Maximum phrase duration after starting listening in seconds.",
     )
     parser.add_argument(
         "--sample_rate",
         type=int,
@@ -259,10 +264,17 @@
     parser.add_argument(
         "--energy_threshold",
         type=int,
-        default=None,
+        default=600,
         help="Energy threshold for silence detection. "
         "Using this disables automatic adjustment",
     )
+    parser.add_argument(
+        "--pause_threshold",
+        type=float,
+        default=2.0,
+        help="Seconds of non-speaking audio before a phrase is considered complete.",
+    )
+
     args, unknown = parser.parse_known_args()

     return vars(args)
@@ -284,14 +296,18 @@ def configure_model_params(config: dict) -> speech_model_params:
         model_params.device = config["device"]
     if config["start_timeout"]:
         model_params.start_timeout = config["start_timeout"]
-    if config["end_timeout"]:
-        model_params.end_timeout = config["end_timeout"]
+    if config["phrase_duration"]:
+        model_params.phrase_duration = config["phrase_duration"]
     if config["sample_rate"]:
         model_params.sample_rate = config["sample_rate"]
     if config["mic_device"]:
         model_params.mic_device = config["mic_device"]
     if config["no_warmup"]:
         model_params.warmup = False
+    if config["energy_threshold"]:
+        model_params.energy_threshold = config["energy_threshold"]
+    if config["pause_threshold"]:
+        model_params.pause_threshold = config["pause_threshold"]

     return model_params
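For context, the renamed and newly exposed options map directly onto settings of the `speech_recognition` package the node already imports as `sr`. A minimal sketch outside ROS, using the new defaults above (variable names and values here are illustrative only, not the node's actual wiring):

```python
import speech_recognition as sr

recogniser = sr.Recognizer()

# pause_threshold: seconds of trailing silence that mark the end of a phrase.
recogniser.pause_threshold = 0.8

# Fixing energy_threshold disables the automatic (dynamic) adjustment,
# mirroring what the node does when the parameter is set.
recogniser.dynamic_energy_threshold = False
recogniser.energy_threshold = 600

with sr.Microphone(sample_rate=16000) as src:
    # timeout corresponds to start_timeout, phrase_time_limit to phrase_duration.
    audio = recogniser.listen(src, timeout=5.0, phrase_time_limit=10)

wav_data = audio.get_wav_data()
```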
diff --git a/skills/src/lasr_skills/describe_people.py b/skills/src/lasr_skills/describe_people.py
index 8b09ed186..770fdf582 100755
--- a/skills/src/lasr_skills/describe_people.py
+++ b/skills/src/lasr_skills/describe_people.py
@@ -39,7 +39,10 @@ def __init__(self):
         smach.StateMachine.add(
             "GET_IMAGE",
             GetCroppedImage(
-                object_name="person", crop_method="closest", use_mask=True
+                object_name="person",
+                crop_method=crop_method,
+                rgb_topic=rgb_topic,
+                use_mask=True,
             ),
             transitions={
                 "succeeded": "CONVERT_IMAGE",
@@ -232,9 +235,14 @@ def execute(self, userdata):
                     rospy.logdebug(f"|> Person does not have {part} visible")
                     continue

-                if part.name == "torso_front" or part.name == "torso_back":
+                if (
+                    part.part_name == "torso_front"
+                    or part.part_name == "torso_back"
+                ):
                     torso_mask = np.logical_or(torso_mask, part_mask)
-                elif part.name == "left_face" or part.name == "right_face":
+                elif (
+                    part.part_name == "left_face" or part.part_name == "right_face"
+                ):
                     head_mask = np.logical_or(head_mask, part_mask)

             torso_mask_data, torso_mask_shape, torso_mask_dtype = numpy2message(
diff --git a/skills/src/lasr_skills/look_at_person.py b/skills/src/lasr_skills/look_at_person.py
index e41bcef48..fb9675c80 100755
--- a/skills/src/lasr_skills/look_at_person.py
+++ b/skills/src/lasr_skills/look_at_person.py
@@ -2,8 +2,10 @@
 from geometry_msgs.msg import PointStamped
 import smach
 from .vision import GetPointCloud
-from lasr_vision_msgs.srv import BodyPixDetection, BodyPixDetectionRequest
-from lasr_vision_msgs.msg import BodyPixMaskRequest
+from lasr_vision_msgs.srv import (
+    BodyPixKeypointDetection,
+    BodyPixKeypointDetectionRequest,
+)
 from lasr_skills import LookToPoint, DetectFaces
 import cv2
 import rospy
@@ -155,38 +157,39 @@ def execute(self, userdata):

         return "failed"

-    @smach.cb_interface(input_keys=["poses", "detections"], output_keys=["bbox_eyes"])
+    @smach.cb_interface(
+        input_keys=["keypoints", "detections"], output_keys=["bbox_eyes"]
+    )
     def match_poses_and_detections(ud):
         bbox_eyes = []
-        for pose in ud.poses:
+        for keypoint in ud.keypoints:
             for detection in ud.detections.detections:
                 temp = {
                     "bbox": detection.xywh,
                 }
-                for keypoint in pose.keypoints:
-                    if (
-                        keypoint.part == "leftEye"
-                        and detection.xywh[0]
-                        < keypoint.xy[0]
-                        < detection.xywh[0] + detection.xywh[2]
-                        and detection.xywh[1]
-                        < keypoint.xy[1]
-                        < detection.xywh[1] + detection.xywh[3]
-                    ):
-                        temp["left_eye"] = keypoint.xy
-                    if (
-                        keypoint.part == "rightEye"
-                        and detection.xywh[0]
-                        < keypoint.xy[0]
-                        < detection.xywh[0] + detection.xywh[2]
-                        and detection.xywh[1]
-                        < keypoint.xy[1]
-                        < detection.xywh[1] + detection.xywh[3]
-                    ):
-                        temp["right_eye"] = keypoint.xy
+                if (
+                    keypoint.keypoint_name == "leftEye"
+                    and detection.xywh[0]
+                    < keypoint.x
+                    < detection.xywh[0] + detection.xywh[2]
+                    and detection.xywh[1]
+                    < keypoint.y
+                    < detection.xywh[1] + detection.xywh[3]
+                ):
+                    temp["left_eye"] = keypoint.x, keypoint.y
+                if (
+                    keypoint.keypoint_name == "rightEye"
+                    and detection.xywh[0]
+                    < keypoint.x
+                    < detection.xywh[0] + detection.xywh[2]
+                    and detection.xywh[1]
+                    < keypoint.y
+                    < detection.xywh[1] + detection.xywh[3]
+                ):
+                    temp["right_eye"] = keypoint.x, keypoint.y

-                if "left_eye" in temp and "right_eye" in temp:
-                    bbox_eyes.append(temp)
+            if "left_eye" in temp and "right_eye" in temp:
+                bbox_eyes.append(temp)

         ud.bbox_eyes = bbox_eyes
@@ -226,21 +229,17 @@ def __init__(self, filter=False):
             },
         )

-        eyes = BodyPixMaskRequest()
-        eyes.parts = ["left_eye", "right_eye"]
-        masks = [eyes]
-
         smach.StateMachine.add(
             "SEGMENT_PERSON",
             smach_ros.ServiceState(
-                "/bodypix/detect",
-                BodyPixDetection,
-                request_cb=lambda ud, _: BodyPixDetectionRequest(
-                    pcl_to_img_msg(ud.pcl_msg), "resnet50", 0.7, masks
+                "/bodypix/keypoint_detection",
+                BodyPixKeypointDetection,
+                request_cb=lambda ud, _: BodyPixKeypointDetectionRequest(
+                    pcl_to_img_msg(ud.pcl_msg), "resnet50", 0.2
                 ),
-                response_slots=["masks", "poses"],
+                response_slots=["keypoints"],
                 input_keys=["pcl_msg"],
-                output_keys=["masks", "poses"],
+                output_keys=["keypoints"],
             ),
             transitions={
                 "succeeded": "DETECT_FACES",
@@ -262,8 +261,8 @@ def __init__(self, filter=False):
             "MATCH_POSES_AND_DETECTIONS",
             CBState(
                 self.match_poses_and_detections,
-                input_keys=["poses", "detections"],
-                output_keys=["poses"],
+                input_keys=["keypoints", "detections"],
+                output_keys=["keypoints"],
                 outcomes=["succeeded", "failed"],
             ),
             transitions={"succeeded": "CHECK_EYES", "failed": "failed"},
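The rewritten callback applies the same open-interval containment test to each eye keypoint, just reading flat `keypoint.x`/`keypoint.y` fields instead of per-pose keypoint lists. A standalone sketch of that check, assuming `xywh` is `[x, y, width, height]` as the comparisons above suggest:

```python
from typing import List


def eye_in_bbox(x: float, y: float, xywh: List[float]) -> bool:
    """Return True if the keypoint (x, y) lies strictly inside the box."""
    return (
        xywh[0] < x < xywh[0] + xywh[2]
        and xywh[1] < y < xywh[1] + xywh[3]
    )


# Example: a keypoint at (320, 180) against a 200x300 box whose corner is (250, 100).
assert eye_in_bbox(320, 180, [250, 100, 200, 300])
assert not eye_in_bbox(100, 180, [250, 100, 200, 300])
```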
diff --git a/tasks/carry_my_luggage/launch/setup.launch b/tasks/carry_my_luggage/launch/setup.launch
index d4b3a2775..a75310f9d 100644
--- a/tasks/carry_my_luggage/launch/setup.launch
+++ b/tasks/carry_my_luggage/launch/setup.launch
@@ -15,5 +15,5 @@
-
+
diff --git a/tasks/receptionist/launch/setup.launch b/tasks/receptionist/launch/setup.launch
index 530dda7d7..bbcf45192 100644
--- a/tasks/receptionist/launch/setup.launch
+++ b/tasks/receptionist/launch/setup.launch
@@ -4,7 +4,7 @@
-
+
@@ -21,7 +21,7 @@
-
+
diff --git a/tasks/receptionist/scripts/main.py b/tasks/receptionist/scripts/main.py
index 4c4cbeeea..f93e2bc24 100755
--- a/tasks/receptionist/scripts/main.py
+++ b/tasks/receptionist/scripts/main.py
@@ -10,25 +10,21 @@
 if __name__ == "__main__":
     rospy.init_node("receptionist_robocup")
     wait_pose_param = rospy.get_param("/receptionist/wait_pose")
-    # wait_pose_param = rospy.get_param("/wait_pose/")

     wait_pose = Pose(
         position=Point(**wait_pose_param["position"]),
         orientation=Quaternion(**wait_pose_param["orientation"]),
     )

-    # wait_area_param = rospy.get_param("/wait_area")
     wait_area_param = rospy.get_param("/receptionist/wait_area")
     wait_area = Polygon(wait_area_param)

-    # seat_pose_param = rospy.get_param("/seat_pose")
     seat_pose_param = rospy.get_param("/receptionist/seat_pose")
     seat_pose = Pose(
         position=Point(**seat_pose_param["position"]),
         orientation=Quaternion(**seat_pose_param["orientation"]),
     )

-    # seat_area_param = rospy.get_param("/seat_area")
     seat_area_param = rospy.get_param("/receptionist/seat_area")
     seat_area = Polygon(seat_area_param)
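The receptionist parameters are now read only from the `/receptionist` namespace. A hypothetical sketch of the dict shape those `rospy.get_param` calls are expected to return, so the `**` expansion into `Point`/`Quaternion` works (the coordinate values are placeholders, not the real task config):

```python
from geometry_msgs.msg import Point, Pose, Quaternion

# Placeholder for what rospy.get_param("/receptionist/wait_pose") should return.
wait_pose_param = {
    "position": {"x": 1.0, "y": 2.0, "z": 0.0},
    "orientation": {"x": 0.0, "y": 0.0, "z": 0.0, "w": 1.0},
}

wait_pose = Pose(
    position=Point(**wait_pose_param["position"]),
    orientation=Quaternion(**wait_pose_param["orientation"]),
)
```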
diff --git a/tasks/receptionist/src/receptionist/states/introduce.py b/tasks/receptionist/src/receptionist/states/introduce.py
index 7650737ff..7efff5482 100644
--- a/tasks/receptionist/src/receptionist/states/introduce.py
+++ b/tasks/receptionist/src/receptionist/states/introduce.py
@@ -46,7 +46,7 @@ def stringify_guest_data(guest_data: Dict[str, Any], guest_id: str) -> str:

     known_attributes = {}
     for attribute, value in relevant_guest_data["attributes"].items():
-        if value != "unknown" and value != False and value != "No_Beard":
+        if value != "unknown":
             known_attributes[attribute] = value
     print("These are the known attributes")
     print(known_attributes)
@@ -98,11 +98,26 @@ def stringify_guest_data(guest_data: Dict[str, Any], guest_id: str) -> str:
                 )
                 detection = True
             if attribute == "facial_hair":
-                guest_str += f"They have facial hair."
                 detection = True
+                if relevant_guest_data["attributes"][attribute] != "No_Beard":
+                    guest_str += f"They have facial hair."
+                else:
+                    guest_str += f"They do not have facial hair."
+            if attribute == "glasses":
+                detection = True
+                if relevant_guest_data["attributes"][attribute]:
+                    guest_str += f"They are wearing glasses."
+                else:
+                    guest_str += f"They are not wearing glasses."
+            if attribute == "hat":
+                detection = True
+                if relevant_guest_data["attributes"][attribute]:
+                    guest_str += f"They are wearing a hat."
+                else:
+                    guest_str += f"They are not wearing a hat."
             if not detection:
                 if isSingular(attribute):
-                    guest_str += f"They are a wearing {attribute}."
+                    guest_str += f"They are wearing a {attribute}."
                 else:
                     guest_str += f"They are wearing {attribute}."
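To illustrate the effect of the relaxed filter and the new explicit branches, a rough sketch with a hypothetical guest entry shaped like the dict accessed above (the attribute names and values are assumptions for the example, not real attribute-service output):

```python
# Hypothetical guest data; "facial_hair" follows the "No_Beard"/other convention
# used above, while "glasses" and "hat" are booleans.
guest_data = {
    "guest1": {
        "attributes": {"facial_hair": "No_Beard", "glasses": True, "hat": False}
    }
}

attributes = guest_data["guest1"]["attributes"]
sentences = []
sentences.append(
    "They have facial hair."
    if attributes["facial_hair"] != "No_Beard"
    else "They do not have facial hair."
)
sentences.append(
    "They are wearing glasses." if attributes["glasses"] else "They are not wearing glasses."
)
sentences.append(
    "They are wearing a hat." if attributes["hat"] else "They are not wearing a hat."
)

print(" ".join(sentences))
# They do not have facial hair. They are wearing glasses. They are not wearing a hat.
```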