diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 97b7f733..49e970f9 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -2630,8 +2630,39 @@ def countgd_sam2_video_tracking( frames: List[np.ndarray], chunk_length: Optional[int] = 10, ) -> List[List[Dict[str, Any]]]: - """`countgd_sam2_video_tracking` it is only a test method""" + """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text + prompt such as category names or referring expressions. The categories in the text + prompt are separated by commas. It returns a list of bounding boxes, label names, + mask file names and associated probability scores of 1.0. + + Parameters: + prompt (str): The prompt to ground to the video frames. + frames (List[np.ndarray]): The list of video frames to ground the prompt to. + + Returns: + List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score, label, + bounding box, and mask of the detected objects with normalized coordinates + (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left + and xmax and ymax are the coordinates of the bottom-right of the bounding box. + The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates + the background. + Example + ------- + >>> countgd_sam2_video_tracking("car, dinosaur", frames) + [[ + { + 'score': 1.0, + 'label': 'dinosaur', + 'bbox': [0.1, 0.11, 0.35, 0.4], + 'mask': array([[0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0], + ..., + [0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), + }, + ]] + """ results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames) if chunk_length is None: