Minor release preparation. (#93)

* Bigger tolerance for classification scores. * Improved MultiClassificationParser. * Warnings in Keypoints and Segmentation parsers. * Examples for new models. * Adjusted tests due to tolerance change. * Add XFeat Mono and Stereo to examples. * XFeat demo instructions. * Remove warp drawing in stereo mode.
luxonis · Oct 4, 2024 · 3f3e360 · 3f3e360
1 parent 9212a21
commit 3f3e360
Show file tree

Hide file tree

Showing 15 changed files with 429 additions and 32 deletions.
diff --git a/depthai_nodes/ml/messages/creators/classification.py b/depthai_nodes/ml/messages/creators/classification.py
@@ -66,7 +66,7 @@ def create_classification_message(
             f"Scores list must contain probabilities between 0 and 1, instead got {scores}."
         )
 
-    if not np.isclose(np.sum(scores), 1.0, atol=1e-2):
+    if not np.isclose(np.sum(scores), 1.0, atol=1e-1):
         raise ValueError(f"Scores should sum to 1, got {np.sum(scores)}.")
 
     if len(scores) != len(classes):

diff --git a/depthai_nodes/ml/parsers/classification.py b/depthai_nodes/ml/parsers/classification.py
@@ -123,8 +123,8 @@ class MultiClassificationParser(dai.node.ThreadedHostNode):
 
     def __init__(
         self,
-        classification_attributes: List[str],
-        classification_labels: List[List[str]],
+        classification_attributes: List[str] = None,
+        classification_labels: List[List[str]] = None,
     ):
         """Initializes the MultipleClassificationParser node."""
         dai.node.ThreadedHostNode.__init__(self)
@@ -133,7 +133,28 @@ def __init__(
         self.classification_attributes: List[str] = classification_attributes
         self.classification_labels: List[List[str]] = classification_labels
 
+    def setClassificationAttributes(self, classification_attributes: List[str]):
+        """Sets the classification attributes for the multiple classification model.
+
+        @param classification_attributes: List of attributes to be classified.
+        @type classification_attributes: List[str]
+        """
+        self.classification_attributes = classification_attributes
+
+    def setClassificationLabels(self, classification_labels: List[List[str]]):
+        """Sets the classification labels for the multiple classification model.
+
+        @param classification_labels: List of class labels for each attribute.
+        @type classification_labels: List[List[str]]
+        """
+        self.classification_labels = classification_labels
+
     def run(self):
+        if not self.classification_attributes:
+            raise ValueError("Classification attributes must be provided.")
+        if not self.classification_labels:
+            raise ValueError("Classification labels must be provided.")
+
         while self.isRunning():
             try:
                 output: dai.NNData = self.input.get()

diff --git a/depthai_nodes/ml/parsers/keypoints.py b/depthai_nodes/ml/parsers/keypoints.py
@@ -53,6 +53,7 @@ def __init__(
 
         self.scale_factor = scale_factor
         self.n_keypoints = n_keypoints
+        self._warned = False
 
     def setScaleFactor(self, scale_factor):
         """Sets the scale factor to divide the keypoints by.
@@ -82,10 +83,11 @@ def run(self):
 
             output_layer_names = output.getAllLayerNames()
 
-            if len(output_layer_names) != 1:
-                raise ValueError(
-                    f"Expected 1 output layer, got {len(output_layer_names)}."
+            if len(output_layer_names) != 1 and not self._warned:
+                print(
+                    f"Expected 1 output layer, got {len(output_layer_names)}, will take the first one."
                 )
+                self._warned = True
 
             keypoints = output.getTensor(output_layer_names[0], dequantize=True).astype(
                 np.float32

diff --git a/depthai_nodes/ml/parsers/segmentation.py b/depthai_nodes/ml/parsers/segmentation.py
@@ -39,6 +39,7 @@ def __init__(self, background_class=False):
         self.input = self.createInput()
         self.out = self.createOutput()
         self.background_class = background_class
+        self._warned = False
 
     def setBackgroundClass(self, background_class):
         """Sets the background class.
@@ -57,10 +58,11 @@ def run(self):
 
             output_layer_names = output.getAllLayerNames()
 
-            if len(output_layer_names) != 1:
+            if len(output_layer_names) != 1 and not self._warned:
                 print(
                     f"Expected 1 output layer, got {len(output_layer_names)}. Will take the first one."
                 )
+                self._warned = True
 
             segmentation_mask = output.getTensor(output_layer_names[0], dequantize=True)
             if len(segmentation_mask.shape) == 4:

diff --git a/examples/README.md b/examples/README.md
@@ -35,3 +35,7 @@ python main.py -s yolov6-nano:coco-416x416 -fps 28
 Some models have small input sizes and requesting small image size from `Camera` is problematic so we request 4x bigger frame and resize it back down. During visualization image frame is resized back so some image quality is lost - only for visualization.
 
 The parser is obtained from NN archive along with other important parameters for the parser. So, make sure your NN archive is well-defined.
+
+### XFeat
+
+To run the XFeat demo, you have two options: `Stereo` mode or `Mono` mode, depending on the NN archive you provide. If the NN archive requires `XFeatMonoParser`, mono mode is used; if it requires `XFeatStereoParser`, stereo mode is used. Stereo mode requires an OAK camera that has both left and right cameras; if they are not available, an error is raised. In mono mode, you can set the reference frame against which all other frames are compared. The reference frame is set by pressing the `S` key.
diff --git a/examples/main.py b/examples/main.py
@@ -2,6 +2,7 @@
 from utils.arguments import initialize_argparser, parse_fps_limit, parse_model_slug
 from utils.model import get_input_shape, get_model_from_hub, get_parser
 from utils.parser import setup_parser
+from utils.xfeat import xfeat_mono, xfeat_stereo
 from visualization.visualize import visualize
 
 # Initialize the argument parser
@@ -18,8 +19,12 @@
 parser_class, parser_name = get_parser(nn_archive)
 input_shape = get_input_shape(nn_archive)
 
-if parser_name == "XFeatParser":
-    raise NotImplementedError("XFeatParser is not supported in this script yet.")
+if parser_name == "XFeatMonoParser":
+    xfeat_mono(nn_archive, input_shape, fps_limit)
+    exit(0)
+elif parser_name == "XFeatStereoParser":
+    xfeat_stereo(nn_archive, input_shape, fps_limit)
+    exit(0)
 
 # Create the pipeline
 with dai.Pipeline() as pipeline:

diff --git a/examples/utils/parser.py b/examples/utils/parser.py
@@ -7,9 +7,10 @@
     LaneDetectionParser,
     MapOutputParser,
     MPPalmDetectionParser,
+    MultiClassificationParser,
+    PaddleOCRParser,
     SCRFDParser,
     SegmentationParser,
-    XFeatParser,
     YOLOExtendedParser,
 )
 
@@ -76,18 +77,6 @@ def setup_map_output_parser(parser: MapOutputParser, params: dict):
         )
 
 
-def setup_xfeat_parser(parser: XFeatParser, params: dict):
-    """Setup the XFeat parser with the required metadata."""
-    try:
-        input_size = params["input_size"]
-        parser.setInputSize(input_size)
-        parser.setOriginalSize(input_size)
-    except Exception:
-        print(
-            "This NN archive does not have required metadata for XFeatParser. Skipping setup..."
-        )
-
-
 def setup_yolo_extended_parser(parser: YOLOExtendedParser, params: dict):
     """Setup the YOLO parser with the required metadata."""
     try:
@@ -142,6 +131,30 @@ def setup_fastsam_parser(parser: FastSAMParser, params: dict):
         )
 
 
+def setup_paddleocr_parser(parser: PaddleOCRParser, params: dict):
+    """Setup the PaddleOCR parser with the required metadata."""
+    try:
+        classes = params["classes"]
+        parser.setClasses(classes)
+    except Exception:
+        print(
+            "This NN archive does not have required metadata for PaddleOCRParser. Skipping setup..."
+        )
+
+
+def setup_multi_classification_parser(parser: MultiClassificationParser, params: dict):
+    """Setup the Multi Classification parser with the required metadata."""
+    try:
+        classification_attributes = params["classification_attributes"]
+        classification_labels = params["classification_labels"]
+        parser.setClassificationAttributes(classification_attributes)
+        parser.setClassificationLabels(classification_labels)
+    except Exception:
+        print(
+            "This NN archive does not have required metadata for MultiClassificationParser. Skipping setup..."
+        )
+
+
 def setup_parser(parser: dai.ThreadedNode, nn_archive: dai.NNArchive, parser_name: str):
     """Setup the parser with the NN archive."""
 
@@ -159,8 +172,6 @@ def setup_parser(parser: dai.ThreadedNode, nn_archive: dai.NNArchive, parser_nam
         setup_classification_parser(parser, extraParams)
     elif parser_name == "MapOutputParser":
         setup_map_output_parser(parser, extraParams)
-    elif parser_name == "XFeatParser":
-        setup_xfeat_parser(parser, extraParams)
     elif parser_name == "YOLOExtendedParser":
         setup_yolo_extended_parser(parser, extraParams)
     elif parser_name == "MPPalmDetectionParser":
@@ -169,3 +180,7 @@ def setup_parser(parser: dai.ThreadedNode, nn_archive: dai.NNArchive, parser_nam
         setup_land_detection_parser(parser, extraParams)
     elif parser_name == "FastSAMParser":
         setup_fastsam_parser(parser, extraParams)
+    elif parser_name == "PaddleOCRParser":
+        setup_paddleocr_parser(parser, extraParams)
+    elif parser_name == "MultiClassificationParser":
+        setup_multi_classification_parser(parser, extraParams)
diff --git a/examples/utils/xfeat.py b/examples/utils/xfeat.py
@@ -0,0 +1,145 @@
+from typing import List
+
+import cv2
+import depthai as dai
+from visualization.visualizers import xfeat_visualizer
+
+from depthai_nodes.ml.parsers import XFeatMonoParser, XFeatStereoParser
+
+
+def xfeat_mono(nn_archive: dai.NNArchive, input_shape: List[int], fps_limit: int):
+    """Run the XFeatMonoParser on a single camera.
+
+    It lets you set the reference frame by pressing S-key.
+    """
+    previous_frame = None
+    with dai.Pipeline() as pipeline:
+        # Set up camera
+        cam = pipeline.create(dai.node.Camera).build()
+
+        # Set up the neural network
+        network = pipeline.create(dai.node.NeuralNetwork).build(
+            cam.requestOutput(
+                input_shape, type=dai.ImgFrame.Type.BGR888p, fps=fps_limit
+            ),
+            nn_archive,
+        )
+
+        # Set up parser
+        parser = XFeatMonoParser()
+        parser.setOriginalSize(input_shape)
+        parser.setInputSize(input_shape)
+        parser.setMaxKeypoints(2048)
+
+        # Linking
+        network.out.link(parser.input)
+
+        # Set up queue
+        camera_queue = network.passthrough.createOutputQueue()
+        parser_queue = parser.out.createOutputQueue()
+
+        pipeline.start()
+
+        while pipeline.isRunning():
+            frame: dai.ImgFrame = camera_queue.get().getCvFrame()
+            message: dai.TrackedFeatures = (
+                parser_queue.get()
+            )  # get message from the queue
+            features = message.trackedFeatures
+            if previous_frame is not None:
+                resulting_frame = xfeat_visualizer(previous_frame, frame, features)
+            else:
+                resulting_frame = frame
+            number_of_matches = len(features) // 2
+            cv2.putText(
+                resulting_frame,
+                f"Number of matches: {number_of_matches}",
+                (10, 30),
+                cv2.FONT_HERSHEY_SIMPLEX,
+                0.5,
+                (255, 255, 255),
+                1,
+            )
+            cv2.imshow("XFeat", resulting_frame)
+
+            key_pressed = cv2.waitKey(1)
+            if key_pressed == ord("s"):
+                parser.setTrigger()  # trigger to set the reference frame
+                previous_frame = frame
+            if key_pressed == ord("q"):
+                cv2.destroyAllWindows()
+                pipeline.stop()
+                break
+
+
+def xfeat_stereo(nn_archive: dai.NNArchive, input_shape: List[int], fps_limit: int):
+    """Run the XFeatStereoParser on stereo cameras - left and right - and match the features."""
+    with dai.Pipeline() as pipeline:
+        device: dai.Device = pipeline.getDefaultDevice()
+        available_cameras = [
+            camera.name for camera in device.getConnectedCameraFeatures()
+        ]
+
+        if "left" not in available_cameras or "right" not in available_cameras:
+            raise RuntimeError(
+                f"Stereo cameras are not available! Available cameras: {available_cameras}"
+            )
+
+        left_cam = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B)
+        right_cam = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C)
+
+        left_network = pipeline.create(dai.node.NeuralNetwork).build(
+            left_cam.requestOutput(
+                input_shape, type=dai.ImgFrame.Type.RGB888p, fps=fps_limit
+            ),
+            nn_archive,
+        )
+        left_network.setNumInferenceThreads(2)
+
+        right_network = pipeline.create(dai.node.NeuralNetwork).build(
+            right_cam.requestOutput(
+                input_shape, type=dai.ImgFrame.Type.RGB888p, fps=fps_limit
+            ),
+            nn_archive,
+        )
+        right_network.setNumInferenceThreads(2)
+
+        parser = pipeline.create(XFeatStereoParser)
+        parser.setOriginalSize(input_shape)
+        parser.setInputSize(input_shape)
+        parser.setMaxKeypoints(512)
+
+        left_network.out.link(parser.reference_input)
+        right_network.out.link(parser.target_input)
+
+        left_cam_queue = left_network.passthrough.createOutputQueue()
+        right_cam_queue = right_network.passthrough.createOutputQueue()
+        parser_queue = parser.out.createOutputQueue()
+
+        pipeline.start()
+
+        while pipeline.isRunning():
+            left_frame: dai.ImgFrame = left_cam_queue.get().getCvFrame()
+            right_frame: dai.ImgFrame = right_cam_queue.get().getCvFrame()
+            features: dai.TrackedFeatures = parser_queue.get()
+            features = features.trackedFeatures
+
+            resulting_frame = xfeat_visualizer(
+                left_frame, right_frame, features, draw_warp_corners=False
+            )
+            number_of_matches = len(features) // 2
+            cv2.putText(
+                resulting_frame,
+                f"Number of matches: {number_of_matches}",
+                (10, 30),
+                cv2.FONT_HERSHEY_SIMPLEX,
+                0.5,
+                (255, 255, 255),
+                1,
+            )
+            cv2.imshow("XFeat Stereo", resulting_frame)
+
+            if cv2.waitKey(1) == ord("q"):
+                cv2.destroyAllWindows()
+                pipeline.stop()
+                break
diff --git a/examples/visualization/visualize.py b/examples/visualization/visualize.py
@@ -10,7 +10,10 @@
     visualize_lane_detections,
     visualize_line_detections,
     visualize_map,
+    visualize_multi_classification,
     visualize_segmentation,
+    visualize_text_detection,
+    visualize_text_recognition,
     visualize_yolo_extended,
 )
 
@@ -33,6 +36,9 @@
     "YOLOExtendedParser": visualize_yolo_extended,
     "LaneDetectionParser": visualize_lane_detections,
     "FastSAMParser": visualize_fastsam,
+    "PPTextDetectionParser": visualize_text_detection,
+    "PaddleOCRParser": visualize_text_recognition,
+    "MultiClassificationParser": visualize_multi_classification,
 }