Merge pull request #1739 from Fleyderer/fix-yolox

mikel-brostrom · web-flow · commit 7ebb7a325d1a · 2024-11-15T12:37:47.000+01:00
Fix YOLOX
diff --git a/boxmot/utils/ops.py b/boxmot/utils/ops.py
@@ -6,7 +6,6 @@
 from typing import Tuple, Union
 
 
-
 def xyxy2xywh(x):
     """
     Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height) format.
@@ -186,4 +185,34 @@ def letterbox(
     left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
     img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
 
-    return img, ratio, (dw, dh)
+    return img, ratio, (dw, dh)
+
+
+# ByteTrack's preprocessing; it differs from the current upstream YOLOX preprocessing.
+# https://github.com/ifzhang/ByteTrack/blob/d1bf0191adff59bc8fcfeaa0b33d3d1642552a99/yolox/data/data_augment.py#L189
+def bytetrack_preprocess(image, input_size, 
+                         mean=(0.485, 0.456, 0.406), 
+                         std=(0.229, 0.224, 0.225), 
+                         swap=(2, 0, 1)):
+    if len(image.shape) == 3:
+        padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0  # canvas pre-filled with 114
+    else:
+        padded_img = np.ones(input_size) * 114.0  # NOTE(review): a 2-D canvas cannot take the 3-index slice below - confirm 2-D input is unsupported
+    img = np.array(image)
+    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])  # one scale for both axes, so the aspect ratio is kept
+    resized_img = cv2.resize(
+        img,
+        (int(img.shape[1] * r), int(img.shape[0] * r)),
+        interpolation=cv2.INTER_LINEAR,
+    ).astype(np.float32)
+    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img  # paste at the top-left; the rest stays padding
+    # Reverse the last (channel) axis, divide by 255, then apply the optional mean/std normalization.
+    padded_img = padded_img[:, :, ::-1]
+    padded_img /= 255.0
+    if mean is not None:
+        padded_img -= mean
+    if std is not None:
+        padded_img /= std
+    padded_img = padded_img.transpose(swap)  # the default (2, 0, 1) moves the last axis to the front
+    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+    return padded_img, r  # r is returned so callers can divide box coordinates by it
diff --git a/examples/det/yolox_boxmot.ipynb b/examples/det/yolox_boxmot.ipynb
@@ -27,7 +27,7 @@
     "from yolox.utils import postprocess\n",
     "from yolox.utils.model_utils import fuse_model\n",
     "from boxmot import BotSort\n",
-    "from boxmot.utils.ops import letterbox\n",
+    "from boxmot.utils.ops import bytetrack_preprocess\n",
     "\n",
     "\n",
     "# Dictionary for YOLOX model weights URLs\n",
@@ -40,7 +40,7 @@
     "}\n",
     "\n",
     "# Preprocessing pipeline\n",
-    "preprocess = transforms.Compose([transforms.ToTensor()])\n",
+    "input_size = [800, 1440]\n",
     "device = torch.device('cpu')\n",
     "yolox_model = 'yolox_s.pt'\n",
     "yolox_model_path = Path(yolox_model)\n",
@@ -59,8 +59,15 @@
     "model = fuse_model(model).to(device).eval()\n",
     "\n",
     "# Initialize tracker\n",
-    "tracker = BotSort(reid_weights=Path('osnet_x0_25_msmt17.pt'), device=device, half=False)\n",
-    "\n",
+    "tracker = BotSort(reid_weights=Path('osnet_x0_25_msmt17.pt'), device=device, half=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# Video capture setup\n",
     "vid = cv2.VideoCapture(0)\n",
     "\n",
@@ -70,20 +77,20 @@
     "        break\n",
     "\n",
     "    # Preprocess frame\n",
-    "    frame_letterbox, ratio, (dw, dh) = letterbox(frame, new_shape=[640, 640], auto=False, scaleFill=True)\n",
-    "    frame_tensor = preprocess(frame_letterbox).unsqueeze(0).to(device)\n",
+    "    frame_img, ratio = bytetrack_preprocess(frame, input_size=input_size)\n",
+    "    frame_tensor = torch.Tensor(frame_img).unsqueeze(0).to(device)\n",
     "\n",
     "    # Detection with YOLOX\n",
     "    with torch.no_grad():\n",
     "        dets = model(frame_tensor)\n",
-    "    dets = postprocess(dets, 1, 0.5, 0.2, class_agnostic=True)[0]\n",
+    "    dets = postprocess(dets, 1, 0.5, 0.7, class_agnostic=True)[0]\n",
     "\n",
     "    if dets is not None:\n",
     "        # Rescale coordinates from letterbox back to the original frame size\n",
-    "        dets[:, 0] = (dets[:, 0] - dw) / ratio[0]\n",
-    "        dets[:, 1] = (dets[:, 1] - dh) / ratio[1]\n",
-    "        dets[:, 2] = (dets[:, 2] - dw) / ratio[0]\n",
-    "        dets[:, 3] = (dets[:, 3] - dh) / ratio[1]\n",
+    "        dets[:, 0] = (dets[:, 0]) / ratio\n",
+    "        dets[:, 1] = (dets[:, 1]) / ratio\n",
+    "        dets[:, 2] = (dets[:, 2]) / ratio\n",
+    "        dets[:, 3] = (dets[:, 3]) / ratio\n",
     "        dets[:, 4] *= dets[:, 5]\n",
     "        dets = dets[:, [0, 1, 2, 3, 4, 6]].cpu().numpy()\n",
     "    else:\n",
@@ -121,7 +128,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.12.4"
   }
  },
  "nbformat": 4,
diff --git a/tracking/detectors/__init__.py b/tracking/detectors/__init__.py
@@ -5,10 +5,29 @@
 
 checker = RequirementsChecker()
 
+UL_MODELS = ['yolov8', 'yolov9', 'yolov10', 'yolo11', 'rtdetr', 'sam']
+
+
+def is_ultralytics_model(yolo_name):  # True when any UL_MODELS entry occurs in str(yolo_name)
+    return any(yolo in str(yolo_name) for yolo in UL_MODELS)  # NOTE(review): substring test over the whole string, directories included
+
+
+def is_yolox_model(yolo_name):  # True when 'yolox' occurs anywhere in str(yolo_name)
+    return 'yolox' in str(yolo_name)
+
+
+def default_imgsz(yolo_name):  # default inference size, chosen from the model name
+    if is_ultralytics_model(yolo_name):
+        return [640, 640]
+    elif is_yolox_model(yolo_name):
+        return [800, 1440]  # NOTE(review): presumably [h, w], matching the --imgsz help text - confirm
+    else:
+        return [640, 640]  # names matching neither family get the same size as the first branch
+
 
 def get_yolo_inferer(yolo_model):
 
-    if 'yolox' in str(yolo_model):
+    if is_yolox_model(yolo_model):
         try:
             import yolox  # for linear_assignment
             assert yolox.__version__
diff --git a/tracking/detectors/yolo_interface.py b/tracking/detectors/yolo_interface.py
@@ -14,6 +14,10 @@ class YoloInterface(ABC):
     def __call__(self, im):
         pass
 
+    @abstractmethod
+    def preprocess(self, ims):  # NOTE(review): abstract, so every concrete detector must now implement it - check other subclasses
+        pass
+
     @abstractmethod
     def postprocess(self, preds):
         pass
diff --git a/tracking/detectors/yolox.py b/tracking/detectors/yolox.py
@@ -4,11 +4,13 @@
 import torch
 from ultralytics.engine.results import Results
 from ultralytics.utils import ops
+from ultralytics.models.yolo.detect import DetectionPredictor
 from yolox.exp import get_exp
 from yolox.utils import postprocess
 from yolox.utils.model_utils import fuse_model
 
 from boxmot.utils import logger as LOGGER
+from boxmot.utils.ops import bytetrack_preprocess
 from tracking.detectors.yolo_interface import YoloInterface
 
 # default model weigths for these model names
@@ -48,6 +50,7 @@ class YoloXStrategy(YoloInterface):
     def __init__(self, model, device, args):
 
         self.args = args
+        self.imgsz = args.imgsz
         self.pt = False
         self.stride = 32  # max stride in YOLOX
 
@@ -80,25 +83,64 @@ def __init__(self, model, device, args):
             map_location=torch.device('cpu')
         )
 
+        self.device = device
         self.model = exp.get_model()
         self.model.eval()
         self.model.load_state_dict(ckpt["model"])
         self.model = fuse_model(self.model)
-        self.model.to(device)
+        self.model.to(self.device)
         self.model.eval()
+        self.im_paths = []
+        self._preproc_data = []
 
     @torch.no_grad()
     def __call__(self, im, augment, visualize, embed):
+        if isinstance(im, list):  # accept a list of tensors as well as a single tensor
+            if len(im[0].shape) == 3:
+                im = torch.stack(im)  # 3-D items: stack along a new leading axis
+            else:
+                im = torch.vstack(im)  # otherwise concatenate along the existing first axis
+
+        if len(im.shape) == 3:
+            im = im.unsqueeze(0)  # single 3-D input: add the leading axis
+
+        assert len(im.shape) == 4, f"Expected 4D tensor as input, got {im.shape}"
+        # augment, visualize and embed are accepted but not used below
         preds = self.model(im)
         return preds
 
     def warmup(self, imgsz):
         pass
 
-    def postprocess(self, path, preds, im, im0s):
+    def update_im_paths(self, predictor: DetectionPredictor):
+        """Save the image paths of the current batch (`predictor.batch[0]`).
+
+        Meant to be registered as the `on_predict_batch_start` callback.
+        """
+        assert isinstance(predictor, DetectionPredictor), (
+            "Only ultralytics predictors are supported")
+        self.im_paths = predictor.batch[0]
+
+    def preprocess(self, im) -> torch.Tensor:
+        """Batch-preprocess a list of frames and store each frame's resize ratio."""
+        assert isinstance(im, list)
+        im_preprocessed = []
+        self._preproc_data = []
+        for img in im:
+            img_pre, ratio = bytetrack_preprocess(img, input_size=self.imgsz)
+            img_pre = torch.Tensor(img_pre).unsqueeze(0).to(self.device)
+            im_preprocessed.append(img_pre)
+            # kept per frame: postprocess divides box coordinates by this ratio
+            self._preproc_data.append(ratio)
+
+        im_preprocessed = torch.vstack(im_preprocessed)
+        return im_preprocessed
+
+    def postprocess(self, preds, im, im0s):
 
         results = []
         for i, pred in enumerate(preds):
+            im_path = self.im_paths[i] if len(self.im_paths) else ""
 
             pred = postprocess(
                 pred.unsqueeze(0),  # YOLOX postprocessor expects 3D arary
@@ -111,25 +153,27 @@ def postprocess(self, path, preds, im, im0s):
             if pred is None:
                 pred = torch.empty((0, 6))
                 r = Results(
-                    path=path,
+                    path=im_path,
                     boxes=pred,
                     orig_img=im0s[i],
                     names=self.names
                 )
                 results.append(r)
             else:
-                # (x, y, x, y, conf, obj, cls) --> (x, y, x, y, conf, cls)
-                pred[:, 4] = pred[:, 4] * pred[:, 5]
+                ratio = self._preproc_data[i]
+                pred[:, 0] = pred[:, 0] / ratio
+                pred[:, 1] = pred[:, 1] / ratio
+                pred[:, 2] = pred[:, 2] / ratio
+                pred[:, 3] = pred[:, 3] / ratio
+                pred[:, 4] *= pred[:, 5]
                 pred = pred[:, [0, 1, 2, 3, 4, 6]]
 
-                pred[:, :4] = ops.scale_boxes(im.shape[2:], pred[:, :4], im0s[i].shape)
-
                 # filter boxes by classes
                 if self.args.classes:
                     pred = pred[torch.isin(pred[:, 5].cpu(), torch.as_tensor(self.args.classes))]
 
                 r = Results(
-                    path=path,
+                    path=im_path,
                     boxes=pred,
                     orig_img=im0s[i],
                     names=self.names
diff --git a/tracking/track.py b/tracking/track.py
@@ -12,7 +12,8 @@
 from boxmot.tracker_zoo import create_tracker
 from boxmot.utils import ROOT, WEIGHTS, TRACKER_CONFIGS
 from boxmot.utils.checks import RequirementsChecker
-from tracking.detectors import get_yolo_inferer
+from tracking.detectors import (get_yolo_inferer, default_imgsz,
+                                is_ultralytics_model, is_yolox_model)
 
 checker = RequirementsChecker()
 checker.check_packages(('ultralytics @ git+https://github.com/mikel-brostrom/ultralytics.git', ))  # install
@@ -56,11 +57,13 @@ def on_predict_start(predictor, persist=False):
 
 @torch.no_grad()
 def run(args):
-    
-    ul_models = ['yolov8', 'yolov9', 'yolov10', 'yolo11', 'rtdetr', 'sam']
+
+    if args.imgsz is None:
+        args.imgsz = default_imgsz(args.yolo_model)
 
     yolo = YOLO(
-        args.yolo_model if any(yolo in str(args.yolo_model) for yolo in ul_models) else 'yolov8n.pt',
+        args.yolo_model if is_ultralytics_model(args.yolo_model)
+        else 'yolov8n.pt',
     )
 
     results = yolo.track(
@@ -87,15 +90,23 @@ def run(args):
 
     yolo.add_callback('on_predict_start', partial(on_predict_start, persist=True))
 
-    if not any(yolo in str(args.yolo_model) for yolo in ul_models):
+    if not is_ultralytics_model(args.yolo_model):
         # replace yolov8 model
         m = get_yolo_inferer(args.yolo_model)
-        model = m(
-            model=args.yolo_model,
-            device=yolo.predictor.device,
-            args=yolo.predictor.args
-        )
-        yolo.predictor.model = model
+        yolo_model = m(model=args.yolo_model, device=yolo.predictor.device,
+                       args=yolo.predictor.args)
+        yolo.predictor.model = yolo_model
+
+        # If current model is YOLOX, change the preprocess and postprocess
+        if is_yolox_model(args.yolo_model):
+            # add callback to save image paths for further processing
+            yolo.add_callback("on_predict_batch_start",
+                              lambda p: yolo_model.update_im_paths(p))
+            yolo.predictor.preprocess = (
+                lambda imgs: yolo_model.preprocess(im=imgs))
+            yolo.predictor.postprocess = (
+                lambda preds, im, im0s:
+                yolo_model.postprocess(preds=preds, im=im, im0s=im0s))
 
     # store custom args in predictor
     yolo.predictor.custom_args = args
@@ -112,6 +123,7 @@ def run(args):
 
 
 def parse_opt():
+    
     parser = argparse.ArgumentParser()
     parser.add_argument('--yolo-model', type=Path, default=WEIGHTS / 'yolov8n',
                         help='yolo model path')
@@ -121,7 +133,7 @@ def parse_opt():
                         help='deepocsort, botsort, strongsort, ocsort, bytetrack, imprassoc')
     parser.add_argument('--source', type=str, default='0',
                         help='file/dir/URL/glob, 0 for webcam')
-    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640],
+    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=None,
                         help='inference size h,w')
     parser.add_argument('--conf', type=float, default=0.5,
                         help='confidence threshold')
diff --git a/tracking/utils.py b/tracking/utils.py
@@ -371,13 +371,13 @@ def write_mot_results(txt_path: Path, mot_results: np.ndarray) -> None:
     path to the file will be created as well if necessary.
     """
     if mot_results is not None:
-        if mot_results.size != 0:
-            # Ensure the parent directory of the txt_path exists
-            txt_path.parent.mkdir(parents=True, exist_ok=True)
+        # Ensure the parent directory of the txt_path exists
+        txt_path.parent.mkdir(parents=True, exist_ok=True)
 
-            # Ensure the file exists before opening
-            txt_path.touch(exist_ok=True)
+        # Ensure the file exists before opening
+        txt_path.touch(exist_ok=True)
 
+        if mot_results.size != 0:
             # Open the file in append mode and save the MOT results
             with open(str(txt_path), 'a') as file:
                 np.savetxt(file, mot_results, fmt='%d,%d,%d,%d,%d,%d,%d,%d,%.6f')
diff --git a/tracking/val.py b/tracking/val.py