forked from zju3dv/AutoRecon
Commit
Merge pull request zju3dv#4 from pablovela5620/master
processing script from nerfstudio -> sdfstudio
Showing 1 changed file with 198 additions and 0 deletions.
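The added script converts a capture in the nerfstudio format (e.g. COLMAP or Polycam output with a transforms.json) into the layout that sdfstudio ingests. As a rough sketch of an invocation, assuming the file were saved as process_nerfstudio_to_sdfstudio.py (the actual file name is not visible in this view) and with placeholder paths:

python process_nerfstudio_to_sdfstudio.py --data /path/to/nerfstudio/scene --output-dir /path/to/sdfstudio/scene --type polycam --indoor

The --type flag selects between colmap and polycam inputs, and --indoor enables normalization of the camera poses into a unit cube.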
@@ -0,0 +1,198 @@
import argparse
import json
from pathlib import Path

import cv2
import numpy as np
import PIL
from PIL import Image
from torchvision import transforms


def main():
    """
    Given data that follows the nerfstudio format, such as the output from COLMAP or Polycam,
    convert it to a format that sdfstudio will ingest.
    """

    parser = argparse.ArgumentParser(description="preprocess nerfstudio dataset to sdfstudio dataset")

    parser.add_argument("--data", dest="input_path", help="path to nerfstudio scene")
    parser.set_defaults(input_path="NONE")

    parser.add_argument("--output-dir", dest="output_path", help="path to output")
    parser.set_defaults(output_path="NONE")

    parser.add_argument("--type", dest="type", default="colmap", choices=["colmap", "polycam"])
    parser.add_argument("--indoor", action="store_true")

    args = parser.parse_args()

    output_path = Path(args.output_path)
    input_path = Path(args.input_path)

    POLYCAM = args.type == "polycam"

    output_path.mkdir(parents=True, exist_ok=True)

    # load transformation json with images/intrinsics/extrinsics
    camera_parameters_path = input_path / "transforms.json"
    camera_parameters = json.load(open(camera_parameters_path))

    # extract intrinsic parameters
    if not POLYCAM:
        cx = camera_parameters["cx"]
        cy = camera_parameters["cy"]
        fl_x = camera_parameters["fl_x"]
        fl_y = camera_parameters["fl_y"]
    else:
        cx = []
        cy = []
        fl_x = []
        fl_y = []

    camera_parameters = camera_parameters["frames"]
    num_frames = len(camera_parameters)

    # load poses
    poses = []
    image_paths = []
    # only load images with corresponding pose info
    # NOTE: frames may come in arbitrary order; sorting may be needed
    for camera in camera_parameters:
        if POLYCAM:
            # Polycam stores per-frame intrinsics; collect one set per frame
            cx.append(camera["cx"])
            cy.append(camera["cy"])
            fl_x.append(camera["fl_x"])
            fl_y.append(camera["fl_y"])

        # OpenGL/Blender convention; convert to COLMAP/OpenCV convention
        # by flipping the y and z camera axes
        # https://docs.nerf.studio/en/latest/quickstart/data_conventions.html
        c2w = np.array(camera["transform_matrix"]).reshape(4, 4)
        c2w[0:3, 1:3] *= -1

        img_path = input_path / camera["file_path"]
        assert img_path.exists()
        image_paths.append(img_path)
        poses.append(c2w)

    poses = np.array(poses)

    if POLYCAM:
        # per-frame intrinsics
        camera_intrinsics = []
        for idx in range(len(cx)):
            camera_intrinsics.append(np.array([[fl_x[idx], 0, cx[idx]], [0, fl_y[idx], cy[idx]], [0, 0, 1]]))
    else:
        camera_intrinsics = np.array([[fl_x, 0, cx], [0, fl_y, cy], [0, 0, 1]])

    # deal with invalid poses
    valid_poses = np.isfinite(poses).all(axis=2).all(axis=1)
    min_vertices = poses[:, :3, 3][valid_poses].min(axis=0)
    max_vertices = poses[:, :3, 3][valid_poses].max(axis=0)

    # camera pose normalization is only used for indoor scenes
    if args.indoor:
        center = (min_vertices + max_vertices) / 2.0
        scale = 2.0 / (np.max(max_vertices - min_vertices) + 3.0)

        # normalize the poses into a unit cube
        poses[:, :3, 3] -= center
        poses[:, :3, 3] *= scale

        # inverse normalization (maps the normalized scene back to the original coordinates)
        scale_mat = np.eye(4).astype(np.float32)
        scale_mat[:3, 3] -= center
        scale_mat[:3] *= scale
        scale_mat = np.linalg.inv(scale_mat)
    else:
        scale_mat = np.eye(4).astype(np.float32)

    # read a sample image to get the original resolution
    sample_img = cv2.imread(str(image_paths[0]))
    H, W, _ = sample_img.shape  # e.g. 1080 x 1920

    # get smallest side to generate square crop
    target_crop = min(H, W)

    target_size = 384
    trans_totensor = transforms.Compose(
        [
            transforms.CenterCrop(target_crop),
            transforms.Resize(target_size, interpolation=PIL.Image.BILINEAR),
        ]
    )

    # center crop by min_dim
    offset_x = (W - target_crop) * 0.5
    offset_y = (H - target_crop) * 0.5

    if POLYCAM:
        for idx in range(len(camera_intrinsics)):
            camera_intrinsics[idx][0, 2] -= offset_x
            camera_intrinsics[idx][1, 2] -= offset_y
            resize_factor = target_size / target_crop
            camera_intrinsics[idx][:2, :] *= resize_factor
    else:
        camera_intrinsics[0, 2] -= offset_x
        camera_intrinsics[1, 2] -= offset_y
        # resize from min_dim x min_dim -> to 384 x 384
        resize_factor = target_size / target_crop
        camera_intrinsics[:2, :] *= resize_factor

    K = camera_intrinsics

    frames = []
    out_index = 0
    for idx, (valid, pose, image_path) in enumerate(zip(valid_poses, poses, image_paths)):
        if not valid:
            continue

        target_image = output_path / f"{out_index:06d}_rgb.png"
        img = Image.open(image_path)
        img_tensor = trans_totensor(img)
        img_tensor.save(target_image)

        rgb_path = str(target_image.relative_to(output_path))
        frame = {
            "rgb_path": rgb_path,
            "camtoworld": pose.tolist(),
            "intrinsics": K[idx].tolist() if isinstance(K, list) else K.tolist(),
            "mono_depth_path": rgb_path.replace("_rgb.png", "_depth.npy"),
            "mono_normal_path": rgb_path.replace("_rgb.png", "_normal.npy"),
        }

        frames.append(frame)
        out_index += 1

    # scene bbox for the normalized scene
    scene_box = {
        "aabb": [[-1, -1, -1], [1, 1, 1]],
        "near": 0.05,
        "far": 2.5,
        "radius": 1.0,
        "collider_type": "box",
    }

    # meta data
    output_data = {
        "camera_model": "OPENCV",
        "height": target_size,
        "width": target_size,
        "has_mono_prior": True,
        "pairs": None,
        "worldtogt": scale_mat.tolist(),
        "scene_box": scene_box,
    }

    output_data["frames"] = frames

    # save as json
    with open(output_path / "meta_data.json", "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=4)


if __name__ == "__main__":
    main()
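
For reference, a minimal sketch of how the generated meta_data.json could be read back. The output directory below is a placeholder, and the monocular depth/normal files referenced by each frame are assumed to be produced by a separate preprocessing step (this script only writes their paths and sets has_mono_prior).

# Sketch: load the meta_data.json produced above.
# "output_dir" is a placeholder for the --output-dir used during conversion.
import json
from pathlib import Path

import numpy as np

output_dir = Path("/path/to/sdfstudio/scene")
with open(output_dir / "meta_data.json", "r", encoding="utf-8") as f:
    meta = json.load(f)

worldtogt = np.array(meta["worldtogt"])  # 4x4, maps the normalized scene back to the input coordinates
for frame in meta["frames"]:
    K = np.array(frame["intrinsics"])    # 3x3 intrinsics for the 384x384 crop
    c2w = np.array(frame["camtoworld"])  # 4x4 camera-to-world pose (OpenCV convention)
    rgb_file = output_dir / frame["rgb_path"]  # e.g. 000000_rgb.png
    # frame["mono_depth_path"] and frame["mono_normal_path"] point at .npy priors
    # that are expected to be generated separately (e.g. by a monocular depth/normal model).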