Update code for sparse view NVS
Drexubery committed Oct 15, 2024
1 parent 01672dc commit 6fbc6c5
Showing 21 changed files with 223 additions and 41 deletions.
36 changes: 20 additions & 16 deletions README.md
@@ -103,24 +103,22 @@ ViewCrafter can generate high-fidelity novel views from <strong>a single or spar
</tr>
</table>

## 🗓️ TODO
- [x] [2024-09-01] Launch the project page and update the arXiv preprint.
- [x] [2024-09-01] Release pretrained models and the code for single-view novel view synthesis.
- [ ] Release the code for sparse-view novel view synthesis.
- [ ] Release the code for iterative novel view synthesis.
- [ ] Release the code for 3D-GS reconstruction.
## 📝 Changelog
- __[2024-10-15]__: 🔥🔥 Release the code for sparse-view novel view synthesis.
- __[2024-09-01]__: Launch the project page and update the arXiv preprint.
- __[2024-09-01]__: Release pretrained models and the code for single-view novel view synthesis.
<br>

## 🧰 Models

|Model|Resolution|Frames|GPU Mem. & Inference Time (A100, ddim 50steps)|Checkpoint|
|:---------|:---------|:--------|:--------|:--------|
|ViewCrafter_25|576x1024|25| 23.5GB & 120s (`perframe_ae=True`)|[Hugging Face](https://huggingface.co/Drexubery/ViewCrafter_25/blob/main/model.ckpt)|
|ViewCrafter_16|576x1024|16| 18.3GB & 75s (`perframe_ae=True`)|[Hugging Face](https://huggingface.co/Drexubery/ViewCrafter_16/blob/main/model.ckpt)|
|ViewCrafter_25_512|320x512|25| 13.8GB & 50s (`perframe_ae=True`)|[Hugging Face](https://huggingface.co/Drexubery/ViewCrafter_25_512/blob/main/model.ckpt)|
|Model|Resolution|Frames|GPU Mem. & Inference Time (A100, 50 DDIM steps)|Checkpoint|Description|
|:---------|:---------|:--------|:--------|:--------|:--------|
|ViewCrafter_25|576x1024|25|23.5GB & 120s (`perframe_ae=True`)|[Hugging Face](https://huggingface.co/Drexubery/ViewCrafter_25/blob/main/model.ckpt)|Used for single-view NVS; can also be adapted to sparse-view NVS|
|ViewCrafter_25_sparse|576x1024|25|23.5GB & 120s (`perframe_ae=True`)|[Hugging Face](https://huggingface.co/Drexubery/ViewCrafter_25_sparse/blob/main/model_sparse.ckpt)|Used for sparse-view NVS|
|ViewCrafter_16|576x1024|16|18.3GB & 75s (`perframe_ae=True`)|[Hugging Face](https://huggingface.co/Drexubery/ViewCrafter_16/blob/main/model.ckpt)|16-frame model|
|ViewCrafter_25_512|320x512|25|13.8GB & 50s (`perframe_ae=True`)|[Hugging Face](https://huggingface.co/Drexubery/ViewCrafter_25_512/blob/main/model.ckpt)|Low-resolution (320x512) model|


Currently, we provide three versions of the model: a base model that generates 16 frames at a time, an enhanced model that generates 25 frames at a time (used by default), and a low-resolution model that produces 25 frames of 320x512 video. The inference time can be reduced by using fewer DDIM steps.
<!-- Currently, we provide four versions of the model: a base model that generates 16 frames at a time, an enhanced model that generates 25 frames at a time (used by default), a sparse-view model fine-tuned for sparse-view NVS, and a low-resolution model that produces 25 frames of 320x512 video. The inference time can be reduced by using fewer DDIM steps. -->
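
For convenience, a checkpoint can also be fetched directly with `wget`; a minimal sketch, assuming the standard Hugging Face `/resolve/main/` download form of the checkpoint links above:

```bash
mkdir -p checkpoints
# Fetch the default 25-frame model into checkpoints/model.ckpt
wget https://huggingface.co/Drexubery/ViewCrafter_25/resolve/main/model.ckpt -O checkpoints/model.ckpt
```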

## ⚙️ Setup

@@ -146,14 +146,20 @@ wget https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge

```

## 💫 Inference
### 1. Command line

### Single view novel view synthesis
(1) Download the pretrained [ViewCrafter_25](https://huggingface.co/Drexubery/ViewCrafter_25/blob/main/model.ckpt) model and put `model.ckpt` in `checkpoints/model.ckpt`. \
(2) Run [inference.py](./inference.py) using the following script; a hedged example of a full command is sketched after it. Please refer to the [configuration document](docs/config_help.md) and [render document](docs/render_help.md) to set up inference parameters and the camera trajectory.
```bash
sh run.sh
```
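
For reference, a hedged sketch of what a full single-view command could look like, assembled only from the flags documented in [config_help.md](docs/config_help.md) and mirroring `run_sparse.sh` below; the actual `run.sh` may differ, and the image path is a placeholder:

```bash
python inference.py \
--image_dir test/images/your_image.png \
--out_dir ./output \
--mode 'single_view_target' \
--d_theta 10. \
--d_phi 30. \
--d_r -.2 \
--seed 123 \
--ckpt_path ./checkpoints/model.ckpt \
--config configs/inference_pvd_1024.yaml \
--ddim_steps 50 \
--video_length 25 \
--device 'cuda:0' \
--height 576 --width 1024 \
--model_path ./checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
```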
### Sparse view novel view synthesis
(1) Download the pretrained [ViewCrafter_25_sparse](https://huggingface.co/Drexubery/ViewCrafter_25_sparse/blob/main/model_sparse.ckpt) model and put `model_sparse.ckpt` in `checkpoints/model_sparse.ckpt`. (ViewCrafter_25_sparse is trained specifically for the sparse-view NVS task and outperforms [ViewCrafter_25](https://huggingface.co/Drexubery/ViewCrafter_25/blob/main/model.ckpt) on it.) \
(2) Run [inference.py](./inference.py) using the following script. Adjust the `--bg_trd` parameter to clean the point cloud: higher values produce a cleaner point cloud but may create holes in the background (see the illustrative sketch after the script).
```bash
sh run_sparse.sh
```
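
To build intuition for `--bg_trd`, here is a toy illustration of confidence-based point filtering with dummy data; a sketch of the idea only, not the repository's actual implementation:

```python
import numpy as np

# Dummy point cloud: xyz coordinates plus a per-point confidence in [0, 1).
points = np.random.rand(1000, 3)
conf = np.random.rand(1000)

bg_trd = 0.2                 # plays the role of --bg_trd
mask = conf > bg_trd         # higher threshold -> cleaner cloud, more holes
clean_points = points[mask]  # points that survive the cut
print(f"kept {mask.sum()} of {len(points)} points")
```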

### 2. Local Gradio demo

1 change: 1 addition & 0 deletions docs/config_help.md
@@ -18,6 +18,7 @@
| `--d_theta` | 10. | Required for 'single_view_target' mode; the target theta angle is (theta + d_theta) |
| `--d_phi` | 30. | Required for 'single_view_target' mode; the target phi angle is (phi + d_phi) |
| `--d_r` | -.2 | Required for 'single_view_target' mode; the target radius is (r + r*d_r) |
| `--bg_trd` | 0.2 | Range: [0, 1). Required for 'sparse_view_interp' mode; higher values produce a cleaner point cloud but may create holes in the background |
### 3. Diffusion configs
| Configuration | default | Explanation |
|:------------- |:----- | :------------- |
3 changes: 3 additions & 0 deletions inference.py
@@ -21,5 +21,8 @@
elif opts.mode == 'single_view_txt':
    pvd.nvs_single_view()

elif opts.mode == 'sparse_view_interp':
    # New in this commit: dispatch to sparse-view interpolation.
    pvd.nvs_sparse_view_interp()

else:
    raise KeyError(f"Invalid Mode: {opts.mode}")
13 changes: 13 additions & 0 deletions run_sparse.sh
@@ -0,0 +1,13 @@
python inference.py \
--image_dir test/images_sparse/family \
--out_dir ./output \
--mode 'sparse_view_interp' \
--bg_trd 0.2 \
--seed 123 \
--ckpt_path ./checkpoints/model_sparse.ckpt \
--config configs/inference_pvd_1024.yaml \
--ddim_steps 50 \
--video_length 25 \
--device 'cuda:0' \
--height 576 --width 1024 \
--model_path ./checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
Binary file added test/images_sparse/bench/016.jpg
Binary file added test/images_sparse/bench/029.jpg
Binary file added test/images_sparse/bicycle/0.JPG
Binary file added test/images_sparse/bicycle/1.JPG
Binary file added test/images_sparse/bicycle/2.JPG
Binary file added test/images_sparse/family/000301.jpg
Binary file added test/images_sparse/family/000500.jpg
Binary file added test/images_sparse/francis/000401.jpg
Binary file added test/images_sparse/francis/000639.jpg
Binary file added test/images_sparse/ig/000746.jpg
Binary file added test/images_sparse/ig/000850.jpg
Binary file added test/images_sparse/real1/31698000.png
Binary file added test/images_sparse/real1/34301000.png
Binary file added test/images_sparse/real2/44600000.png
Binary file added test/images_sparse/real2/50100000.png
104 changes: 101 additions & 3 deletions utils/pvd_utils.py
@@ -4,6 +4,7 @@
import os
import math
import torchvision
import scipy
from tqdm import tqdm
import cv2 # Assuming OpenCV is used for image saving
from PIL import Image
@@ -150,6 +151,103 @@ def generate_candidate_poses(c2ws_anchor,H,W,fs,c,theta, phi,num_candidates,devi
    cameras = PerspectiveCameras(focal_length=fs, principal_point=c, in_ndc=False, image_size=image_size, R=R_new, T=T_new, device=device)
    return cameras,thetas,phis

def interpolate_poses_spline(poses, n_interp, spline_degree=5,
                             smoothness=.03, rot_weight=.1):
    """Creates a smooth spline path between input keyframe camera poses.

    Spline is calculated with poses in format (position, lookat-point, up-point).

    Args:
        poses: (n, 3, 4) array of input pose keyframes.
        n_interp: returned path will have n_interp * (n - 1) total poses.
        spline_degree: polynomial degree of B-spline.
        smoothness: parameter for spline smoothing, 0 forces exact interpolation.
        rot_weight: relative weighting of rotation/translation in spline solve.

    Returns:
        Tensor of new homogeneous camera poses with shape (n_interp * (n - 1), 4, 4).
    """

    def poses_to_points(poses, dist):
        """Converts from pose matrices to (position, lookat, up) format."""
        pos = poses[:, :3, -1]
        lookat = poses[:, :3, -1] - dist * poses[:, :3, 2]
        up = poses[:, :3, -1] + dist * poses[:, :3, 1]
        return np.stack([pos, lookat, up], 1)

    def points_to_poses(points):
        """Converts from (position, lookat, up) format to pose matrices."""
        return np.array([viewmatrix(p - l, u - p, p) for p, l, u in points])

    def interp(points, n, k, s):
        """Runs multidimensional B-spline interpolation on the input points."""
        sh = points.shape
        pts = np.reshape(points, (sh[0], -1))
        k = min(k, sh[0] - 1)
        tck, _ = scipy.interpolate.splprep(pts.T, k=k, s=s)
        u = np.linspace(0, 1, n, endpoint=False)
        new_points = np.array(scipy.interpolate.splev(u, tck))
        new_points = np.reshape(new_points.T, (n, sh[1], sh[2]))
        return new_points

    def viewmatrix(lookdir, up, position):
        """Construct lookat view matrix."""
        vec2 = normalize(lookdir)
        vec0 = normalize(np.cross(up, vec2))
        vec1 = normalize(np.cross(vec2, vec0))
        m = np.stack([vec0, vec1, vec2, position], axis=1)
        return m

    def normalize(x):
        """Normalization helper function."""
        return x / np.linalg.norm(x)

    points = poses_to_points(poses, dist=rot_weight)
    new_points = interp(points,
                        n_interp * (points.shape[0] - 1),
                        k=spline_degree,
                        s=smoothness)
    new_poses = points_to_poses(new_points)
    poses_tensor = torch.from_numpy(new_poses)
    # Append a homogeneous [0, 0, 0, 1] row to every interpolated pose
    # (new_poses.shape[0] rather than n_interp, so this also holds for n > 2).
    extra_row = torch.tensor(np.repeat([[0, 0, 0, 1]], new_poses.shape[0], axis=0), dtype=torch.float32).unsqueeze(1)
    poses_final = torch.cat([poses_tensor, extra_row], dim=1)

    return poses_final

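# Minimal usage sketch for interpolate_poses_spline (hypothetical keyframes,
# not part of the repository): interpolate 25 poses between two 3x4
# camera-to-world matrices.
#
#   start = np.hstack([np.eye(3), np.zeros((3, 1))])
#   end = np.hstack([np.eye(3), np.array([[0.5], [0.0], [0.0]])])
#   path = interpolate_poses_spline(np.stack([start, end]), n_interp=25)
#   path.shape  # torch.Size([25, 4, 4]) -- homogeneous poses along the spline
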
def interp_traj(c2ws: torch.Tensor, n_inserts: int = 25, device='cuda') -> torch.Tensor:
    """Interpolates a smooth trajectory through a sequence of c2w poses.

    Each consecutive pose pair is connected by a spline segment of n_inserts
    poses; the last pose of each segment is dropped to avoid duplicates, and
    the final keyframe is appended at the end.
    """
    n_poses = c2ws.shape[0]
    interpolated_poses = []

    for i in range(n_poses - 1):
        start_pose = c2ws[i]
        end_pose = c2ws[i + 1]
        interpolated_path = interpolate_poses_spline(torch.stack([start_pose, end_pose])[:, :3, :].cpu().numpy(), n_inserts).to(device)
        interpolated_path = interpolated_path[:-1]
        interpolated_poses.append(interpolated_path)

    interpolated_poses.append(c2ws[-1:])
    full_path = torch.cat(interpolated_poses, dim=0)

    return full_path

def generate_traj_interp(c2ws, H, W, fs, c, ns, device):
    """Builds PyTorch3D cameras along a spline-interpolated pose trajectory."""
    c2ws = interp_traj(c2ws, n_inserts=ns, device=device)
    num_views = c2ws.shape[0]
    R, T = c2ws[:, :3, :3], c2ws[:, :3, 3:]
    R = torch.stack([-R[:, :, 0], -R[:, :, 1], R[:, :, 2]], 2)  # from RDF to LUF for Rotation
    new_c2w = torch.cat([R, T], 2)
    w2c = torch.linalg.inv(torch.cat((new_c2w, torch.Tensor([[[0, 0, 0, 1]]]).to(device).repeat(new_c2w.shape[0], 1, 1)), 1))
    R_new, T_new = w2c[:, :3, :3].permute(0, 2, 1), w2c[:, :3, 3]  # convert R to row-major matrix
    image_size = ((H, W),)  # (h, w)

    # Interpolate per-view intrinsics (focal lengths and principal points)
    # so they match the length of the interpolated pose path.
    fs = interpolate_sequence(fs, ns - 2, device=device)
    c = interpolate_sequence(c, ns - 2, device=device)
    cameras = PerspectiveCameras(focal_length=fs, principal_point=c, in_ndc=False, image_size=image_size, R=R_new, T=T_new, device=device)

    return cameras, num_views

def generate_traj_specified(c2ws_anchor,H,W,fs,c,theta, phi,d_r,d_x,d_y,frame,device):
    # Initialize a camera.
    """
@@ -314,6 +412,8 @@ def interpolate_poses(start_pose: torch.Tensor, end_pose: torch.Tensor, focus_po
    path = torch.stack(inserted_c2ws)
    return path



def inv(mat):
""" Invert a torch or numpy matrix
"""
@@ -554,6 +654,4 @@ def center_crop_image(input_image):
    ])

    input_image = transformer(input_image)
    return input_image