Fix raymarching early-termination threshold and weights_sum gradient (ashawkey#77); add camera offset option

niki-amini-naieni · Jun 29, 2022 · aeaa996 · aeaa996
1 parent 39ed3bc
commit aeaa996
Show file tree

Hide file tree

Showing 9 changed files with 95 additions and 62 deletions.
diff --git a/dnerf/provider.py b/dnerf/provider.py
@@ -15,12 +15,12 @@
 
 
 # ref: https://github.com/NVlabs/instant-ngp/blob/b76004c8cf478880227401ae763be4c02f80b62f/include/neural-graphics-primitives/nerf_loader.h#L50
-def nerf_matrix_to_ngp(pose, scale=0.33):
+def nerf_matrix_to_ngp(pose, scale=0.33, offset=[0, 0, 0]):
  # for the fox dataset, 0.33 scales camera radius to ~ 2
  new_pose = np.array([
- [pose[1, 0], -pose[1, 1], -pose[1, 2], pose[1, 3] * scale],
- [pose[2, 0], -pose[2, 1], -pose[2, 2], pose[2, 3] * scale],
- [pose[0, 0], -pose[0, 1], -pose[0, 2], pose[0, 3] * scale],
+ [pose[1, 0], -pose[1, 1], -pose[1, 2], pose[1, 3] * scale + offset[0]],
+ [pose[2, 0], -pose[2, 1], -pose[2, 2], pose[2, 3] * scale + offset[1]],
+ [pose[0, 0], -pose[0, 1], -pose[0, 2], pose[0, 3] * scale + offset[2]],
  [0, 0, 0, 1],
  ], dtype=np.float32)
  return new_pose
@@ -30,8 +30,9 @@ def visualize_poses(poses, size=0.1):
  # poses: [B, 4, 4]
 
  axes = trimesh.creation.axis(axis_length=4)
- sphere = trimesh.creation.icosphere(radius=1)
- objects = [axes, sphere]
+ box = trimesh.primitives.Box(extents=(2, 2, 2)).as_outline()
+ box.colors = np.array([[128, 128, 128]] * len(box.entities))
+ objects = [axes, box]
 
  for pose in poses:
  # a camera is visualized with 8 line segments.
@@ -41,7 +42,11 @@ def visualize_poses(poses, size=0.1):
  c = pos - size * pose[:3, 0] - size * pose[:3, 1] + size * pose[:3, 2]
  d = pos + size * pose[:3, 0] - size * pose[:3, 1] + size * pose[:3, 2]
 
- segs = np.array([[pos, a], [pos, b], [pos, c], [pos, d], [a, b], [b, c], [c, d], [d, a]])
+ dir = (a + b + c + d) / 4 - pos
+ dir = dir / (np.linalg.norm(dir) + 1e-8)
+ o = pos + dir * 3
+
+ segs = np.array([[pos, a], [pos, b], [pos, c], [pos, d], [a, b], [b, c], [c, d], [d, a], [pos, o]])
  segs = trimesh.load_path(segs)
  objects.append(segs)
 
@@ -96,6 +101,7 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
  self.root_path = opt.path
  self.preload = opt.preload # preload data into GPU
  self.scale = opt.scale # camera radius scale to make sure camera are inside the bounding box.
+ self.offset = opt.offset # camera offset
  self.bound = opt.bound # bounding box half length, also used as the radius to random sample poses.
  self.fp16 = opt.fp16 # if preload, load into fp16.
 
@@ -158,19 +164,24 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
 
  # choose two random poses, and interpolate between.
  f0, f1 = np.random.choice(frames, 2, replace=False)
- pose0 = nerf_matrix_to_ngp(np.array(f0['transform_matrix'], dtype=np.float32), scale=self.scale) # [4, 4]
- pose1 = nerf_matrix_to_ngp(np.array(f1['transform_matrix'], dtype=np.float32), scale=self.scale) # [4, 4]
+ pose0 = nerf_matrix_to_ngp(np.array(f0['transform_matrix'], dtype=np.float32), scale=self.scale, offset=self.offset) # [4, 4]
+ pose1 = nerf_matrix_to_ngp(np.array(f1['transform_matrix'], dtype=np.float32), scale=self.scale, offset=self.offset) # [4, 4]
+ time0 = f0['time'] if 'time' in f0 else 0
+ time1 = f1['time'] if 'time' in f1 else 0
  rots = Rotation.from_matrix(np.stack([pose0[:3, :3], pose1[:3, :3]]))
  slerp = Slerp([0, 1], rots)
 
  self.poses = []
  self.images = None
+ self.times = []
  for i in range(n_test + 1):
  ratio = np.sin(((i / n_test) - 0.5) * np.pi) * 0.5 + 0.5
  pose = np.eye(4, dtype=np.float32)
  pose[:3, :3] = slerp(ratio).as_matrix()
  pose[:3, 3] = (1 - ratio) * pose0[:3, 3] + ratio * pose1[:3, 3]
  self.poses.append(pose)
+ time = (1 - ratio) * time0 + ratio * time1
+ self.times.append(time)
 
  else:
  # for colmap, manually split a valid set (the first frame).
@@ -196,7 +207,7 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
  continue
 
  pose = np.array(f['transform_matrix'], dtype=np.float32) # [4, 4]
- pose = nerf_matrix_to_ngp(pose, scale=self.scale)
+ pose = nerf_matrix_to_ngp(pose, scale=self.scale, offset=self.offset)
 
  image = cv2.imread(f_path, cv2.IMREAD_UNCHANGED) # [H, W, 3] o [H, W, 4]
  if self.H is None or self.W is None:
@@ -218,7 +229,7 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
  if 'time' in f:
  time = f['time']
  else:
- time = 0 # assume static scene
+ time = int(os.path.basename(f['file_path'])[:-4]) # no 'time' field: use the frame index parsed from the file name as time
 
  self.poses.append(pose)
  self.images.append(image)
@@ -228,6 +239,7 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
  if self.images is not None:
  self.images = torch.from_numpy(np.stack(self.images, axis=0)) # [N, H, W, C]
  self.times = torch.from_numpy(np.asarray(self.times, dtype=np.float32)).view(-1, 1) # [N, 1]
+ self.times = self.times / self.times.max() # normalize to [0, 1]
 
  # calculate mean radius of all camera poses
  self.radius = self.poses[:, :3, 3].norm(dim=-1).mean(0).item()
@@ -240,7 +252,7 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
  self.error_map = None
 
  # [debug] uncomment to view all training poses.
- visualize_poses(self.poses.numpy())
+ # visualize_poses(self.poses.numpy())
 
  # [debug] uncomment to view examples of randomly generated poses.
  # visualize_poses(rand_poses(100, self.device, radius=self.radius).cpu().numpy())
@@ -271,8 +283,8 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
  else:
  raise RuntimeError('Failed to load focal length, please check the transforms.json!')
 
- cx = (transform['cx'] / downscale) if 'cx' in transform else (self.H / 2)
- cy = (transform['cy'] / downscale) if 'cy' in transform else (self.W / 2)
+ cx = (transform['cx'] / downscale) if 'cx' in transform else (self.W / 2)
+ cy = (transform['cy'] / downscale) if 'cy' in transform else (self.H / 2)
 
  self.intrinsics = np.array([fl_x, fl_y, cx, cy])
 

diff --git a/main_CCNeRF.py b/main_CCNeRF.py
@@ -43,6 +43,7 @@
  parser.add_argument('--preload', action='store_true', help="preload all data into GPU, accelerate training but use more GPU memory")
  parser.add_argument('--bound', type=float, default=1, help="assume the scene is bounded in box[-bound, bound]^3, if > 1, will invoke adaptive ray marching.")
  parser.add_argument('--scale', type=float, default=0.33, help="scale camera location into box[-bound, bound]^3")
+ parser.add_argument('--offset', type=float, nargs='*', default=[0, 0, 0], help="offset of camera location")
  parser.add_argument('--dt_gamma', type=float, default=0, help="dt_gamma (>=0) for adaptive ray marching. set to 0 to disable, >0 to accelerate rendering (but usually with worse quality)")
  parser.add_argument('--min_near', type=float, default=0.2, help="minimum near distance for camera")
  parser.add_argument('--density_thresh', type=float, default=10, help="threshold for density grid to be occupied")

diff --git a/main_dnerf.py b/main_dnerf.py
@@ -28,7 +28,7 @@
  parser.add_argument('--cuda_ray', action='store_true', help="use CUDA raymarching instead of pytorch")
  parser.add_argument('--max_steps', type=int, default=1024, help="max num steps sampled per ray (only valid when using --cuda_ray)")
  parser.add_argument('--update_extra_interval', type=int, default=100, help="iter interval to update extra status (only valid when using --cuda_ray)")
- parser.add_argument('--num_steps', type=int, default=256, help="num steps sampled per ray (only valid when NOT using --cuda_ray)")
+ parser.add_argument('--num_steps', type=int, default=128, help="num steps sampled per ray (only valid when NOT using --cuda_ray)")
  parser.add_argument('--upsample_steps', type=int, default=0, help="num steps up-sampled per ray (only valid when NOT using --cuda_ray)")
  parser.add_argument('--max_ray_batch', type=int, default=4096, help="batch size of rays at inference to avoid OOM (only valid when NOT using --cuda_ray)")
 
@@ -43,6 +43,7 @@
  # (the default value is for the fox dataset)
  parser.add_argument('--bound', type=float, default=2, help="assume the scene is bounded in box[-bound, bound]^3, if > 1, will invoke adaptive ray marching.")
  parser.add_argument('--scale', type=float, default=0.33, help="scale camera location into box[-bound, bound]^3")
+ parser.add_argument('--offset', type=float, nargs='*', default=[0, 0, 0], help="offset of camera location")
  parser.add_argument('--dt_gamma', type=float, default=1/128, help="dt_gamma (>=0) for adaptive ray marching. set to 0 to disable, >0 to accelerate rendering (but usually with worse quality)")
  parser.add_argument('--min_near', type=float, default=0.2, help="minimum near distance for camera")
  parser.add_argument('--density_thresh', type=float, default=10, help="threshold for density grid to be occupied")

diff --git a/main_nerf.py b/main_nerf.py
@@ -42,6 +42,7 @@
  # (the default value is for the fox dataset)
  parser.add_argument('--bound', type=float, default=2, help="assume the scene is bounded in box[-bound, bound]^3, if > 1, will invoke adaptive ray marching.")
  parser.add_argument('--scale', type=float, default=0.33, help="scale camera location into box[-bound, bound]^3")
+ parser.add_argument('--offset', type=float, nargs='*', default=[0, 0, 0], help="offset of camera location")
  parser.add_argument('--dt_gamma', type=float, default=1/128, help="dt_gamma (>=0) for adaptive ray marching. set to 0 to disable, >0 to accelerate rendering (but usually with worse quality)")
  parser.add_argument('--min_near', type=float, default=0.2, help="minimum near distance for camera")
  parser.add_argument('--density_thresh', type=float, default=10, help="threshold for density grid to be occupied")

diff --git a/main_tensoRF.py b/main_tensoRF.py
@@ -42,6 +42,7 @@
  # (the default value is for the fox dataset)
  parser.add_argument('--bound', type=float, default=2, help="assume the scene is bounded in box[-bound, bound]^3, if > 1, will invoke adaptive ray marching.")
  parser.add_argument('--scale', type=float, default=0.33, help="scale camera location into box[-bound, bound]^3")
+ parser.add_argument('--offset', type=float, nargs='*', default=[0, 0, 0], help="offset of camera location")
  parser.add_argument('--dt_gamma', type=float, default=1/128, help="dt_gamma (>=0) for adaptive ray marching. set to 0 to disable, >0 to accelerate rendering (but usually with worse quality)")
  parser.add_argument('--min_near', type=float, default=0.2, help="minimum near distance for camera")
  parser.add_argument('--density_thresh', type=float, default=10, help="threshold for density grid to be occupied")

diff --git a/nerf/provider.py b/nerf/provider.py
@@ -2,6 +2,7 @@
 import cv2
 import glob
 import json
+from cv2 import transform
 import tqdm
 import numpy as np
 from scipy.spatial.transform import Slerp, Rotation
@@ -11,16 +12,16 @@
 import torch
 from torch.utils.data import DataLoader
 
-from .utils import get_rays, srgb_to_linear
+from .utils import get_rays, srgb_to_linear, torch_vis_2d
 
 
 # ref: https://github.com/NVlabs/instant-ngp/blob/b76004c8cf478880227401ae763be4c02f80b62f/include/neural-graphics-primitives/nerf_loader.h#L50
-def nerf_matrix_to_ngp(pose, scale=0.33):
+def nerf_matrix_to_ngp(pose, scale=0.33, offset=[0, 0, 0]):
  # for the fox dataset, 0.33 scales camera radius to ~ 2
  new_pose = np.array([
- [pose[1, 0], -pose[1, 1], -pose[1, 2], pose[1, 3] * scale],
- [pose[2, 0], -pose[2, 1], -pose[2, 2], pose[2, 3] * scale],
- [pose[0, 0], -pose[0, 1], -pose[0, 2], pose[0, 3] * scale],
+ [pose[1, 0], -pose[1, 1], -pose[1, 2], pose[1, 3] * scale + offset[0]],
+ [pose[2, 0], -pose[2, 1], -pose[2, 2], pose[2, 3] * scale + offset[1]],
+ [pose[0, 0], -pose[0, 1], -pose[0, 2], pose[0, 3] * scale + offset[2]],
  [0, 0, 0, 1],
  ], dtype=np.float32)
  return new_pose
@@ -30,8 +31,9 @@ def visualize_poses(poses, size=0.1):
  # poses: [B, 4, 4]
 
  axes = trimesh.creation.axis(axis_length=4)
- sphere = trimesh.creation.icosphere(radius=1)
- objects = [axes, sphere]
+ box = trimesh.primitives.Box(extents=(2, 2, 2)).as_outline()
+ box.colors = np.array([[128, 128, 128]] * len(box.entities))
+ objects = [axes, box]
 
  for pose in poses:
  # a camera is visualized with 8 line segments.
@@ -41,7 +43,11 @@ def visualize_poses(poses, size=0.1):
  c = pos - size * pose[:3, 0] - size * pose[:3, 1] + size * pose[:3, 2]
  d = pos + size * pose[:3, 0] - size * pose[:3, 1] + size * pose[:3, 2]
 
- segs = np.array([[pos, a], [pos, b], [pos, c], [pos, d], [a, b], [b, c], [c, d], [d, a]])
+ dir = (a + b + c + d) / 4 - pos
+ dir = dir / (np.linalg.norm(dir) + 1e-8)
+ o = pos + dir * 3
+
+ segs = np.array([[pos, a], [pos, b], [pos, c], [pos, d], [a, b], [b, c], [c, d], [d, a], [pos, o]])
  segs = trimesh.load_path(segs)
  objects.append(segs)
 
@@ -96,6 +102,7 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
  self.root_path = opt.path
  self.preload = opt.preload # preload data into GPU
  self.scale = opt.scale # camera radius scale to make sure camera are inside the bounding box.
+ self.offset = opt.offset # camera offset
  self.bound = opt.bound # bounding box half length, also used as the radius to random sample poses.
  self.fp16 = opt.fp16 # if preload, load into fp16.
 
@@ -158,8 +165,8 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
 
  # choose two random poses, and interpolate between.
  f0, f1 = np.random.choice(frames, 2, replace=False)
- pose0 = nerf_matrix_to_ngp(np.array(f0['transform_matrix'], dtype=np.float32), scale=self.scale) # [4, 4]
- pose1 = nerf_matrix_to_ngp(np.array(f1['transform_matrix'], dtype=np.float32), scale=self.scale) # [4, 4]
+ pose0 = nerf_matrix_to_ngp(np.array(f0['transform_matrix'], dtype=np.float32), scale=self.scale, offset=self.offset) # [4, 4]
+ pose1 = nerf_matrix_to_ngp(np.array(f1['transform_matrix'], dtype=np.float32), scale=self.scale, offset=self.offset) # [4, 4]
  rots = Rotation.from_matrix(np.stack([pose0[:3, :3], pose1[:3, :3]]))
  slerp = Slerp([0, 1], rots)
 
@@ -193,7 +200,7 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
  continue
 
  pose = np.array(f['transform_matrix'], dtype=np.float32) # [4, 4]
- pose = nerf_matrix_to_ngp(pose, scale=self.scale)
+ pose = nerf_matrix_to_ngp(pose, scale=self.scale, offset=self.offset)
 
  image = cv2.imread(f_path, cv2.IMREAD_UNCHANGED) # [H, W, 3] o [H, W, 4]
  if self.H is None or self.W is None:
@@ -259,8 +266,8 @@ def __init__(self, opt, device, type='train', downscale=1, n_test=10):
  else:
  raise RuntimeError('Failed to load focal length, please check the transforms.json!')
 
- cx = (transform['cx'] / downscale) if 'cx' in transform else (self.H / 2)
- cy = (transform['cy'] / downscale) if 'cy' in transform else (self.W / 2)
+ cx = (transform['cx'] / downscale) if 'cx' in transform else (self.W / 2)
+ cy = (transform['cy'] / downscale) if 'cy' in transform else (self.H / 2)
 
  self.intrinsics = np.array([fl_x, fl_y, cx, cy])
 

diff --git a/raymarching/src/raymarching.cu b/raymarching/src/raymarching.cu
@@ -546,21 +546,20 @@ __global__ void kernel_composite_rays_train_forward(
  const scalar_t alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
  const scalar_t weight = alpha * T;
 
- // minimal remained transmittence
- // NOTE: uncomment it won't affect instant-ngp, but totally breaks TensoRF...
- //if (weight < 1e-4f) break;
-
  r += weight * rgbs[0];
  g += weight * rgbs[1];
  b += weight * rgbs[2];
-
+ 
  t += deltas[1]; // real delta
  d += weight * t;
-
+ 
  ws += weight;
-
+ 
  T *= 1.0f - alpha;
 
+ // stop early once the remaining transmittance is negligible
+ if (T < 1e-4f) break;
+
  //printf("[n=%d] num_steps=%d, alpha=%f, w=%f, T=%f, sum_dt=%f, d=%f\n", n, step, alpha, weight, T, sum_delta, d);
 
  // locate
@@ -650,15 +649,17 @@ __global__ void kernel_composite_rays_train_backward(
  const scalar_t alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
  const scalar_t weight = alpha * T;
 
- //if (weight < 1e-4f) break;
-
  r += weight * rgbs[0];
  g += weight * rgbs[1];
  b += weight * rgbs[2];
  ws += weight;
 
  T *= 1.0f - alpha;
 
+ // stop early once the remaining transmittance is negligible
+ if (T < 1e-4f) break;
+
+ // check https://note.kiui.moe/others/nerf_gradient/ for the gradient calculation.
  // write grad_rgbs
  grad_rgbs[0] = grad_image[0] * weight;
  grad_rgbs[1] = grad_image[1] * weight;
@@ -669,7 +670,7 @@ __global__ void kernel_composite_rays_train_backward(
  grad_image[0] * (T * rgbs[0] - (r_final - r)) + 
  grad_image[1] * (T * rgbs[1] - (g_final - g)) + 
  grad_image[2] * (T * rgbs[2] - (b_final - b)) +
- grad_weights_sum[0] * (T - (ws_final - ws))
+ grad_weights_sum[0] * (1 - ws_final)
  );
 
  //printf("[n=%d] num_steps=%d, T=%f, grad_sigmas=%f, r_final=%f, r=%f\n", n, step, T, grad_sigmas[0], r_final, r);