Move the Wan pipeline to CUDA at ZeroGPU inference time
- app.py +4 -2
- app_3rd/spatrack_utils/infer_track.py +18 -18
- inference.py +15 -15
- models/SpaTrackV2/models/utils.py +59 -59
- models/SpaTrackV2/models/vggt4track/models/tracker_front.py +7 -7
- models/SpaTrackV2/models/vggt4track/models/vggt.py +1 -1
- models/SpaTrackV2/models/vggt4track/models/vggt_moe.py +5 -5
- models/vggt/vggt/models/tracker_front.py +7 -7
- models/vggt/vggt/models/vggt.py +1 -1
- models/vggt/vggt/models/vggt_moe.py +5 -5
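
What the commit actually changes: `wan_pipeline.to("cuda")` is deleted at module scope in app.py (old line 106) and re-added inside `run_wan_ttm_generation` (new line 296), and the VGGT forward passes are wrapped in `torch.amp.autocast('cuda', ...)` contexts. Deferring the `.to("cuda")` call is the usual pattern for ZeroGPU Spaces, where the CUDA device is generally only usable while a GPU-decorated handler is running. A minimal sketch of that pattern, assuming the app uses the `spaces.GPU` decorator; the model id and function name below are illustrative and not taken from this repo:

    import spaces          # HuggingFace ZeroGPU helper (assumption: the Space uses it)
    import torch
    from diffusers import DiffusionPipeline

    # Load weights on CPU at startup; no CUDA work happens at import time.
    pipe = DiffusionPipeline.from_pretrained(
        "placeholder/model-id",            # illustrative id, not the real checkpoint
        torch_dtype=torch.bfloat16,
    )

    @spaces.GPU(duration=120)              # a GPU is attached only while this runs
    def generate(prompt: str):
        pipe.to("cuda")                    # move to CUDA at inference time, as in this commit
        with torch.inference_mode():
            return pipe(prompt)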

app.py
CHANGED

@@ -103,7 +103,6 @@ wan_pipeline = WanImageToVideoTTMPipeline.from_pretrained(
)
wan_pipeline.vae.enable_tiling()
wan_pipeline.vae.enable_slicing()
-wan_pipeline.to("cuda")




@@ -218,7 +217,7 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
video_input = preprocess_image(video_tensor)[None].cuda()

with torch.no_grad():
-with torch.
+with torch.amp.autocast('cuda', dtype=torch.bfloat16):
predictions = vggt4track_model(video_input / 255)
extrinsic = predictions["poses_pred"]
intrinsic = predictions["intrs"]
@@ -293,6 +292,9 @@ def run_wan_ttm_generation(prompt, tweak_index, tstrong_index, first_frame_path,
"毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
)

+
+wan_pipeline.to("cuda")
+
# Match resolution logic from run_wan.py
max_area = 480 * 832
mod_value = wan_pipeline.vae_scale_factor_spatial * \
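
The other recurring change in this commit is the inference context: the VGGT forward pass now runs under `torch.amp.autocast('cuda', dtype=torch.bfloat16)` nested inside `torch.no_grad()` (the line it replaces is truncated in this view). A self-contained sketch of that pattern with a stand-in module; the toy model and tensor shapes are illustrative only:

    import torch

    # Stand-in for the tracking model; only the no_grad/autocast pattern matters here.
    model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.GELU()).cuda().eval()
    video_input = torch.rand(1, 8, 64, device="cuda") * 255   # pretend raw frames

    with torch.no_grad():                                  # no autograd graph at inference
        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
            out = model(video_input / 255)                 # matmuls execute in bfloat16

    print(out.dtype)   # torch.bfloat16 inside the autocast region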

app_3rd/spatrack_utils/infer_track.py
CHANGED

@@ -34,13 +34,13 @@ def get_tracker_predictor(output_dir: str, vo_points: int = 756, tracker_model=N
"""
viz = True
os.makedirs(output_dir, exist_ok=True)
-
+
with open(config["cfg_dir"], "r") as f:
cfg = yaml.load(f, Loader=yaml.FullLoader)
cfg = easydict.EasyDict(cfg)
cfg.out_dir = output_dir
cfg.model.track_num = vo_points
-
+
# Check if it's a local path or HuggingFace repo
if tracker_model is not None:
model = tracker_model
@@ -60,8 +60,8 @@ def get_tracker_predictor(output_dir: str, vo_points: int = 756, tracker_model=N
model = Predictor.from_pretrained(checkpoint_path, model_cfg=cfg["model"])
model.eval()
model.to("cuda")
-
-viser = Visualizer(save_dir=cfg.out_dir, grayscale=True,
+
+viser = Visualizer(save_dir=cfg.out_dir, grayscale=True,
fps=10, pad_value=0, tracks_leave_trace=5)

return model, viser
@@ -83,11 +83,11 @@ def run_tracker(model, viser, temp_dir, video_name, grid_size, vo_points, fps=3)
mask_path = os.path.join(temp_dir, f"{video_name}.png")
out_dir = os.path.join(temp_dir, "results")
os.makedirs(out_dir, exist_ok=True)
-
+
# Load video using decord
video_reader = decord.VideoReader(video_path)
video_tensor = torch.from_numpy(video_reader.get_batch(range(len(video_reader))).asnumpy()).permute(0, 3, 1, 2) # Convert to tensor and permute to (N, C, H, W)
-
+
# resize make sure the shortest side is 336
h, w = video_tensor.shape[2:]
scale = max(336 / h, 336 / w)
@@ -99,7 +99,7 @@ def run_tracker(model, viser, temp_dir, video_name, grid_size, vo_points, fps=3)
intrs = None
extrs = None
data_npz_load = {}
-
+
# Load and process mask
if os.path.exists(mask_path):
mask = cv2.imread(mask_path)
@@ -107,20 +107,20 @@ def run_tracker(model, viser, temp_dir, video_name, grid_size, vo_points, fps=3)
mask = mask.sum(axis=-1)>0
else:
mask = np.ones_like(video_tensor[0,0].numpy())>0
-
+
# Get frame dimensions and create grid points
frame_H, frame_W = video_tensor.shape[2:]
grid_pts = get_points_on_a_grid(grid_size, (frame_H, frame_W), device="cpu")
-
+
# Sample mask values at grid points and filter out points where mask=0
if os.path.exists(mask_path):
grid_pts_int = grid_pts[0].long()
mask_values = mask[grid_pts_int[...,1], grid_pts_int[...,0]]
grid_pts = grid_pts[:, mask_values]
-
+
query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].numpy()

-# run vggt
+# run vggt
if os.environ.get("VGGT_DIR", None) is not None:
vggt_model = VGGT()
vggt_model.load_state_dict(torch.load(VGGT_DIR))
@@ -128,7 +128,7 @@ def run_tracker(model, viser, temp_dir, video_name, grid_size, vo_points, fps=3)
vggt_model = vggt_model.to("cuda")
# process the image tensor
video_tensor = preprocess_image(video_tensor)[None]
-with torch.
+with torch.amp.autocast('cuda', dtype=torch.bfloat16):
# Predict attributes including cameras, depth maps, and point maps.
aggregated_tokens_list, ps_idx = vggt_model.aggregator(video_tensor.cuda()/255)
pose_enc = vggt_model.camera_head(aggregated_tokens_list)[-1]
@@ -154,12 +154,12 @@ def run_tracker(model, viser, temp_dir, video_name, grid_size, vo_points, fps=3)
c2w_traj, intrs, point_map, conf_depth,
track3d_pred, track2d_pred, vis_pred, conf_pred, video
) = model.forward(video_tensor, depth=depth_tensor,
-intrs=intrs, extrs=extrs,
+intrs=intrs, extrs=extrs,
queries=query_xyt,
fps=1, full_point=False, iters_track=4,
query_no_BA=True, fixed_cam=False, stage=1,
-support_frame=len(video_tensor)-1, replace_ratio=0.2)
-
+support_frame=len(video_tensor)-1, replace_ratio=0.2)
+
# Resize results to avoid too large I/O Burden
max_size = 336
h, w = video.shape[2:]
@@ -174,12 +174,12 @@ def run_tracker(model, viser, temp_dir, video_name, grid_size, vo_points, fps=3)
if depth_tensor is not None:
depth_tensor = T.Resize((new_h, new_w))(depth_tensor)
conf_depth = T.Resize((new_h, new_w))(conf_depth)
-
+
# Visualize tracks
viser.visualize(video=video[None],
tracks=track2d_pred[None][...,:2],
visibility=vis_pred[None],filename="test")
-
+
# Save in tapip3d format
data_npz_load["coords"] = (torch.einsum("tij,tnj->tni", c2w_traj[:,:3,:3], track3d_pred[:,:,:3].cpu()) + c2w_traj[:,:3,3][:,None,:]).numpy()
data_npz_load["extrinsics"] = torch.inverse(c2w_traj).cpu().numpy()
@@ -190,5 +190,5 @@ def run_tracker(model, viser, temp_dir, video_name, grid_size, vo_points, fps=3)
data_npz_load["confs"] = conf_pred.cpu().numpy()
data_npz_load["confs_depth"] = conf_depth.cpu().numpy()
np.savez(os.path.join(out_dir, f'result.npz'), **data_npz_load)
-
+
print(f"Results saved to {out_dir}.\nTo visualize them with tapip3d, run: [bold yellow]python tapip3d_viz.py {out_dir}/result.npz[/bold yellow]")

inference.py
CHANGED

@@ -38,7 +38,7 @@ if __name__ == "__main__":
# fps
fps = int(args.fps)
mask_dir = args.data_dir + f"/{args.video_name}.png"
-
+
vggt4track_model = VGGT4Track.from_pretrained("Yuxihenry/SpatialTrackerV2_Front")
vggt4track_model.eval()
vggt4track_model = vggt4track_model.to("cuda")
@@ -66,12 +66,12 @@ if __name__ == "__main__":
# process the image tensor
video_tensor = preprocess_image(video_tensor)[None]
with torch.no_grad():
-with torch.
+with torch.amp.autocast('cuda', dtype=torch.bfloat16):
# Predict attributes including cameras, depth maps, and point maps.
predictions = vggt4track_model(video_tensor.cuda()/255)
extrinsic, intrinsic = predictions["poses_pred"], predictions["intrs"]
depth_map, depth_conf = predictions["points_map"][..., 2], predictions["unc_metric"]
-
+
depth_tensor = depth_map.squeeze().cpu().numpy()
extrs = np.eye(4)[None].repeat(len(depth_tensor), axis=0)
extrs = extrinsic.squeeze().cpu().numpy()
@@ -82,7 +82,7 @@ if __name__ == "__main__":
unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5

data_npz_load = {}
-
+
if os.path.exists(mask_dir):
mask_files = mask_dir
mask = cv2.imread(mask_files)
@@ -90,11 +90,11 @@ if __name__ == "__main__":
mask = mask.sum(axis=-1)>0
else:
mask = np.ones_like(video_tensor[0,0].numpy())>0
-
+
# get all data pieces
viz = True
os.makedirs(out_dir, exist_ok=True)
-
+
# with open(cfg_dir, "r") as f:
# cfg = yaml.load(f, Loader=yaml.FullLoader)
# cfg = easydict.EasyDict(cfg)
@@ -108,12 +108,12 @@ if __name__ == "__main__":

# config the model; the track_num is the number of points in the grid
model.spatrack.track_num = args.vo_points
-
+
model.eval()
model.to("cuda")
-viser = Visualizer(save_dir=out_dir, grayscale=True,
+viser = Visualizer(save_dir=out_dir, grayscale=True,
fps=10, pad_value=0, tracks_leave_trace=5)
-
+
grid_size = args.grid_size

# get frame H W
@@ -124,13 +124,13 @@ if __name__ == "__main__":
else:
frame_H, frame_W = video_tensor.shape[2:]
grid_pts = get_points_on_a_grid(grid_size, (frame_H, frame_W), device="cpu")
-
+
# Sample mask values at grid points and filter out points where mask=0
if os.path.exists(mask_dir):
grid_pts_int = grid_pts[0].long()
mask_values = mask[grid_pts_int[...,1], grid_pts_int[...,0]]
grid_pts = grid_pts[:, mask_values]
-
+
query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].numpy()

# Run model inference
@@ -139,12 +139,12 @@ if __name__ == "__main__":
c2w_traj, intrs, point_map, conf_depth,
track3d_pred, track2d_pred, vis_pred, conf_pred, video
) = model.forward(video_tensor, depth=depth_tensor,
-intrs=intrs, extrs=extrs,
+intrs=intrs, extrs=extrs,
queries=query_xyt,
fps=1, full_point=False, iters_track=4,
query_no_BA=True, fixed_cam=False, stage=1, unc_metric=unc_metric,
-support_frame=len(video_tensor)-1, replace_ratio=0.2)
-
+support_frame=len(video_tensor)-1, replace_ratio=0.2)
+
# resize the results to avoid too large I/O Burden
# depth and image, the maximum side is 336
max_size = 336
@@ -169,7 +169,7 @@ if __name__ == "__main__":
tracks=track2d_pred[None][...,:2],
visibility=vis_pred[None],filename="test")

-# save as the tapip3d format
+# save as the tapip3d format
data_npz_load["coords"] = (torch.einsum("tij,tnj->tni", c2w_traj[:,:3,:3], track3d_pred[:,:,:3].cpu()) + c2w_traj[:,:3,3][:,None,:]).numpy()
data_npz_load["extrinsics"] = torch.inverse(c2w_traj).cpu().numpy()
data_npz_load["intrinsics"] = intrs.cpu().numpy()

models/SpaTrackV2/models/utils.py
CHANGED

@@ -95,7 +95,7 @@ class AverageMeter(object):
return fmtstr.format(**self.__dict__)


-def procrustes_analysis(X0,X1): # [N,3]
+def procrustes_analysis(X0,X1): # [N,3]
# translation
t0 = X0.mean(dim=0,keepdim=True)
t1 = X1.mean(dim=0,keepdim=True)
@@ -218,7 +218,7 @@ def get_EFP(pred_cameras, image_size, B, S, default_focal=False):

intrinsics = create_intri_matrix(focal_length, principal_point)
return extrinsics, intrinsics
-
+
def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
"""
Convert rotations given as quaternions to rotation matrices.
@@ -278,7 +278,7 @@ def pose_encoding_to_camera(
# Now converted back
focal_length = (log_focal_length + log_focal_length_bias).exp()
# clamp to avoid weird fl values
-focal_length = torch.clamp(focal_length,
+focal_length = torch.clamp(focal_length,
min=min_focal_length, max=max_focal_length)
elif pose_encoding_type == "absT_quaR_OneFL":
# 3 for absT, 4 for quaR, 1 for absFL
@@ -287,7 +287,7 @@ def pose_encoding_to_camera(
quaternion_R = pose_encoding_reshaped[:, 3:7]
R = quaternion_to_matrix(quaternion_R)
focal_length = pose_encoding_reshaped[:, 7:8]
-focal_length = torch.clamp(focal_length,
+focal_length = torch.clamp(focal_length,
min=min_focal_length, max=max_focal_length)
else:
raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
@@ -316,7 +316,7 @@ def pose_encoding_to_camera(

R = extrinsics_4x4[:, :3, :3].clone()
abs_T = extrinsics_4x4[:, :3, 3].clone()
-
+
if return_dict:
return {"focal_length": focal_length, "R": R, "T": abs_T}

@@ -326,7 +326,7 @@ def pose_encoding_to_camera(


def camera_to_pose_encoding(
-camera, pose_encoding_type="absT_quaR_logFL",
+camera, pose_encoding_type="absT_quaR_logFL",
log_focal_length_bias=1.8, min_focal_length=0.1, max_focal_length=30
):
"""
@@ -359,7 +359,7 @@ def camera_to_pose_encoding(
return pose_encoding


-def init_pose_enc(B: int,
+def init_pose_enc(B: int,
S: int, pose_encoding_type: str="absT_quaR_logFL",
device: Optional[torch.device]=None):
"""
@@ -378,7 +378,7 @@ def init_pose_enc(B: int,
C = 8
else:
raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
-
+
pose_enc = torch.zeros(B, S, C, device=device)
pose_enc[..., :3] = 0 # absT
pose_enc[..., 3] = 1 # quaR
@@ -389,7 +389,7 @@ def first_pose_enc_norm(pose_enc: torch.Tensor,
pose_encoding_type: str="absT_quaR_OneFL",
pose_mode: str = "W2C"):
"""
-make sure the poses in on window are normalized by the first frame, where the
+make sure the poses in on window are normalized by the first frame, where the
first frame transformation is the Identity Matrix.
NOTE: Poses are all W2C
args:
@@ -403,23 +403,23 @@ def first_pose_enc_norm(pose_enc: torch.Tensor,
pose_enc, pose_encoding_type=pose_encoding_type,
to_OpenCV=False
) #NOTE: the camera parameters are not in NDC
-
+
R = pred_cameras.R # [B*S, 3, 3]
T = pred_cameras.T # [B*S, 3]
-
+
Tran_M = torch.cat([R, T.unsqueeze(-1)], dim=-1) # [B*S, 3, 4]
extra_ = torch.tensor([[[0, 0, 0, 1]]],
device=Tran_M.device).expand(Tran_M.shape[0], -1, -1)
Tran_M = torch.cat([Tran_M, extra_
], dim=1)
Tran_M = rearrange(Tran_M, '(b s) c d -> b s c d', b=B)
-
+
# Take the first frame as the base of world coordinate
if pose_mode == "C2W":
Tran_M_new = (Tran_M[:,:1,...].inverse())@Tran_M
elif pose_mode == "W2C":
Tran_M_new = Tran_M@(Tran_M[:,:1,...].inverse())
-
+
Tran_M_new = rearrange(Tran_M_new, 'b s c d -> (b s) c d')

R_ = Tran_M_new[:, :3, :3]
@@ -429,7 +429,7 @@ def first_pose_enc_norm(pose_enc: torch.Tensor,
pred_cameras.R = R_
pred_cameras.T = T_
pose_enc_norm = camera_to_pose_encoding(pred_cameras,
-pose_encoding_type=pose_encoding_type)
+pose_encoding_type=pose_encoding_type)
pose_enc_norm = rearrange(pose_enc_norm, '(b s) c -> b s c', b=B)
return pose_enc_norm

@@ -439,7 +439,7 @@ def first_pose_enc_denorm(
pose_encoding_type: str="absT_quaR_OneFL",
pose_mode: str = "W2C"):
"""
-make sure the poses in on window are de-normalized by the first frame, where the
+make sure the poses in on window are de-normalized by the first frame, where the
first frame transformation is the Identity Matrix.
args:
pose_enc: [B S C]
@@ -457,7 +457,7 @@ def first_pose_enc_denorm(
) #NOTE: the camera parameters are not in NDC
R = pred_cameras.R # [B*(1+S), 3, 3]
T = pred_cameras.T # [B*(1+S), 3]
-
+
Tran_M = torch.cat([R, T.unsqueeze(-1)], dim=-1) # [B*(1+S), 3, 4]
extra_ = torch.tensor([[[0, 0, 0, 1]]],
device=Tran_M.device).expand(Tran_M.shape[0], -1, -1)
@@ -470,7 +470,7 @@ def first_pose_enc_denorm(
Tran_M_new = Tran_M_1st@Tran_M_new
elif pose_mode == "W2C":
Tran_M_new = Tran_M_new@Tran_M_1st
-
+
Tran_M_new_ = torch.cat([Tran_M_1st, Tran_M_new], dim=1)
R_ = Tran_M_new_[..., :3, :3].view(-1, 3, 3)
T_ = Tran_M_new_[..., :3, 3].view(-1, 3)
@@ -481,7 +481,7 @@ def first_pose_enc_denorm(

# Cameras to Pose encoding
pose_enc_denorm = camera_to_pose_encoding(pred_cameras,
-pose_encoding_type=pose_encoding_type)
+pose_encoding_type=pose_encoding_type)
pose_enc_denorm = rearrange(pose_enc_denorm, '(b s) c -> b s c', b=B)
return pose_enc_denorm[:, 1:]

@@ -560,7 +560,7 @@ def median_loss(prediction, target, mask, Bs):
target_nm, a_norm_gt, b_norm_gt = normalize_prediction_robust(target.float(), mask, Bs)
depth_loss = nn.functional.l1_loss(prediction_nm[mask], target_nm[mask])
scale = b_norm_gt/b_norm
-shift = a_norm_gt - a_norm*scale
+shift = a_norm_gt - a_norm*scale
return depth_loss, scale, shift, prediction_nm, target_nm

def reduction_batch_based(image_loss, M):
@@ -593,7 +593,7 @@ class ScaleAndShiftInvariantLoss(nn.Module):

def forward(self, prediction, target, mask, Bs,
interpolate=True, return_interpolated=False):
-
+
if prediction.shape[-1] != target.shape[-1] and interpolate:
prediction = nn.functional.interpolate(prediction, target.shape[-2:], mode='bilinear', align_corners=True)
intr_input = prediction
@@ -602,7 +602,7 @@ class ScaleAndShiftInvariantLoss(nn.Module):

prediction, target, mask = prediction.squeeze(), target.squeeze(), mask.squeeze()
assert prediction.shape == target.shape, f"Shape mismatch: Expected same shape but got {prediction.shape} and {target.shape}."
-
+

scale, shift = compute_scale_and_shift(prediction, target, mask)
a_norm = scale.view(Bs, -1, 1, 1).mean(dim=1, keepdim=True)
@@ -634,7 +634,7 @@ class GradientLoss(nn.Module):

for scale in range(self.__scales):
step = pow(2, scale)
-l1_ln, a_nm, b_nm = ScaleAndShiftInvariantLoss_fn(prediction[:, ::step, ::step],
+l1_ln, a_nm, b_nm = ScaleAndShiftInvariantLoss_fn(prediction[:, ::step, ::step],
target[:, ::step, ::step], mask[:, ::step, ::step], 1)
total += l1_ln
a_nm = a_nm.squeeze().detach() # [B, 1, 1]
@@ -663,7 +663,7 @@ def gradient_loss(prediction, target, mask, reduction=reduction_batch_based):
image_loss = torch.sum(grad_x, (1, 2)) + torch.sum(grad_y, (1, 2))

return reduction(image_loss, M)
-
+
def loss_fn(
poses_preds: List[torch.Tensor],
poses_pred_all: List[torch.Tensor],
@@ -700,7 +700,7 @@ def loss_fn(
if logger is not None:
if poses_preds_ij.max()>5e1:
logger.info(f"pose_pred_max_and_mean: {poses_preds_ij.max(), poses_preds_ij.mean()}")
-
+
trans_loss = (poses_preds_ij[...,:3] - poses_gt_i_norm[...,:3]).abs().sum(dim=-1).mean()
rot_loss = (poses_preds_ij[...,3:7] - poses_gt_i_norm[...,3:7]).abs().sum(dim=-1).mean()
focal_loss = (poses_preds_ij[...,7:] - poses_gt_i_norm[...,7:]).abs().sum(dim=-1).mean()
@@ -714,7 +714,7 @@ def loss_fn(
logger_tf.add_scalar(f"loss@pose/rot_iter{idx}",
rot_loss, global_step=global_step)
logger_tf.add_scalar(f"loss@pose/focal_iter{idx}",
-focal_loss, global_step=global_step)
+focal_loss, global_step=global_step)
# compute the uncertainty loss
with torch.no_grad():
pose_loss_dist = (poses_preds_ij-poses_gt_i_norm).detach().abs()
@@ -726,9 +726,9 @@ def loss_fn(
unc_loss,
global_step=global_step)
# if logger is not None:
-# logger.info(f"pose_loss: {pose_loss}, unc_loss: {unc_loss}")
+# logger.info(f"pose_loss: {pose_loss}, unc_loss: {unc_loss}")
# total loss
-loss_total += 0.1*unc_loss + 2*pose_loss
+loss_total += 0.1*unc_loss + 2*pose_loss

poses_gt_norm = poses_gt
pose_all_loss = 0.0
@@ -743,7 +743,7 @@ def loss_fn(
prev_loss = (trans_loss + rot_loss + focal_loss)
else:
des_loss = (trans_loss + rot_loss + focal_loss) - prev_loss
-prev_loss = trans_loss + rot_loss + focal_loss
+prev_loss = trans_loss + rot_loss + focal_loss
logger_tf.add_scalar(f"loss@global_pose/des_iter{idx}",
des_loss, global_step=global_step)
logger_tf.add_scalar(f"loss@global_pose/trans_iter{idx}",
@@ -751,20 +751,20 @@ def loss_fn(
logger_tf.add_scalar(f"loss@global_pose/rot_iter{idx}",
rot_loss, global_step=global_step)
logger_tf.add_scalar(f"loss@global_pose/focal_iter{idx}",
-focal_loss, global_step=global_step)
+focal_loss, global_step=global_step)
if torch.isnan((trans_loss + rot_loss + focal_loss)).any():
pose_all_loss += 0
else:
pose_all_loss += i_weight*(trans_loss + rot_loss + focal_loss)
-
+
# if logger is not None:
-# logger.info(f"global_pose_loss: {pose_all_loss}")
+# logger.info(f"global_pose_loss: {pose_all_loss}")

# compute the depth loss
if inv_depth_preds[0] is not None:
depths_gt = depths_gt[:,:,0]
msk = depths_gt > 5e-2
-inv_gt = 1.0 / (depths_gt.clamp(1e-3, 1e16))
+inv_gt = 1.0 / (depths_gt.clamp(1e-3, 1e16))
inv_gt_reshp = rearrange(inv_gt, 'b t h w -> (b t) h w')
inv_depth_preds_reshp = rearrange(inv_depth_preds[0], 'b t h w -> (b t) h w')
inv_raw_reshp = rearrange(inv_depth_raw[0], 'b t h w -> (b t) h w')
@@ -785,11 +785,11 @@ def loss_fn(
depth_loss,
global_step=global_step)
# if logger is not None:
-# logger.info(f"opt_depth: {huber_loss_raw - huber_loss}")
+# logger.info(f"opt_depth: {huber_loss_raw - huber_loss}")
else:
depth_loss = 0.0

-
+
loss_total = loss_total/(len(poses_preds)) + 20*depth_loss + pose_all_loss

return loss_total, (huber_loss_raw - huber_loss)
@@ -803,7 +803,7 @@ def vis_depth(x: torch.tensor,
"""
assert len(x.shape) == 2

-depth_map_normalized = cv2.normalize(x.cpu().numpy(),
+depth_map_normalized = cv2.normalize(x.cpu().numpy(),
None, 0, 255, cv2.NORM_MINMAX)
depth_map_colored = cv2.applyColorMap(depth_map_normalized.astype(np.uint8),
cv2.COLORMAP_JET)
@@ -848,7 +848,7 @@ def vis_pcd(
return pcl

def vis_result(rgbs, poses_pred, poses_gt,
-depth_gt, depth_pred, iter_num=0,
+depth_gt, depth_pred, iter_num=0,
vis=None, logger_tf=None, cfg=None):
"""
Args:
@@ -863,7 +863,7 @@ def vis_result(rgbs, poses_pred, poses_gt,
if vis is None:
return
S, _, H, W = depth_gt.shape
-# get the xy
+# get the xy
yx = torch.meshgrid(torch.arange(H).to(depth_pred.device),
torch.arange(W).to(depth_pred.device),indexing='ij')
xy = torch.stack(yx[::-1], dim=0).float().to(depth_pred.device)
@@ -880,7 +880,7 @@ def vis_result(rgbs, poses_pred, poses_gt,
pose_encoding_type="absT_quaR_OneFL",to_OpenCV=False)
poses_pred_vis = pose_encoding_to_camera(poses_pred,
pose_encoding_type="absT_quaR_OneFL",to_OpenCV=False)
-
+
R_gt = poses_gt_vis.R.float()
R_pred = poses_pred_vis.R.float()
T_gt = poses_gt_vis.T.float()
@@ -890,8 +890,8 @@ def vis_result(rgbs, poses_pred, poses_gt,
T_gt_c2w = (-R_gt_c2w @ T_gt[:, :, None]).squeeze(-1)
R_pred_c2w = R_pred.permute(0,2,1)
T_pred_c2w = (-R_pred_c2w @ T_pred[:, :, None]).squeeze(-1)
-with torch.
-pick_idx = torch.randperm(S)[:min(24, S)]
+with torch.amp.autocast('cuda', enabled=False):
+pick_idx = torch.randperm(S)[:min(24, S)]
# pick_idx = [1]
#NOTE: very strange that the camera need C2W Rotation and W2C translation as input
poses_gt_vis = PerspectiveCamerasVisual(
@@ -922,9 +922,9 @@ def vis_result(rgbs, poses_pred, poses_gt,
fig = plot_scene(visual_dict, camera_scale=0.05)
vis.plotlyplot(fig, env=env_name, win="3D")
vis.save([env_name])
-
+
return
-
+
def depth2pcd(
xy_depth: torch.Tensor,
focal_length: torch.Tensor,
@@ -953,7 +953,7 @@ def depth2pcd(
K_inv = K.inverse()
# xyz
xyz = xy_depth.view(S, -1, 3).permute(0, 2, 1) # S 3 (H W)
-depth = xyz[:, 2:].clone() # S (H W) 1
+depth = xyz[:, 2:].clone() # S (H W) 1
xyz[:, 2] = 1
xyz = K_inv @ xyz # S 3 (H W)
xyz = xyz * depth
@@ -963,29 +963,29 @@
return xyz


-def pose_enc2mat(poses_pred,
+def pose_enc2mat(poses_pred,
H_resize, W_resize, resolution=336):
"""
This function convert the pose encoding into `intrinsic` and `extrinsic`

Args:
poses_pred: B T 8
-Return:
+Return:
Intrinsic B T 3 3
Extrinsic B T 4 4
"""
B, T, _ = poses_pred.shape
focal_pred = poses_pred[:, :, -1].clone()
-pos_quat_preds = poses_pred[:, :, :7].clone()
-pos_quat_preds = pos_quat_preds.view(B*T, -1)
-# get extrinsic
+pos_quat_preds = poses_pred[:, :, :7].clone()
+pos_quat_preds = pos_quat_preds.view(B*T, -1)
+# get extrinsic
c2w_rot = quaternion_to_matrix(pos_quat_preds[:, 3:])
c2w_tran = pos_quat_preds[:, :3]
c2w_traj = torch.eye(4)[None].repeat(B*T, 1, 1).to(poses_pred.device)
c2w_traj[:, :3, :3], c2w_traj[:, :3, 3] = c2w_rot, c2w_tran
c2w_traj = c2w_traj.view(B, T, 4, 4)
# get intrinsic
-fxs, fys = focal_pred*resolution, focal_pred*resolution
+fxs, fys = focal_pred*resolution, focal_pred*resolution
intrs = torch.eye(3).to(c2w_traj.device).to(c2w_traj.dtype)[None, None].repeat(B, T, 1, 1)
intrs[:,:,0,0], intrs[:,:,1,1] = fxs, fys
intrs[:,:,0,2], intrs[:,:,1,2] = W_resize/2, H_resize/2
@@ -1001,7 +1001,7 @@ def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
positive_mask = x > 0
ret[positive_mask] = torch.sqrt(x[positive_mask])
return ret
-
+
def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
"""
Convert a unit quaternion to a standard form: one in which the real
@@ -1086,11 +1086,11 @@ def meshgrid2d(B, Y, X, stack=False, norm=False, device="cuda"):
return grid
else:
return grid_y, grid_x
-
+
def get_points_on_a_grid(grid_size, interp_shape,
grid_center=(0, 0), device="cuda"):
if grid_size == 1:
-return torch.tensor([interp_shape[1] / 2,
+return torch.tensor([interp_shape[1] / 2,
interp_shape[0] / 2], device=device)[
None, None
]
@@ -1114,12 +1114,12 @@ def get_points_on_a_grid(grid_size, interp_shape,
xy = torch.stack([grid_x, grid_y], dim=-1).to(device)
return xy

-def normalize_rgb(x,input_size=224,
+def normalize_rgb(x,input_size=224,
resize_mode: Literal['resize', 'padding'] = 'resize',
if_da=False):
"""
normalize the image for depth anything input
-
+
args:
x: the input images [B T C H W]
"""
@@ -1127,8 +1127,8 @@ def normalize_rgb(x,input_size=224,
x = torch.from_numpy(x) / 255.0
elif isinstance(x, torch.Tensor):
x = x / 255.0
-B, T, C, H, W = x.shape
-x = x.view(B * T, C, H, W)
+B, T, C, H, W = x.shape
+x = x.view(B * T, C, H, W)
Resizer = Resize(
width=input_size,
height=input_size,
@@ -1136,7 +1136,7 @@ def normalize_rgb(x,input_size=224,
keep_aspect_ratio=True,
ensure_multiple_of=14,
resize_method='lower_bound',
-)
+)
if resize_mode == 'padding':
# zero padding to make the input size to be multiple of 14
if H > W:
@@ -1160,7 +1160,7 @@ def normalize_rgb(x,input_size=224,
x = F.interpolate(x, size=(int(H_scale), int(W_scale)),
mode='bicubic', align_corners=True)
# get the mean and std
-__mean__ = torch.tensor([0.485,
+__mean__ = torch.tensor([0.485,
0.456, 0.406]).view(1, 3, 1, 1).to(x.device)
__std__ = torch.tensor([0.229,
0.224, 0.225]).view(1, 3, 1, 1).to(x.device)
@@ -1168,7 +1168,7 @@ def normalize_rgb(x,input_size=224,
if if_da:
x = (x - __mean__) / __std__
else:
-x = x
+x = x
return x.view(B, T, C, x.shape[-2], x.shape[-1])

def get_track_points(H, W, T, device, size=100, support_frame=0,
models/SpaTrackV2/models/vggt4track/models/tracker_front.py
CHANGED
|
@@ -75,15 +75,15 @@ class FrontTracker(nn.Module, PyTorchModelHubMixin):
|
|
| 75 |
B, T, C, H, W = images.shape
|
| 76 |
images = (images - self.base_model.image_mean) / self.base_model.image_std
|
| 77 |
H_14 = H // 14 * 14
|
| 78 |
-
W_14 = W // 14 * 14
|
| 79 |
image_14 = F.interpolate(images.view(B*T, C, H, W), (H_14, W_14), mode="bilinear", align_corners=False, antialias=True).view(B, T, C, H_14, W_14)
|
| 80 |
|
| 81 |
with torch.no_grad():
|
| 82 |
-
features = self.base_model.backbone.get_intermediate_layers(rearrange(image_14, 'b t c h w -> (b t) c h w'),
+            features = self.base_model.backbone.get_intermediate_layers(rearrange(image_14, 'b t c h w -> (b t) c h w'),
                 self.base_model.intermediate_layers, return_class_token=True)
         # aggregate the features with checkpoint
         aggregated_tokens_list, patch_start_idx = self.aggregator(image_14, patch_tokens=features[-1][0])
-
+
         # enhance the features
         enhanced_features = []
         for layer_i, layer in enumerate(self.intermediate_layers):
@@ -94,7 +94,7 @@ class FrontTracker(nn.Module, PyTorchModelHubMixin):

         predictions = {}

-        with torch.
+        with torch.amp.autocast('cuda', enabled=False):
             if self.camera_head is not None:
                 pose_enc_list = self.camera_head(aggregated_tokens_list)
                 predictions["pose_enc"] = pose_enc_list[-1] # pose encoding of the last iteration
@@ -104,7 +104,7 @@ class FrontTracker(nn.Module, PyTorchModelHubMixin):
         # Predict points (and mask) with checkpoint
         output = self.base_model.head(enhanced_features, image_14)
         points, mask = output
-
+
         # Post-process points and mask
         points, mask = points.permute(0, 2, 3, 1), mask.squeeze(1)
         points = self.base_model._remap_points(points) # slightly improves the performance in case of very large output values
@@ -119,13 +119,13 @@ class FrontTracker(nn.Module, PyTorchModelHubMixin):
         if self.training:
             loss = compute_loss(predictions, annots)
             predictions["loss"] = loss
-
+
         # rescale the points
         if self.scale_head is not None:
             points_scale = points * predictions["scale"].view(B*T, 1, 1, 2)[..., :1]
             points_scale[..., 2:] += predictions["scale"].view(B*T, 1, 1, 2)[..., 1:]
             predictions["points_map"] = points_scale
-
+
         predictions["poses_pred"] = torch.eye(4)[None].repeat(predictions["images"].shape[1], 1, 1)[None]
         predictions["poses_pred"][:,:,:3,:4], predictions["intrs"] = pose_encoding_to_extri_intri(predictions["pose_enc_list"][-1],
                                                 predictions["images"].shape[-2:])
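
Across these model files, the functional change is the same one applied in app.py: the autocast context manager is switched to the torch.amp entry point, with the device type passed as the first argument. The removed lines are truncated to "with torch." in this diff view; they presumably held the older torch.cuda.amp.autocast(...) spelling, which recent PyTorch releases flag as deprecated. A minimal sketch of the two spellings (illustrative only, not part of the commit; camera_head_region is an invented name):

import torch

def camera_head_region(tokens: torch.Tensor) -> torch.Tensor:
    # Deprecated spelling (what the truncated "-" lines presumably contained):
    #     with torch.cuda.amp.autocast(enabled=False):
    # Current spelling, as added by this commit:
    with torch.amp.autocast('cuda', enabled=False):
        return tokens @ tokens.transpose(-1, -2)

if torch.cuda.is_available():
    out = camera_head_region(torch.randn(2, 8, 8, device='cuda'))
    print(out.dtype)  # float32: autocast is disabled inside the block
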
models/SpaTrackV2/models/vggt4track/models/vggt.py
CHANGED
@@ -64,7 +64,7 @@ class VGGT(nn.Module, PyTorchModelHubMixin):

         predictions = {}

-        with torch.
+        with torch.amp.autocast('cuda', enabled=False):
             if self.camera_head is not None:
                 pose_enc_list = self.camera_head(aggregated_tokens_list)
                 predictions["pose_enc"] = pose_enc_list[-1] # pose encoding of the last iteration
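
The enabled=False argument is preserved by the change: the camera-head branch still opts out of mixed precision, so pose regression runs in float32 even when the surrounding inference code enters a bfloat16 autocast region (as app.py does). A small illustration of that nesting behaviour, with invented tensor names:

import torch

if torch.cuda.is_available():
    feats = torch.randn(2, 16, 16, device='cuda')
    with torch.amp.autocast('cuda', dtype=torch.bfloat16):
        low = feats @ feats.transpose(-1, -2)       # under autocast -> bfloat16
        with torch.amp.autocast('cuda', enabled=False):
            full = feats @ feats.transpose(-1, -2)  # autocast disabled -> float32
    print(low.dtype, full.dtype)  # torch.bfloat16 torch.float32
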
models/SpaTrackV2/models/vggt4track/models/vggt_moe.py
CHANGED
@@ -65,13 +65,13 @@ class VGGT4Track(nn.Module, PyTorchModelHubMixin):

         if len(images.shape) == 4:
             images = images.unsqueeze(0)
-
+
         with torch.no_grad():
             aggregated_tokens_list, patch_start_idx = self.aggregator(images_proc)

         predictions = {}

-        with torch.
+        with torch.amp.autocast('cuda', enabled=False):
             if self.camera_head is not None:
                 pose_enc_list = self.camera_head(aggregated_tokens_list)
                 predictions["pose_enc"] = pose_enc_list[-1] # pose encoding of the last iteration
@@ -97,11 +97,11 @@ class VGGT4Track(nn.Module, PyTorchModelHubMixin):
                                    size=(H, W), mode='bilinear', align_corners=True).permute(0,2,3,1)
         predictions["unc_metric"] = F.interpolate(predictions["unc_metric"][:,None],
                                    size=(H, W), mode='bilinear', align_corners=True)[:,0]
-        predictions["intrs"][..., :1, :] *= W/W_proc
-        predictions["intrs"][..., 1:2, :] *= H/H_proc
+        predictions["intrs"][..., :1, :] *= W/W_proc
+        predictions["intrs"][..., 1:2, :] *= H/H_proc

         if self.training:
             loss = compute_loss(predictions, annots)
             predictions["loss"] = loss
-
+
         return predictions
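
The two intrs lines (identical on the - and + sides here, so likely a whitespace-only edit) scale the predicted intrinsics from the processed resolution back to the caller's resolution: the first row of K (fx, cx) follows W/W_proc and the second row (fy, cy) follows H/H_proc. A hedged sketch of the same bookkeeping on a single 3x3 matrix, with example resolutions that are not taken from the repo:

import torch

H, W = 480, 832            # original frame size (example values)
H_proc, W_proc = 518, 518  # size the network ran at (example values)

K = torch.tensor([[400.0, 0.0, 259.0],
                  [0.0, 400.0, 259.0],
                  [0.0, 0.0, 1.0]])  # intrinsics at the processed resolution

K_scaled = K.clone()
K_scaled[:1, :] *= W / W_proc   # fx and cx follow the width scale
K_scaled[1:2, :] *= H / H_proc  # fy and cy follow the height scale
print(K_scaled)
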
models/vggt/vggt/models/tracker_front.py
CHANGED
@@ -75,15 +75,15 @@ class FrontTracker(nn.Module, PyTorchModelHubMixin):
         B, T, C, H, W = images.shape
         images = (images - self.base_model.image_mean) / self.base_model.image_std
         H_14 = H // 14 * 14
-        W_14 = W // 14 * 14
+        W_14 = W // 14 * 14
         image_14 = F.interpolate(images.view(B*T, C, H, W), (H_14, W_14), mode="bilinear", align_corners=False, antialias=True).view(B, T, C, H_14, W_14)

         with torch.no_grad():
-            features = self.base_model.backbone.get_intermediate_layers(rearrange(image_14, 'b t c h w -> (b t) c h w'),
+            features = self.base_model.backbone.get_intermediate_layers(rearrange(image_14, 'b t c h w -> (b t) c h w'),
                 self.base_model.intermediate_layers, return_class_token=True)
         # aggregate the features with checkpoint
         aggregated_tokens_list, patch_start_idx = self.aggregator(image_14, patch_tokens=features[-1][0])
-
+
         # enhance the features
         enhanced_features = []
         for layer_i, layer in enumerate(self.intermediate_layers):
@@ -94,7 +94,7 @@ class FrontTracker(nn.Module, PyTorchModelHubMixin):

         predictions = {}

-        with torch.
+        with torch.amp.autocast('cuda', enabled=False):
             if self.camera_head is not None:
                 pose_enc_list = self.camera_head(aggregated_tokens_list)
                 predictions["pose_enc"] = pose_enc_list[-1] # pose encoding of the last iteration
@@ -104,7 +104,7 @@ class FrontTracker(nn.Module, PyTorchModelHubMixin):
         # Predict points (and mask) with checkpoint
         output = self.base_model.head(enhanced_features, image_14)
         points, mask = output
-
+
         # Post-process points and mask
         points, mask = points.permute(0, 2, 3, 1), mask.squeeze(1)
         points = self.base_model._remap_points(points) # slightly improves the performance in case of very large output values
@@ -119,13 +119,13 @@ class FrontTracker(nn.Module, PyTorchModelHubMixin):
         if self.training:
             loss = compute_loss(predictions, annots)
             predictions["loss"] = loss
-
+
         # rescale the points
         if self.scale_head is not None:
             points_scale = points * predictions["scale"].view(B*T, 1, 1, 2)[..., :1]
             points_scale[..., 2:] += predictions["scale"].view(B*T, 1, 1, 2)[..., 1:]
             predictions["points_map"] = points_scale
-
+
         predictions["poses_pred"] = torch.eye(4)[None].repeat(predictions["images"].shape[1], 1, 1)[None]
         predictions["poses_pred"][:,:,:3,:4], predictions["intrs"] = pose_encoding_to_extri_intri(predictions["pose_enc_list"][-1],
                                                 predictions["images"].shape[-2:])
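
The interpolation at the top of FrontTracker.forward snaps the input to the nearest lower multiple of 14 before it reaches the ViT backbone, whose patch embedding expects spatial sizes divisible by its 14-pixel patch. A small standalone sketch of that rounding and resize, with made-up shapes:

import torch
import torch.nn.functional as F

B, T, C, H, W = 1, 4, 3, 480, 832            # example shapes, not from the repo
images = torch.randn(B, T, C, H, W)

H_14 = H // 14 * 14                           # 476
W_14 = W // 14 * 14                           # 826
image_14 = F.interpolate(images.view(B * T, C, H, W), (H_14, W_14),
                         mode="bilinear", align_corners=False, antialias=True)
image_14 = image_14.view(B, T, C, H_14, W_14)
print(image_14.shape, H_14 % 14, W_14 % 14)   # both remainders are 0
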
models/vggt/vggt/models/vggt.py
CHANGED
@@ -64,7 +64,7 @@ class VGGT(nn.Module, PyTorchModelHubMixin):

         predictions = {}

-        with torch.
+        with torch.amp.autocast('cuda', enabled=False):
             if self.camera_head is not None:
                 pose_enc_list = self.camera_head(aggregated_tokens_list)
                 predictions["pose_enc"] = pose_enc_list[-1] # pose encoding of the last iteration
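
As the inline comment notes, the camera head returns one pose encoding per refinement iteration and only the last entry is kept in the output dict. A toy sketch of that pattern; the head below is invented for illustration and is not the repo's CameraHead:

import torch
import torch.nn as nn

class ToyCameraHead(nn.Module):
    """Refines a pose encoding vector for a fixed number of iterations."""
    def __init__(self, dim: int = 9, iters: int = 4):
        super().__init__()
        self.update = nn.Linear(dim, dim)
        self.iters = iters

    def forward(self, tokens: torch.Tensor) -> list:
        pose_enc = tokens.mean(dim=1)        # crude pooling to a per-frame vector
        outputs = []
        for _ in range(self.iters):
            pose_enc = pose_enc + self.update(pose_enc)
            outputs.append(pose_enc)
        return outputs                        # one entry per iteration

head = ToyCameraHead()
pose_enc_list = head(torch.randn(2, 10, 9))
predictions = {"pose_enc": pose_enc_list[-1]}  # keep the last iteration, as in the diff
print(len(pose_enc_list), predictions["pose_enc"].shape)
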
models/vggt/vggt/models/vggt_moe.py
CHANGED
@@ -65,13 +65,13 @@ class VGGT_MoE(nn.Module, PyTorchModelHubMixin):

         if len(images.shape) == 4:
             images = images.unsqueeze(0)
-
+
         with torch.no_grad():
             aggregated_tokens_list, patch_start_idx = self.aggregator(images_proc)

         predictions = {}

-        with torch.
+        with torch.amp.autocast('cuda', enabled=False):
             if self.camera_head is not None:
                 pose_enc_list = self.camera_head(aggregated_tokens_list)
                 predictions["pose_enc"] = pose_enc_list[-1] # pose encoding of the last iteration
@@ -97,11 +97,11 @@ class VGGT_MoE(nn.Module, PyTorchModelHubMixin):
                                    size=(H, W), mode='bilinear', align_corners=True).permute(0,2,3,1)
         predictions["unc_metric"] = F.interpolate(predictions["unc_metric"][:,None],
                                    size=(H, W), mode='bilinear', align_corners=True)[:,0]
-        predictions["intrs"][..., :1, :] *= W/W_proc
-        predictions["intrs"][..., 1:2, :] *= H/H_proc
+        predictions["intrs"][..., :1, :] *= W/W_proc
+        predictions["intrs"][..., 1:2, :] *= H/H_proc

         if self.training:
             loss = compute_loss(predictions, annots)
             predictions["loss"] = loss
-
+
         return predictions
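
The tail of VGGT_MoE.forward resizes the per-pixel outputs from the processed resolution back to the caller's resolution before returning them; for example, the uncertainty map gains a channel dimension, is upsampled bilinearly to (H, W), and the channel is squeezed away again. A minimal sketch of that step, with invented shapes:

import torch
import torch.nn.functional as F

H_proc, W_proc = 518, 518            # resolution the network ran at (example values)
H, W = 480, 832                      # resolution of the caller's frames (example values)

unc_metric = torch.rand(8, H_proc, W_proc)            # (frames, H_proc, W_proc)
unc_up = F.interpolate(unc_metric[:, None],            # add a channel dim for interpolate
                       size=(H, W), mode='bilinear', align_corners=True)[:, 0]
print(unc_up.shape)                   # torch.Size([8, 480, 832])
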