Spaces:

yanboding
/

MTVCrafter

Runtime error

yanboding committed on May 29, 2025

Commit

9d9257a

verified ·

1 Parent(s): c3941ce

Update inference_engine.py

Files changed (1) hide show

inference_engine.py CHANGED Viewed

@@ -9,6 +9,7 @@ from torchvision.transforms import ToPILImage, transforms, InterpolationMode, fu
 import numpy as np
 import pickle
 import copy
 from draw_pose import get_pose_images
 from utils import concat_images_grid, sample_video, get_sample_indexes, get_new_height_width
@@ -18,7 +19,7 @@ def run_inference(device, motion_data_path, ref_image_path='', dst_width=512, ds
     normalize = transforms.Normalize([0.5], [0.5])
     pretrained_model_path = "THUDM/CogVideoX-5b"
     transformer_path = "yanboding/MTVCrafter/MV-DiT/CogVideoX"
-    tokenizer_path = "mp_rank_00_model_states.pt"
     with open(motion_data_path, 'rb') as f:
         data_list = pickle.load(f)
@@ -38,6 +39,11 @@ def run_inference(device, motion_data_path, ref_image_path='', dst_width=512, ds
     pipe.vae.enable_slicing()
     # load VQVAE
     state_dict = torch.load(tokenizer_path, map_location="cpu")
     motion_encoder = Encoder(in_channels=3, mid_channels=[128, 512], out_channels=3072, downsample_time=[2, 2], downsample_joint=[1, 1])
     motion_quant = VectorQuantizer(nb_code=8192, code_dim=3072, is_train=False)

 import numpy as np
 import pickle
 import copy
+from huggingface_hub import hf_hub_download
 from draw_pose import get_pose_images
 from utils import concat_images_grid, sample_video, get_sample_indexes, get_new_height_width
     normalize = transforms.Normalize([0.5], [0.5])
     pretrained_model_path = "THUDM/CogVideoX-5b"
     transformer_path = "yanboding/MTVCrafter/MV-DiT/CogVideoX"
+    tokenizer_path = "4DMoT/mp_rank_00_model_states.pt"
     with open(motion_data_path, 'rb') as f:
         data_list = pickle.load(f)
     pipe.vae.enable_slicing()
     # load VQVAE
+    vqvae_model_path = hf_hub_download(
+        repo_id="yanboding/MTVCrafter",
+        filename="4DMoT/mp_rank_00_model_states.pt"
+    )
     state_dict = torch.load(tokenizer_path, map_location="cpu")
     motion_encoder = Encoder(in_channels=3, mid_channels=[128, 512], out_channels=3072, downsample_time=[2, 2], downsample_joint=[1, 1])
     motion_quant = VectorQuantizer(nb_code=8192, code_dim=3072, is_train=False)