xizaoqu committed · Commit 100414d · 1 Parent(s): f07d258

rm

Changed files:
- app.py +1 -25
- configurations/README.md +0 -7
- configurations/algorithm/base_algo.yaml +0 -3
- configurations/algorithm/base_pytorch_algo.yaml +0 -4
- configurations/algorithm/df_base.yaml +0 -42
- configurations/algorithm/df_video_worldmemminecraft.yaml +0 -42
- configurations/algorithm/pose_prediction.yaml +0 -19
- configurations/config.yaml +0 -16
- configurations/dataset/base_dataset.yaml +0 -3
- configurations/dataset/base_video.yaml +0 -14
- configurations/dataset/video_minecraft.yaml +0 -14
- configurations/dataset/video_minecraft_pose.yaml +0 -14
- configurations/experiment/base_experiment.yaml +0 -2
- configurations/experiment/base_pytorch.yaml +0 -50
- configurations/experiment/exp_pose.yaml +0 -31
- configurations/experiment/exp_video.yaml +0 -31
- datasets/README.md +0 -11
- datasets/__init__.py +0 -1
- datasets/video/__init__.py +0 -2
- datasets/video/base_video_dataset.py +0 -158
- datasets/video/minecraft_video_dataset.py +0 -262
- datasets/video/minecraft_video_dataset_oasis_filter.py +0 -99
- datasets/video/minecraft_video_dataset_pose.py +0 -421
- experiments/README.md +0 -19
- experiments/__init__.py +0 -35
- experiments/exp_base.py +0 -473
- experiments/exp_pose.py +0 -310
- experiments/exp_video.py +0 -25
- main.py +0 -219
- scripts/README.md +0 -10
- scripts/dummy_script.sh +0 -1
- split_checkpoint.py +0 -9
app.py
CHANGED
@@ -10,13 +10,8 @@ import hydra
 from omegaconf import DictConfig, OmegaConf
 from omegaconf.omegaconf import open_dict
 
-from utils.print_utils import cyan
-from utils.ckpt_utils import download_latest_checkpoint, is_run_id
-from utils.cluster_utils import submit_slurm_job
-from utils.distributed_utils import is_rank_zero
 import numpy as np
 import torch
-from datasets.video.minecraft_video_dataset import *
 import torchvision.transforms as transforms
 import cv2
 import subprocess
@@ -351,18 +346,7 @@ def set_memory(examples_case, image_display, log_output, slider_denoising_step,
 
     return input_history, out_video[-1], temporal_video_path, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx
 
-
-h1 {
-    text-align: center;
-    display:block;
-}
-"""
-
-def on_select(evt: gr.SelectData):
-    selected_index = evt.index
-    return examples[selected_index]
-
-with gr.Blocks(css=css) as demo:
+with gr.Blocks() as demo:
     gr.Markdown(
         """
         # WORLDMEM: Long-term Consistent World Generation with Memory
@@ -515,13 +499,6 @@ with gr.Blocks(css=css) as demo:
     example_case = gr.Textbox(label="Case", visible=False)
     image_output = gr.Image(visible=False)
 
-    # gr.Examples(examples=example_images,
-    #             inputs=[example_case, image_output, log_output, slider_denoising_step, slider_context_length, slider_memory_length],
-    #             fn=set_memory,
-    #             outputs=[log_output, image_display, video_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx],
-    #             cache_examples=True
-    #             )
-
     examples = gr.Examples(
         examples=example_images,
         inputs=[example_case, image_output, log_output, slider_denoising_step, slider_context_length, slider_memory_length],
@@ -534,7 +511,6 @@ with gr.Blocks(css=css) as demo:
         outputs=[log_output, image_display, video_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx]
     )
 
-
     submit_button.click(generate, inputs=[input_box, log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx], outputs=[image_display, video_display, log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
     reset_btn.click(reset, inputs=[selected_image], outputs=[log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
     image_display_1.select(lambda: on_image_click(SUNFLOWERS_IMAGE), outputs=[log_output, selected_image, image_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])

configurations/README.md
DELETED
@@ -1,7 +0,0 @@
-# configurations
-
-We use [Hydra](https://hydra.cc/docs/intro/) to manage configurations. Change/Add the yaml files in this folder
-to change the default configurations. You can also override the default configurations by
-passing command line arguments.
-
-All configurations are automatically saved in wandb run.

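For orientation, the deleted YAML tree above was consumed through Hydra's composition mechanism the README describes. Below is a minimal sketch of a Hydra entrypoint under this layout; the repo's real entrypoint was main.py (also deleted in this commit) and its exact contents are not shown here, so everything beyond the config paths is illustrative.

import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(version_base=None, config_path="configurations", config_name="config")
def main(cfg: DictConfig) -> None:
    # Hydra merges config.yaml with the files selected under `defaults:`;
    # any field can then be overridden on the command line, e.g.
    #   python main.py algorithm=df_video_worldmemminecraft debug=true
    print(OmegaConf.to_yaml(cfg, resolve=False))

if __name__ == "__main__":
    main()
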
configurations/algorithm/base_algo.yaml
DELETED
@@ -1,3 +0,0 @@
-# This will be passed as the cfg to Algo.__init__(cfg) of your algorithm class
-
-debug: ${debug} # inherited from configurations/config.yaml

configurations/algorithm/base_pytorch_algo.yaml
DELETED
@@ -1,4 +0,0 @@
-defaults:
-  - base_algo # inherits from configurations/algorithm/base_algo.yaml
-
-lr: ${experiment.training.lr}

configurations/algorithm/df_base.yaml
DELETED
@@ -1,42 +0,0 @@
-defaults:
-  - base_pytorch_algo
-
-# dataset-dependent configurations
-x_shape: ${dataset.observation_shape}
-frame_stack: 1
-frame_skip: 1
-data_mean: ${dataset.data_mean}
-data_std: ${dataset.data_std}
-external_cond_dim: 0 #${dataset.action_dim}
-context_frames: ${dataset.context_length}
-# training hyperparameters
-weight_decay: 1e-4
-warmup_steps: 10000
-optimizer_beta: [0.9, 0.999]
-# diffusion-related
-uncertainty_scale: 1
-guidance_scale: 0.0
-chunk_size: 1 # -1 for full trajectory diffusion, number to specify diffusion chunk size
-scheduling_matrix: autoregressive
-noise_level: random_all
-causal: True
-
-diffusion:
-  # training
-  objective: pred_x0
-  beta_schedule: cosine
-  schedule_fn_kwargs: {}
-  clip_noise: 20.0
-  use_snr: False
-  use_cum_snr: False
-  use_fused_snr: False
-  snr_clip: 5.0
-  cum_snr_decay: 0.98
-  timesteps: 1000
-  # sampling
-  sampling_timesteps: 50 # fixme, numer of diffusion steps, should be increased
-  ddim_sampling_eta: 1.0
-  stabilization_level: 10
-# architecture
-architecture:
-  network_size: 64

configurations/algorithm/df_video_worldmemminecraft.yaml
DELETED
@@ -1,42 +0,0 @@
-defaults:
-  - df_base
-
-n_frames: ${dataset.n_frames}
-frame_skip: ${dataset.frame_skip}
-metadata: ${dataset.metadata}
-
-# training hyperparameters
-weight_decay: 2e-3
-warmup_steps: 10000
-optimizer_beta: [0.9, 0.99]
-action_cond_dim: 25
-
-diffusion:
-  # training
-  beta_schedule: sigmoid
-  objective: pred_v
-  use_fused_snr: True
-  cum_snr_decay: 0.96
-  clip_noise: 20.
-  # sampling
-  sampling_timesteps: 20
-  ddim_sampling_eta: 0.0
-  stabilization_level: 15
-# architecture
-architecture:
-  network_size: 64
-  attn_heads: 4
-  attn_dim_head: 64
-  dim_mults: [1, 2, 4, 8]
-  resolution: ${dataset.resolution}
-  attn_resolutions: [16, 32, 64, 128]
-  use_init_temporal_attn: True
-  use_linear_attn: True
-  time_emb_type: rotary
-
-metrics:
-  # - fvd
-  # - fid
-  # - lpips
-
-_name: df_video_worldmemminecraft

configurations/algorithm/pose_prediction.yaml
DELETED
@@ -1,19 +0,0 @@
-defaults:
-  - df_base
-
-n_frames: ${dataset.n_frames}
-frame_skip: ${dataset.frame_skip}
-metadata: ${dataset.metadata}
-
-# training hyperparameters
-weight_decay: 2e-3
-warmup_steps: 10000
-optimizer_beta: [0.9, 0.99]
-
-
-metrics:
-  # - fvd
-  # - fid
-  # - lpips
-
-_name: pose_prediction

configurations/config.yaml
DELETED
@@ -1,16 +0,0 @@
-# configuration parsing starts here
-defaults:
-  - experiment: exp_video # experiment yaml file name in configurations/experiments folder [fixme]
-  - dataset: video_minecraft_oasis # dataset yaml file name in configurations/dataset folder [fixme]
-  - algorithm: df_video # algorithm yaml file name in configurations/algorithm folder [fixme]
-  - cluster: null # optional, cluster yaml file name in configurations/cluster folder. Leave null for local compute
-
-debug: false # global debug flag will be passed into configuration of experiment, dataset and algorithm
-
-wandb:
-  entity: xizaoqu # wandb account name / organization name [fixme]
-  project: diffusion-forcing # wandb project name; if not provided, defaults to root folder name [fixme]
-  mode: online # set wandb logging to online, offline or dryrun
-
-resume: null # wandb run id to resume logging and loading checkpoint from
-load: null # wandb run id containing checkpoint or a path to a checkpoint file

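These files lean heavily on OmegaConf interpolation: ${debug} in base_algo.yaml and base_dataset.yaml, ${experiment.training.lr} in base_pytorch_algo.yaml, ${dataset.resolution} in the architecture blocks. A standalone sketch, with made-up values, of how such references resolve once the whole tree is composed:

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "debug": False,
    "experiment": {"training": {"lr": 8e-5}},
    "dataset": {"resolution": 128},
    "algorithm": {
        "lr": "${experiment.training.lr}",   # as in base_pytorch_algo.yaml
        "debug": "${debug}",                 # as in base_algo.yaml
        "architecture": {"resolution": "${dataset.resolution}"},
    },
})

assert cfg.algorithm.lr == 8e-5          # interpolations resolve on access
assert cfg.algorithm.debug is False
assert cfg.algorithm.architecture.resolution == 128
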
configurations/dataset/base_dataset.yaml
DELETED
@@ -1,3 +0,0 @@
-# This will be passed as the cfg to Dataset.__init__(cfg) of your dataset class
-
-debug: ${debug} # inherited from configurations/config.yaml

configurations/dataset/base_video.yaml
DELETED
@@ -1,14 +0,0 @@
-defaults:
-  - base_dataset
-
-metadata: "data/${dataset.name}/metadata.json"
-data_mean: "data/${dataset.name}/data_mean.npy"
-data_std: "data/${dataset.name}/data_std.npy"
-save_dir: ???
-n_frames: 32
-context_length: 4
-resolution: 128
-observation_shape: [3, "${dataset.resolution}", "${dataset.resolution}"]
-external_cond_dim: 0
-validation_multiplier: 1
-frame_skip: 1

configurations/dataset/video_minecraft.yaml
DELETED
@@ -1,14 +0,0 @@
-defaults:
-  - base_video
-
-save_dir: data/minecraft_simple_backforward
-n_frames: 16 # TODO: increase later
-resolution: 128
-data_mean: 0.5
-data_std: 0.5
-action_cond_dim: 25
-context_length: 1
-frame_skip: 1
-validation_multiplier: 1
-
-_name: video_minecraft_oasis

configurations/dataset/video_minecraft_pose.yaml
DELETED
@@ -1,14 +0,0 @@
-defaults:
-  - base_video
-
-save_dir: data/minecraft_simple_backforward
-n_frames: 16 # TODO: increase later
-resolution: 128
-data_mean: 0.5
-data_std: 0.5
-external_cond_dim: 25
-context_length: 1
-frame_skip: 1
-validation_multiplier: 1
-
-_name: video_minecraft_pose

configurations/experiment/base_experiment.yaml
DELETED
@@ -1,2 +0,0 @@
-debug: ${debug} # inherited from configurations/config.yaml
-tasks: [main] # tasks to run sequantially, such as [training, test], useful when your project has multiple stages and you want to run only a subset of them.

configurations/experiment/base_pytorch.yaml
DELETED
@@ -1,50 +0,0 @@
-# inherites from base_experiment.yaml
-# most of the options have docs at https://lightning.ai/docs/pytorch/stable/common/trainer.html
-
-defaults:
-  - base_experiment
-
-tasks: [training] # tasks to run sequantially, change when your project has multiple stages and you want to run only a subset of them.
-num_nodes: 1 # number of gpu servers used in large scale distributed training
-
-training:
-  precision: 16-mixed # set float precision, 16-mixed is faster while 32 is more stable
-  compile: False # whether to compile the model with torch.compile
-  lr: 0.001 # learning rate
-  batch_size: 16 # training batch size; effective batch size is this number * gpu * nodes iff using distributed training
-  max_epochs: 1000 # set to -1 to train forever
-  max_steps: -1 # set to -1 to train forever, will override max_epochs
-  max_time: null # set to something like "00:12:00:00" to enable
-  data:
-    num_workers: 4 # number of CPU threads for data preprocessing.
-    shuffle: True # whether training data will be shuffled
-  optim:
-    accumulate_grad_batches: 1 # accumulate gradients for n batches before backprop
-    gradient_clip_val: 0 # clip gradients with norm above this value, set to 0 to disable
-  checkpointing:
-    # these are arguments to pytorch lightning's callback, `ModelCheckpoint` class
-    every_n_train_steps: 5000 # save a checkpoint every n train steps
-    every_n_epochs: null # mutually exclusive with ``every_n_train_steps`` and ``train_time_interval``
-    train_time_interval: null # in format of "00:12:00:00", mutually exclusive with ``every_n_train_steps`` and ``every_n_epochs``.
-    enable_version_counter: False # If this is ``False``, later checkpoint will be overwrite previous ones.
-
-validation:
-  precision: 16-mixed
-  compile: False # whether to compile the model with torch.compile
-  batch_size: 16 # validation batch size per GPU; effective batch size is this number * gpu * nodes iff using distributed training
-  val_every_n_step: 2000 # controls how frequent do we run validation, can be float (fraction of epoches) or int (steps) or null (if val_every_n_epoch is set)
-  val_every_n_epoch: null # if you want to do validation every n epoches, requires val_every_n_step to be null.
-  limit_batch: null # if null, run through validation set. Otherwise limit the number of batches to use for validation.
-  inference_mode: True # whether to run validation in inference mode (enable_grad won't work!)
-  data:
-    num_workers: 4 # number of CPU threads for data preprocessing, for validation.
-    shuffle: False # whether validation data will be shuffled
-
-test:
-  precision: 16-mixed
-  compile: False # whether to compile the model with torch.compile
-  batch_size: 4 # test batch size per GPU; effective batch size is this number * gpu * nodes iff using distributed training
-  limit_batch: null # if null, run through test set. Otherwise limit the number of batches to use for test.
-  data:
-    num_workers: 4 # number of CPU threads for data preprocessing, for test.
-    shuffle: False # whether test data will be shuffled

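The fields above map almost one-to-one onto pytorch-lightning's Trainer and ModelCheckpoint arguments (per the docs link in the file). A hedged sketch of that wiring, assuming a composed cfg; this is not the repo's deleted experiments/exp_base.py, only an illustration of the mapping:

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

def build_trainer(cfg) -> pl.Trainer:
    checkpoint_cb = ModelCheckpoint(
        every_n_train_steps=cfg.training.checkpointing.every_n_train_steps,
        every_n_epochs=cfg.training.checkpointing.every_n_epochs,
        # note: train_time_interval expects a datetime.timedelta (or None)
        train_time_interval=cfg.training.checkpointing.train_time_interval,
        enable_version_counter=cfg.training.checkpointing.enable_version_counter,
    )
    return pl.Trainer(
        precision=cfg.training.precision,        # e.g. "16-mixed"
        max_epochs=cfg.training.max_epochs,
        max_steps=cfg.training.max_steps,
        max_time=cfg.training.max_time,
        num_nodes=cfg.num_nodes,
        accumulate_grad_batches=cfg.training.optim.accumulate_grad_batches,
        gradient_clip_val=cfg.training.optim.gradient_clip_val,
        val_check_interval=cfg.validation.val_every_n_step,
        check_val_every_n_epoch=cfg.validation.val_every_n_epoch,
        limit_val_batches=cfg.validation.limit_batch,
        inference_mode=cfg.validation.inference_mode,
        callbacks=[checkpoint_cb],
    )
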
configurations/experiment/exp_pose.yaml
DELETED
@@ -1,31 +0,0 @@
-defaults:
-  - base_pytorch
-
-tasks: [training]
-
-training:
-  lr: 8e-5
-  precision: 16-mixed
-  batch_size: 4
-  max_epochs: -1
-  max_steps: 2000005
-  checkpointing:
-    every_n_train_steps: 2500
-  optim:
-    gradient_clip_val: 1.0
-
-validation:
-  val_every_n_step: 300
-  val_every_n_epoch: null
-  batch_size: 4
-  limit_batch: 1
-
-test:
-  limit_batch: 1
-  batch_size: 1
-
-logging:
-  metrics:
-    # - fvd
-    # - fid
-    # - lpips

configurations/experiment/exp_video.yaml
DELETED
@@ -1,31 +0,0 @@
-defaults:
-  - base_pytorch
-
-tasks: [training]
-
-training:
-  lr: 8e-5
-  precision: 16-mixed
-  batch_size: 4
-  max_epochs: -1
-  max_steps: 2000005
-  checkpointing:
-    every_n_train_steps: 2500
-  optim:
-    gradient_clip_val: 1.0
-
-validation:
-  val_every_n_step: 300
-  val_every_n_epoch: null
-  batch_size: 4
-  limit_batch: 1
-
-test:
-  limit_batch: 1
-  batch_size: 1
-
-logging:
-  metrics:
-    # - fvd
-    # - fid
-    # - lpips

datasets/README.md
DELETED
@@ -1,11 +0,0 @@
-The `datasets` folder is used to contain dataset code or environment code.
-Don't store actual data like images here! For those, please use the `data` folder instead of `datasets`.
-
-Create a folder to create your own pytorch dataset definition. Then, update the `__init__.py`
-at every level to register all datasets.
-
-Each dataset class takes in a DictConfig file `cfg` in its `__init__`, which allows you to pass in arguments via configuration file in `configurations/dataset` or [command line override](https://hydra.cc/docs/tutorials/basic/your_first_app/simple_cli/).
-
----
-
-This repo is forked from [Boyuan Chen](https://boyuan.space/)'s research template [repo](https://github.com/buoyancy99/research-template). By its MIT license, you must keep the above sentence in `README.md` and the `LICENSE` file to credit the author.

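Following the convention in the README above (each dataset's __init__ receives the Hydra-composed DictConfig), here is a hypothetical minimal dataset; every name in it is illustrative, not from the repo:

import torch
from omegaconf import DictConfig


class ToyVideoDataset(torch.utils.data.Dataset):
    def __init__(self, cfg: DictConfig, split: str = "training"):
        super().__init__()
        # fields like these come straight from configurations/dataset/*.yaml
        self.resolution = cfg.resolution
        self.n_frames = cfg.n_frames
        self.split = split

    def __len__(self) -> int:
        return 8

    def __getitem__(self, idx: int) -> torch.Tensor:
        # dummy clip of shape (T, C, H, W)
        return torch.zeros(self.n_frames, 3, self.resolution, self.resolution)
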
datasets/__init__.py
DELETED
@@ -1 +0,0 @@
-from .video import MinecraftVideoDataset

datasets/video/__init__.py
DELETED
@@ -1,2 +0,0 @@
-from .minecraft_video_dataset import MinecraftVideoDataset
-from .minecraft_video_dataset_pose import MinecraftVideoPoseDataset

datasets/video/base_video_dataset.py
DELETED
@@ -1,158 +0,0 @@
-from typing import Sequence
-import torch
-import random
-import os
-import numpy as np
-import cv2
-from omegaconf import DictConfig
-from torchvision import transforms
-from pathlib import Path
-from abc import abstractmethod, ABC
-import json
-
-
-class BaseVideoDataset(torch.utils.data.Dataset, ABC):
-    """
-    Base class for video datasets. Videos may be of variable length.
-
-    Folder structure of each dataset:
-    - [save_dir] (specified in config, e.g., data/phys101)
-        - /[split] (one per split)
-            - /data_folder_name (e.g., videos)
-        metadata.json
-    """
-
-    def __init__(self, cfg: DictConfig, split: str = "training"):
-        super().__init__()
-        self.cfg = cfg
-        self.split = split
-        self.resolution = cfg.resolution
-        self.external_cond_dim = cfg.external_cond_dim
-        self.n_frames = (
-            cfg.n_frames * cfg.frame_skip
-            if split == "training"
-            else cfg.n_frames * cfg.frame_skip * cfg.validation_multiplier
-        )
-        self.frame_skip = cfg.frame_skip
-        self.save_dir = Path(cfg.save_dir)
-        self.save_dir.mkdir(exist_ok=True, parents=True)
-        self.split_dir = self.save_dir / f"{split}"
-
-        self.metadata_path = self.save_dir / "metadata.json"
-
-        self.data_paths = self.get_data_paths(self.split)
-
-        if self.split == 'training':
-            self.metadata = [1200] * len(self.data_paths)  # total 1500 f
-        else:
-            self.metadata = [1] * len(self.data_paths)  # total 1500 f
-        # self.clips_per_video = np.clip(np.array(self.metadata[split]) - self.n_frames + 1, a_min=1, a_max=None).astype(
-        #     np.int32
-        # )
-        self.clips_per_video = np.clip(np.array(self.metadata) - self.n_frames + 1, a_min=1, a_max=None).astype(
-            np.int32
-        )
-        self.cum_clips_per_video = np.cumsum(self.clips_per_video)
-        self.transform = transforms.Resize((self.resolution, self.resolution), antialias=True)
-
-        # shuffle but keep the same order for each epoch, so validation sample is diverse yet deterministic
-        random.seed(0)
-        self.idx_remap = list(range(self.__len__()))
-        random.shuffle(self.idx_remap)
-
-    @abstractmethod
-    def download_dataset(self) -> Sequence[int]:
-        """
-        Download dataset from the internet and build it in save_dir
-
-        Returns a list of video lengths
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_data_paths(self, split):
-        """Return a list of data paths (e.g. xxx.mp4) for a given split"""
-        raise NotImplementedError
-
-    def get_data_lengths(self, split):
-        """Return a list of num_frames for each data path (e.g. xxx.mp4) for a given split"""
-        lengths = []
-        for path in self.get_data_paths(split):
-            length = cv2.VideoCapture(str(path)).get(cv2.CAP_PROP_FRAME_COUNT)
-            lengths.append(length)
-        return lengths
-
-    def split_idx(self, idx):
-        video_idx = np.argmax(self.cum_clips_per_video > idx)
-        frame_idx = idx - np.pad(self.cum_clips_per_video, (1, 0))[video_idx]
-        return video_idx, frame_idx
-
-    @staticmethod
-    def load_video(path: Path):
-        """
-        Load video from a path
-        :param filename: path to the video
-        :return: video as a numpy array
-        """
-
-        cap = cv2.VideoCapture(str(path))
-
-        frames = []
-        while cap.isOpened():
-            ret, frame = cap.read()
-            if ret:
-                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                frames.append(frame)
-            else:
-                break
-
-        cap.release()
-        frames = np.stack(frames, dtype=np.uint8)
-        return np.transpose(frames, (0, 3, 1, 2))  # (T, C, H, W)
-
-    @staticmethod
-    def load_image(filename: Path):
-        """
-        Load image from a path
-        :param filename: path to the image
-        :return: image as a numpy array
-        """
-        image = cv2.imread(str(filename))
-        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-        return np.transpose(image, (2, 0, 1))
-
-    def __len__(self):
-        return self.clips_per_video.sum()
-
-    def __getitem__(self, idx):
-        idx = self.idx_remap[idx]
-        video_idx, frame_idx = self.split_idx(idx)
-        video_path = self.data_paths[video_idx]
-        video = self.load_video(video_path)[frame_idx : frame_idx + self.n_frames]
-
-        pad_len = self.n_frames - len(video)
-
-        nonterminal = np.ones(self.n_frames)
-        if len(video) < self.n_frames:
-            video = np.pad(video, ((0, pad_len), (0, 0), (0, 0), (0, 0)))
-            nonterminal[-pad_len:] = 0
-
-        video = torch.from_numpy(video / 256.0).float()
-        video = self.transform(video)
-
-        if self.external_cond_dim:
-            external_cond = np.load(
-                # pylint: disable=no-member
-                self.condition_dir
-                / f"{video_path.name.replace('.mp4', '.npy')}"
-            )
-            if len(external_cond) < self.n_frames:
-                external_cond = np.pad(external_cond, ((0, pad_len),))
-            external_cond = torch.from_numpy(external_cond).float()
-            return (
-                video[:: self.frame_skip],
-                external_cond[:: self.frame_skip],
-                nonterminal[:: self.frame_skip],
-            )
-        else:
-            return video[:: self.frame_skip], nonterminal[:: self.frame_skip]

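The clip-indexing scheme in BaseVideoDataset is worth spelling out: a video of length L contributes max(L - n_frames + 1, 1) overlapping clips, and split_idx inverts the flat dataset index through a cumulative sum. A standalone check with made-up lengths:

import numpy as np

n_frames = 4
video_lengths = np.array([10, 3, 6])
clips_per_video = np.clip(video_lengths - n_frames + 1, 1, None)  # [7, 1, 3]
cum = np.cumsum(clips_per_video)                                  # [7, 8, 11]

def split_idx(idx):
    video_idx = np.argmax(cum > idx)                  # first video covering idx
    frame_idx = idx - np.pad(cum, (1, 0))[video_idx]  # offset within that video
    return video_idx, frame_idx

assert split_idx(0) == (0, 0)
assert split_idx(6) == (0, 6)    # last clip of the first video
assert split_idx(7) == (1, 0)    # a too-short video still yields one clip
assert split_idx(10) == (2, 2)
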
datasets/video/minecraft_video_dataset.py
DELETED
@@ -1,262 +0,0 @@
-import os
-import io
-import tarfile
-import numpy as np
-import torch
-from typing import Sequence, Mapping
-from omegaconf import DictConfig
-from pytorchvideo.data.encoded_video import EncodedVideo
-import random
-
-from .base_video_dataset import BaseVideoDataset
-
-
-
-
-ACTION_KEYS = [
-    "inventory",
-    "ESC",
-    "hotbar.1",
-    "hotbar.2",
-    "hotbar.3",
-    "hotbar.4",
-    "hotbar.5",
-    "hotbar.6",
-    "hotbar.7",
-    "hotbar.8",
-    "hotbar.9",
-    "forward",
-    "back",
-    "left",
-    "right",
-    "cameraY",
-    "cameraX",
-    "jump",
-    "sneak",
-    "sprint",
-    "swapHands",
-    "attack",
-    "use",
-    "pickItem",
-    "drop",
-]
-
-def convert_action_space(actions):
-    vec_25 = torch.zeros(len(actions), len(ACTION_KEYS))
-    vec_25[actions[:,0]==1, 11] = 1
-    vec_25[actions[:,0]==2, 12] = 1
-    vec_25[actions[:,4]==11, 16] = -1
-    vec_25[actions[:,4]==13, 16] = 1
-    vec_25[actions[:,3]==11, 15] = -1
-    vec_25[actions[:,3]==13, 15] = 1
-    vec_25[actions[:,5]==6, 24] = 1
-    vec_25[actions[:,5]==1, 24] = 1
-    vec_25[actions[:,1]==1, 13] = 1
-    vec_25[actions[:,1]==2, 14] = 1
-    vec_25[actions[:,7]==1, 2] = 1
-    return vec_25
-
-# Dataset class
-class MinecraftVideoDataset(BaseVideoDataset):
-    """
-    Minecraft video dataset for training and validation.
-
-    Args:
-        cfg (DictConfig): Configuration object.
-        split (str): Dataset split ("training" or "validation").
-    """
-    def __init__(self, cfg: DictConfig, split: str = "training"):
-        if split == "test":
-            split = "validation"
-        super().__init__(cfg, split)
-        self.n_frames = cfg.n_frames_valid if split == "validation" and hasattr(cfg, "n_frames_valid") else cfg.n_frames
-        self.use_plucker = cfg.use_plucker
-        self.condition_similar_length = cfg.condition_similar_length
-        self.customized_validation = cfg.customized_validation
-        self.angle_range = cfg.angle_range
-        self.pos_range = cfg.pos_range
-        self.add_frame_timestep_embedder = cfg.add_frame_timestep_embedder
-        self.training_dropout = 0.1
-        self.sample_more_place = getattr(cfg, "sample_more_place", False)
-        self.within_context = getattr(cfg, "within_context", False)
-        self.sample_more_event = getattr(cfg, "sample_more_event", False)
-        self.causal_frame = getattr(cfg, "causal_frame", False)
-
-    def get_data_paths(self, split: str):
-        """
-        Retrieve all video file paths for the given split.
-
-        Args:
-            split (str): Dataset split ("training" or "validation").
-
-        Returns:
-            List[Path]: List of video file paths.
-        """
-        data_dir = self.save_dir / split
-        paths = sorted(list(data_dir.glob("**/*.mp4")), key=lambda x: x.name)
-        if not paths:
-            sub_dirs = os.listdir(data_dir)
-            for sub_dir in sub_dirs:
-                sub_path = data_dir / sub_dir
-                paths += sorted(list(sub_path.glob("**/*.mp4")), key=lambda x: x.name)
-        return paths
-
-    def download_dataset(self):
-        pass
-
-    def __getitem__(self, idx: int):
-        """
-        Retrieve a single data sample by index.
-
-        Args:
-            idx (int): Index of the data sample.
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray]: Video, actions, poses, and timesteps.
-        """
-        max_retries = 1000
-        for _ in range(max_retries):
-            try:
-                return self.load_data(idx)
-            except Exception as e:
-                print(f"Retrying due to error: {e}")
-                idx = (idx + 1) % len(self)
-
-    def load_data(self, idx):
-        idx = self.idx_remap[idx]
-        file_idx, frame_idx = self.split_idx(idx)
-        action_path = self.data_paths[file_idx]
-        video_path = self.data_paths[file_idx]
-
-        action_path = video_path.with_suffix(".npz")
-        actions_pool = np.load(action_path)['actions']
-        poses_pool = np.load(action_path)['poses']
-
-
-        poses_pool[0,1] = poses_pool[1,1] # wrong first in place
-
-        assert poses_pool[:,1].max() - poses_pool[:,1].min() < 2, f"wrong~~~~{poses_pool[:,1].max() - poses_pool[:,1].min()}-{video_path}"
-
-
-        if len(poses_pool) < len(actions_pool):
-            poses_pool = np.pad(poses_pool, ((1, 0), (0, 0)))
-
-        actions_pool = convert_action_space(actions_pool)
-        video_raw = EncodedVideo.from_path(video_path, decode_audio=False)
-
-        frame_idx = frame_idx + 100 # avoid first frames # first frame is useless
-
-        if self.split == "validation":
-            frame_idx = 240
-
-        if self.sample_more_place and self.split == "training":
-            if random.uniform(0, 1) > 0.5:
-                place_mask = (actions_pool[:,24]==1)
-                place_mask[:100] = 0
-                valid_indices = np.where(place_mask)[0]
-                random_index = np.random.choice(valid_indices)
-                frame_idx = random_index - random.randint(1, self.n_frames-1)
-
-        total_frame = video_raw.duration.numerator
-        fps = 10 # video_raw.duration.denominator
-        total_frame = total_frame * fps / video_raw.duration.denominator
-        video = video_raw.get_clip(start_sec=frame_idx/fps, end_sec=(frame_idx+self.n_frames)/fps)["video"]
-        video = video.permute(1, 2, 3, 0).numpy()
-
-        if self.split != "validation" and 'degrees' in np.load(action_path).keys():
-            degrees = np.load(action_path)['degrees']
-            actions_pool[:,16] *= degrees
-
-        actions = np.copy(actions_pool[frame_idx : frame_idx + self.n_frames])
-
-        poses = np.copy(poses_pool[frame_idx : frame_idx + self.n_frames])
-        pad_len = self.n_frames - len(video)
-        poses_pool[:,:3] -= poses[:1,:3]
-        poses_pool[:,-1] = -poses_pool[:,-1]
-        poses_pool[:,3:] %= 360
-
-        poses[:,:3] -= poses[:1,:3] # do not normalize angle
-        poses[:,-1] = -poses[:,-1]
-        poses[:,3:] %= 360
-
-        assert len(video) >= self.n_frames, f"{video_path}"
-
-        if self.split == "training" and self.condition_similar_length>0:
-            if random.uniform(0, 1) > self.training_dropout:
-                refer_frame_dis = poses[:,None] - poses_pool[None,:]
-                refer_frame_dis = np.abs(refer_frame_dis)
-                refer_frame_dis[...,3:][refer_frame_dis[...,3:] > 180] = 360 - refer_frame_dis[...,3:][refer_frame_dis[...,3:] > 180]
-                valid_index = ((((refer_frame_dis[..., :3] <= self.pos_range).sum(-1))>=3) & (((refer_frame_dis[..., 3:] <= self.angle_range).sum(-1))>=2) & \
-                    ((((refer_frame_dis[..., :3] > 0).sum(-1))>=1) | (((refer_frame_dis[..., 3:] > 0).sum(-1))>=1))
-                ).sum(0)
-                valid_index[:100] = 0 # mute bad initial scene
-
-                if self.add_frame_timestep_embedder and self.causal_frame and (actions_pool[:frame_idx,24]==1).sum() > 0:
-                    valid_index[frame_idx:] = 0
-
-                mask = valid_index >= 1
-                mask[0] = False
-                candidate_indices = np.argwhere(mask)
-
-                mask2 = valid_index >= 0
-                mask2[0] = False
-
-                count = min(self.condition_similar_length, candidate_indices.shape[0])
-                selected_indices = candidate_indices[np.random.choice(candidate_indices.shape[0], count, replace=True)][:,0]
-
-                if count < self.condition_similar_length:
-                    candidate_indices2 = np.argwhere(mask2)
-                    selected_indices2 = candidate_indices2[np.random.choice(candidate_indices2.shape[0], self.condition_similar_length-count, replace=True)][:,0]
-                    selected_indices = np.concatenate([selected_indices, selected_indices2])
-
-                if self.sample_more_event:
-                    if random.uniform(0, 1) > 0.3:
-                        valid_idx = torch.nonzero(actions_pool[:frame_idx,24]==1)[:,0]
-                        if len(valid_idx) > self.condition_similar_length //2:
-                            valid_idx = valid_idx[-self.condition_similar_length //2:]
-
-                        if len(valid_idx) > 0:
-                            selected_indices[-len(valid_idx):] = valid_idx + 4
-
-            else:
-                selected_indices = np.array(list(range(self.condition_similar_length))) * 0 + random.randint(0, frame_idx)
-
-            video_pool = []
-            for si in selected_indices:
-                video_pool.append(video_raw.get_clip(start_sec=si/fps, end_sec=(si+1)/fps)["video"][:,0].permute(1,2,0))
-
-            video_pool = np.stack(video_pool)
-            video = np.concatenate([video, video_pool])
-            actions = np.concatenate([actions, actions_pool[selected_indices]])
-            poses = np.concatenate([poses, poses_pool[selected_indices]])
-
-            timestep = np.concatenate([np.array(list(range(frame_idx, frame_idx + self.n_frames))), selected_indices])
-
-        else:
-            timestep = np.array(list(range(self.n_frames)))
-
-        video = torch.from_numpy(video / 255.0).float().permute(0, 3, 1, 2).contiguous()
-
-        if self.split == "validation" and not self.customized_validation:
-            num_frame = actions.shape[0]
-
-            actions[:] = 0
-            actions[:,16] = 1
-            poses[:] = 0
-            for ff in range(1, num_frame):
-                poses[ff,4] = poses[ff-1,4] + actions[ff,16] * -15
-
-            if self.within_context:
-                actions[:] = 0
-                actions[:self.n_frames//2+1,16] = 1
-                actions[self.n_frames//2+1:,16] = -1
-                poses[:] = 0
-                for ff in range(1, num_frame):
-                    poses[ff,4] = poses[ff-1,4] + actions[ff,16] * -15
-
-        return (
-            video[:: self.frame_skip],
-            actions[:: self.frame_skip],
-            poses[:: self.frame_skip],
-            timestep
-        )

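To make the action encoding above concrete: convert_action_space maps the dataset's compact integer action format into the 25-dimensional layout indexed by ACTION_KEYS (11 = forward, 12 = back, 15 = cameraY, 16 = cameraX, 24 = drop). A small usage sketch, assuming convert_action_space is in scope; the input rows below are made up:

import numpy as np

raw = np.zeros((3, 8), dtype=np.int64)
raw[0, 0] = 1   # column 0 == 1  -> forward     -> vec_25[0, 11] = 1
raw[1, 4] = 13  # column 4 == 13 -> cameraX +1  -> vec_25[1, 16] = 1
raw[2, 5] = 6   # column 5 == 6  -> drop slot   -> vec_25[2, 24] = 1

vec_25 = convert_action_space(raw)
assert vec_25.shape == (3, 25)
assert vec_25[0, 11] == 1 and vec_25[1, 16] == 1 and vec_25[2, 24] == 1
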
datasets/video/minecraft_video_dataset_oasis_filter.py
DELETED
@@ -1,99 +0,0 @@
-import torch
-from typing import Sequence
-import numpy as np
-import io
-from omegaconf import DictConfig
-from tqdm import tqdm
-
-from typing import Mapping, Sequence
-import os
-import math
-from packaging import version as pver
-from PIL import Image
-import random
-import shutil
-import os
-from pathlib import Path
-import traceback
-
-class OASISMinecraftVideoFilterDataset(torch.utils.data.Dataset):
-    """
-    Minecraft dataset
-    """
-
-    def __init__(self, source_dir, target_dir, split):
-        self.source_dir = Path(source_dir)
-        self.split_dir = self.source_dir / f"{split}"
-        self.data_paths = self.get_data_paths(split)
-        self.target_dir = Path(target_dir) / f"{split}"
-        self.target_dir.mkdir(exist_ok=True, parents=True)
-
-    def get_data_paths(self, split):
-        data_dir = self.source_dir / split
-        paths = sorted(list(data_dir.glob("**/*.mp4")), key=lambda x: x.name)
-
-        if len(paths) == 0:
-            sub_path = os.listdir(data_dir)
-            for sp in sub_path:
-                data_dir = self.source_dir / split / sp
-                paths = paths+sorted(list(data_dir.glob("**/*.mp4")), key=lambda x: x.name)
-        return paths
-
-    def __len__(self):
-        return len(self.data_paths)
-
-    def __getitem__(self, idx):
-
-        return self.sub_get(idx)
-        # try:
-        #     return self.sub_get(idx)
-        # except Exception as e:
-        #     traceback.print_exc()
-        #     # return self.sub_get(0)
-
-
-    def sub_get(self, idx):
-        action_path = self.data_paths[idx]
-        video_path = self.data_paths[idx]
-
-        action_path = video_path.with_suffix(".npz")
-        actions_pool = np.load(action_path)['actions']
-        poses_pool = np.load(action_path)['poses']
-
-        poses_pool[0,1] = poses_pool[1,1] # wrong first in place
-
-        print(poses_pool.shape)
-
-        if poses_pool[:,1].max() - poses_pool[:,1].min() < 2:
-            target_action_path = self.target_dir / action_path.parent.name / action_path.name
-            target_video_path = self.target_dir / video_path.parent.name / video_path.name
-            target_action_path.parent.mkdir(exist_ok=True, parents=True)
-            target_video_path.parent.mkdir(exist_ok=True, parents=True)
-
-            try:
-                shutil.copy2(action_path, target_action_path)
-                shutil.copy2(video_path, target_video_path)
-            except:
-                import pdb;pdb.set_trace()
-
-        return poses_pool[:10]
-
-
-
-if __name__ == "__main__":
-    import torch
-    from unittest.mock import MagicMock
-    import tqdm
-
-    cfg = MagicMock()
-    cfg.resolution = 64
-    cfg.external_cond_dim = 0
-    cfg.n_frames = 64
-    cfg.save_dir = "data/minecraft"
-    cfg.validation_multiplier = 1
-
-    dataset = MinecraftVideoDataset(cfg, "training")
-    dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=16)
-
-    for batch in tqdm.tqdm(dataloader):
-        pass

datasets/video/minecraft_video_dataset_pose.py
DELETED
@@ -1,421 +0,0 @@
-import torch
-from typing import Sequence
-import numpy as np
-import io
-import tarfile
-from pytorchvideo.data.encoded_video import EncodedVideo
-from omegaconf import DictConfig
-from tqdm import tqdm
-
-from .base_video_dataset import BaseVideoDataset
-from typing import Mapping, Sequence
-import os
-import math
-from packaging import version as pver
-from PIL import Image
-import random
-
-def euler_to_rotation_matrix(pitch, yaw):
-    """
-    Convert euler angles (pitch, yaw) to a 3x3 rotation matrix.
-    pitch: rotation around x-axis (in radians)
-    yaw: rotation around y-axis (in radians)
-    """
-    # Rotation matrix around x-axis (pitch)
-    R_x = np.array([
-        [1, 0, 0],
-        [0, math.cos(pitch), -math.sin(pitch)],
-        [0, math.sin(pitch), math.cos(pitch)]
-    ])
-
-    # Rotation matrix around y-axis (yaw)
-    R_y = np.array([
-        [math.cos(yaw), 0, math.sin(yaw)],
-        [0, 1, 0],
-        [-math.sin(yaw), 0, math.cos(yaw)]
-    ])
-
-    # Combined rotation matrix
-    R = np.dot(R_y, R_x)
-    return R
-
-def custom_meshgrid(*args):
-    # ref: https://pytorch.org/docs/stable/generated/torch.meshgrid.html?highlight=meshgrid#torch.meshgrid
-    if pver.parse(torch.__version__) < pver.parse('1.10'):
-        return torch.meshgrid(*args)
-    else:
-        return torch.meshgrid(*args, indexing='ij')
-
-def camera_to_world_to_world_to_camera(camera_to_world):
-    """
-    Convert Camera-to-World matrix to World-to-Camera matrix by inverting the transformation.
-    """
-    # Extract rotation (R) and translation (T)
-    R = camera_to_world[:3, :3]
-    T = camera_to_world[:3, 3]
-
-    # Calculate World-to-Camera (inverse) matrix
-    world_to_camera = np.eye(4)
-
-    # The rotation part of World-to-Camera is the transpose of Camera-to-World's rotation
-    world_to_camera[:3, :3] = R.T
-
-    # The translation part is the negative of the rotated translation
-    world_to_camera[:3, 3] = -np.dot(R.T, T)
-
-    return world_to_camera
-
-def euler_to_camera_to_world_matrix(pose):
-
-    x, y, z, pitch, yaw = pose
-    # Convert pitch and yaw to radians
-    pitch = math.radians(pitch)
-    yaw = math.radians(yaw)
-
-    # Get the rotation matrix from Euler angles
-    R = euler_to_rotation_matrix(pitch, yaw)
-
-    # Create the 4x4 transformation matrix (rotation + translation)
-    camera_to_world = np.eye(4)
-
-    # Set the rotation part (upper 3x3)
-    camera_to_world[:3, :3] = R
-
-    # Set the translation part (last column)
-    camera_to_world[:3, 3] = [x, y, z]
-
-    return camera_to_world
-
-def tensor_to_gif(tensor, output_path, fps=10):
-    """
-    Converts a PyTorch tensor of shape (F, 3, H, W) to a GIF.
-
-    Args:
-        tensor (torch.Tensor): Input tensor of shape (F, 3, H, W) with values in range [0, 1] or [0, 255].
-        output_path (str): Path to save the output GIF.
-        fps (int): Frames per second for the GIF.
-    """
-    # Ensure the tensor is in [0, 255] range
-    if tensor.max() <= 1.0:
-        tensor = (tensor * 255).byte()
-    else:
-        tensor = tensor.byte()
-
-    # Convert tensor to numpy array and rearrange to (F, H, W, 3)
-    frames = tensor.permute(0, 2, 3, 1).cpu().numpy()
-
-    # Convert frames to PIL Images
-    pil_frames = [Image.fromarray(frame) for frame in frames]
-
-    # Save as GIF
-    pil_frames[0].save(
-        output_path,
-        save_all=True,
-        append_images=pil_frames[1:],
-        duration=int(1000 / fps),
-        loop=0
-    )
-
-def get_relative_pose(cam_params, zero_first_frame_scale):
-    abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
-    abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
-    source_cam_c2w = abs_c2ws[0]
-    if zero_first_frame_scale:
-        cam_to_origin = 0
-    else:
-        cam_to_origin = np.linalg.norm(source_cam_c2w[:3, 3])
-    target_cam_c2w = np.array([
-        [1, 0, 0, 0],
-        [0, 1, 0, -cam_to_origin],
-        [0, 0, 1, 0],
-        [0, 0, 0, 1]
-    ])
-    abs2rel = target_cam_c2w @ abs_w2cs[0]
-    ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
-    ret_poses = np.array(ret_poses, dtype=np.float32)
-    return ret_poses
-
-def ray_condition(K, c2w, H, W, device):
-    # c2w: B, V, 4, 4
-    # K: B, V, 4
-
-    B = K.shape[0]
-
-    j, i = custom_meshgrid(
-        torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
-        torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
-    )
-    i = i.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, HxW]
-    j = j.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, HxW]
-
-    fx, fy, cx, cy = K.chunk(4, dim=-1)  # B,V, 1
-
-    zs = torch.ones_like(i)  # [B, HxW]
-    xs = (i - cx) / fx * zs
-    ys = (j - cy) / fy * zs
-    zs = zs.expand_as(ys)
-
-    directions = torch.stack((xs, ys, zs), dim=-1)  # B, V, HW, 3
-    directions = directions / directions.norm(dim=-1, keepdim=True)  # B, V, HW, 3
-
-    rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2)  # B, V, 3, HW
-    rays_o = c2w[..., :3, 3]  # B, V, 3
-    rays_o = rays_o[:, :, None].expand_as(rays_d)  # B, V, 3, HW
-    # c2w @ dirctions
-    rays_dxo = torch.linalg.cross(rays_o, rays_d)
-    plucker = torch.cat([rays_dxo, rays_d], dim=-1)
-    plucker = plucker.reshape(B, c2w.shape[1], H, W, 6)  # B, V, H, W, 6
-
-    return plucker
-
-class Camera(object):
-    def __init__(self, entry, focal_length=0.35):
-        self.fx = focal_length  # 0.35 correspond to 110 fov
-        self.fy = focal_length*640/360
-        self.cx = 0.5
-        self.cy = 0.5
-        self.c2w_mat = euler_to_camera_to_world_matrix(entry)
-        self.w2c_mat = camera_to_world_to_world_to_camera(np.copy(self.c2w_mat))
-
-
-ACTION_KEYS = [
-    "inventory",
-    "ESC",
-    "hotbar.1",
-    "hotbar.2",
-    "hotbar.3",
-    "hotbar.4",
-    "hotbar.5",
-    "hotbar.6",
-    "hotbar.7",
-    "hotbar.8",
-    "hotbar.9",
-    "forward",
-    "back",
-    "left",
-    "right",
-    "cameraY",
-    "cameraX",
-    "jump",
-    "sneak",
-    "sprint",
-    "swapHands",
-    "attack",
-    "use",
-    "pickItem",
-    "drop",
-]
-
-def one_hot_actions(actions: Sequence[Mapping[str, int]]) -> torch.Tensor:
-    actions_one_hot = torch.zeros(len(actions), len(ACTION_KEYS))
-    for i, current_actions in enumerate(actions):
-        for j, action_key in enumerate(ACTION_KEYS):
-            if action_key.startswith("camera"):
-                if action_key == "cameraX":
-                    value = current_actions["camera"][0]
-                elif action_key == "cameraY":
-                    value = current_actions["camera"][1]
-                else:
-                    raise ValueError(f"Unknown camera action key: {action_key}")
-                max_val = 20
-                bin_size = 0.5
-                num_buckets = int(max_val / bin_size)
-                value = (value - num_buckets) / num_buckets
-                assert -1 - 1e-3 <= value <= 1 + 1e-3, f"Camera action value must be in [-1, 1], got {value}"
-            else:
-                value = current_actions[action_key]
-                assert 0 <= value <= 1, f"Action value must be in [0, 1] got {value}"
-            actions_one_hot[i, j] = value
-
-    return actions_one_hot
-
-def simpletomulti(actions):
-    vec_25 = torch.zeros(len(actions), len(ACTION_KEYS))
-    vec_25[actions==1, 11] = 1
-    vec_25[actions==2, 16] = -1
-    vec_25[actions==3, 16] = 1
-    vec_25[actions==4, 15] = -1
-    vec_25[actions==5, 15] = 1
-    return vec_25
-
-def simpletomulti2(actions):
-    vec_25 = torch.zeros(len(actions), len(ACTION_KEYS))
-    vec_25[actions[:,0]==1, 11] = 1
-    vec_25[actions[:,0]==2, 12] = 1
-    vec_25[actions[:,4]==11, 16] = -1
-    vec_25[actions[:,4]==13, 16] = 1
-    vec_25[actions[:,3]==11, 15] = -1
-    vec_25[actions[:,3]==13, 15] = 1
-    vec_25[actions[:,5]==6, 24] = 1
-    vec_25[actions[:,5]==1, 24] = 1
-    vec_25[actions[:,1]==1, 13] = 1
-    vec_25[actions[:,1]==2, 14] = 1
-    vec_25[actions[:,7]==1, 2] = 1
-    return vec_25
-
-class MinecraftVideoPoseDataset(BaseVideoDataset):
-    """
-    Minecraft dataset
-    """
-
-    def __init__(self, cfg: DictConfig, split: str = "training"):
-        if split == "test":
-            split = "validation"
-        super().__init__(cfg, split)
-
-        if hasattr(cfg, "n_frames_valid") and split == "validation":
-            self.n_frames = cfg.n_frames_valid
-
-    def get_data_paths(self, split):
-        data_dir = self.save_dir / split
-        paths = sorted(list(data_dir.glob("**/*.mp4")), key=lambda x: x.name)
-
-        if len(paths) == 0:
-            sub_path = os.listdir(data_dir)
-            for sp in sub_path:
-                data_dir = self.save_dir / split / sp
-                paths = paths+sorted(list(data_dir.glob("**/*.mp4")), key=lambda x: x.name)
-        return paths
-
-    def get_data_lengths(self, split):
-        lengths = [300] * len(self.get_data_paths(split))
-        return lengths
-
-    def download_dataset(self) -> Sequence[int]:
-        from internetarchive import download
-
-        part_suffixes = [
-            "aa",
-            "ab",
-            "ac",
-            "ad",
-            "ae",
-            "af",
-            "ag",
-            "ah",
-            "ai",
-            "aj",
-            "ak",
-        ]
-        for part_suffix in part_suffixes:
-            identifier = f"minecraft_marsh_dataset_{part_suffix}"
-            file_name = f"minecraft.tar.part{part_suffix}"
-            download(identifier, file_name, destdir=self.save_dir, verbose=True)
-
-        combined_bytes = io.BytesIO()
-        for part_suffix in part_suffixes:
-            identifier = f"minecraft_marsh_dataset_{part_suffix}"
-            file_name = f"minecraft.tar.part{part_suffix}"
-            part_file = self.save_dir / identifier / file_name
-            with open(part_file, "rb") as part:
-                combined_bytes.write(part.read())
-        combined_bytes.seek(0)
-        with tarfile.open(fileobj=combined_bytes, mode="r") as combined_archive:
-            combined_archive.extractall(self.save_dir)
-        (self.save_dir / "minecraft/test").rename(self.save_dir / "validation")
-        (self.save_dir / "minecraft/train").rename(self.save_dir / "training")
-        (self.save_dir / "minecraft").rmdir()
-        for part_suffix in part_suffixes:
-            identifier = f"minecraft_marsh_dataset_{part_suffix}"
-            file_name = f"minecraft.tar.part{part_suffix}"
-            part_file = self.save_dir / identifier / file_name
-            part_file.rmdir()
-
-    def __getitem__(self, idx):
-        # return self.load_data(idx)
-
-        max_retries = 1000
-        for mr in range(max_retries):
-            try:
-                return self.load_data(idx)
-            except Exception as e:
-                print(f"{mr} Error: {e}")
-                # idx = self.idx_remap[idx]
-                # file_idx, frame_idx = self.split_idx(idx)
-                # video_path = self.data_paths[file_idx]
-                # os.remove(video_path)
-                idx = (idx + 1) % self.__len__()
-
-    def load_data(self, idx):
-        idx = self.idx_remap[idx]
-        file_idx, frame_idx = self.split_idx(idx)
-        action_path = self.data_paths[file_idx]
-        video_path = self.data_paths[file_idx]
-
-        action_path = video_path.with_suffix(".npz")
-        actions_pool = np.load(action_path)['actions']
-        poses_pool = np.load(action_path)['poses']
-
-        poses_pool[0,1] = poses_pool[1,1] # wrong first in place
-
-        assert poses_pool[:,1].max() - poses_pool[:,1].min() < 2, f"wrong~~~~{poses_pool[:,1].max() - poses_pool[:,1].min()}-{video_path}"
-
-        if len(poses_pool) < len(actions_pool):
-            poses_pool = np.pad(poses_pool, ((1, 0), (0, 0)))
-
-        actions_pool = simpletomulti2(actions_pool)
-        video_raw = EncodedVideo.from_path(video_path, decode_audio=False)
-
-        frame_idx = frame_idx + 100 # avoid first frames # first frame is useless
-
-        if self.split == "validation":
-            frame_idx = 240
-
-        total_frame = video_raw.duration.numerator
-        fps = 10 # video_raw.duration.denominator
-        total_frame = total_frame * fps / video_raw.duration.denominator
-        video = video_raw.get_clip(start_sec=frame_idx/fps, end_sec=(frame_idx+self.n_frames)/fps)["video"]
video = video_raw.get_clip(start_sec=frame_idx/fps, end_sec=(frame_idx+self.n_frames)/fps)["video"]
|
| 368 |
-
|
| 369 |
-
video = video.permute(1, 2, 3, 0).numpy()
|
| 370 |
-
|
| 371 |
-
if self.split != "validation" and 'degrees' in np.load(action_path).keys():
|
| 372 |
-
degrees = np.load(action_path)['degrees']
|
| 373 |
-
actions_pool[:,16] *= degrees
|
| 374 |
-
|
| 375 |
-
actions = np.copy(actions_pool[frame_idx : frame_idx + self.n_frames]) # (t, )
|
| 376 |
-
|
| 377 |
-
poses = np.copy(poses_pool[frame_idx : frame_idx + self.n_frames])
|
| 378 |
-
pad_len = self.n_frames - len(video)
|
| 379 |
-
poses_pool[:,:3] -= poses[:1,:3]
|
| 380 |
-
# poses_pool[:,3:] = -poses_pool[:,3:]
|
| 381 |
-
poses_pool[:,-1] = -poses_pool[:,-1]
|
| 382 |
-
poses_pool[:,3:] %= 360
|
| 383 |
-
|
| 384 |
-
poses[:,:3] -= poses[:1,:3] # do not normalize angle
|
| 385 |
-
# poses[:,3:] = -poses[:,3:]
|
| 386 |
-
poses[:,-1] = -poses[:,-1]
|
| 387 |
-
poses[:,3:] %= 360
|
| 388 |
-
|
| 389 |
-
nonterminal = np.ones(self.n_frames)
|
| 390 |
-
if len(video) < self.n_frames:
|
| 391 |
-
video = np.pad(video, ((0, pad_len), (0, 0), (0, 0), (0, 0)))
|
| 392 |
-
actions = np.pad(actions, ((0, pad_len),))
|
| 393 |
-
poses = np.pad(actions, ((0, pad_len),))
|
| 394 |
-
nonterminal[-pad_len:] = 0
|
| 395 |
-
|
| 396 |
-
video = torch.from_numpy(video / 255.0).float().permute(0, 3, 1, 2).contiguous()
|
| 397 |
-
|
| 398 |
-
return (
|
| 399 |
-
video[:: self.frame_skip],
|
| 400 |
-
actions[:: self.frame_skip],
|
| 401 |
-
poses[:: self.frame_skip]
|
| 402 |
-
)
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
if __name__ == "__main__":
|
| 406 |
-
import torch
|
| 407 |
-
from unittest.mock import MagicMock
|
| 408 |
-
import tqdm
|
| 409 |
-
|
| 410 |
-
cfg = MagicMock()
|
| 411 |
-
cfg.resolution = 64
|
| 412 |
-
cfg.external_cond_dim = 0
|
| 413 |
-
cfg.n_frames = 64
|
| 414 |
-
cfg.save_dir = "data/minecraft"
|
| 415 |
-
cfg.validation_multiplier = 1
|
| 416 |
-
|
| 417 |
-
dataset = MinecraftVideoDataset(cfg, "training")
|
| 418 |
-
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=16)
|
| 419 |
-
|
| 420 |
-
for batch in tqdm.tqdm(dataloader):
|
| 421 |
-
pass
|
experiments/README.md
DELETED
@@ -1,19 +0,0 @@
-# experiments
-
-The `experiments` folder contains the code of experiments. Each file in the experiment folder represents a certain type of
-benchmark specific to a project. Such an experiment can be instantiated with a certain dataset and a certain algorithm.
-
-You should create a new `.py` file for your experiment,
-inheriting from any suitable base classes in `experiments/exp_base.py`,
-and then register your new experiment in `experiments/__init__.py`.
-
-You run an experiment by running `python -m main [options]` in the root directory of the
-project. You should not log any data in this folder, but store it under `outputs` under the root project
-directory.
-
-This folder is only intended to contain formal experiments. For debug code and unit tests, put them under the `debug` folder.
-For scripts that are not meant to be an experiment, please use the `scripts` folder.
-
----
-
-This repo is forked from [Boyuan Chen](https://boyuan.space/)'s research template [repo](https://github.com/buoyancy99/research-template). By its MIT license, you must keep the above sentence in `README.md` and the `LICENSE` file to credit the author.
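The subclass-register-build workflow this README describes can be summarized in a self-contained sketch; `MyExperiment` and `exp_my` are illustrative names, not part of the repo:

```python
# Self-contained sketch of the deleted README's convention: subclass a base
# experiment, register it under the same key as its yaml file, then build
# it from the registry. All names here are illustrative.
class BaseExperiment:
    def __init__(self, cfg):
        self.cfg = cfg

class MyExperiment(BaseExperiment):  # would live in experiments/exp_my.py
    pass

exp_registry = dict(exp_my=MyExperiment)  # key matches configurations/experiment/exp_my.yaml

def build_experiment(name, cfg):
    if name not in exp_registry:
        raise ValueError(f"Experiment {name} not found in registry {list(exp_registry)}.")
    return exp_registry[name](cfg)

experiment = build_experiment("exp_my", cfg={"tasks": ["training"]})
```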
experiments/__init__.py
DELETED
@@ -1,35 +0,0 @@
-from typing import Optional, Union
-from omegaconf import DictConfig
-import pathlib
-from lightning.pytorch.loggers.wandb import WandbLogger
-
-from .exp_base import BaseExperiment
-from .exp_video import VideoPredictionExperiment
-from .exp_pose import PoseExperiment
-
-# each key has to be a yaml file under '[project_root]/configurations/experiment' without .yaml suffix
-exp_registry = dict(
-    exp_video=VideoPredictionExperiment,
-    exp_pose=PoseExperiment
-)
-
-
-def build_experiment(
-    cfg: DictConfig,
-    logger: Optional[WandbLogger] = None,
-    ckpt_path: Optional[Union[str, pathlib.Path]] = None,
-) -> BaseExperiment:
-    """
-    Build an experiment instance based on registry
-    :param cfg: configuration file
-    :param logger: optional logger for the experiment
-    :param ckpt_path: optional checkpoint path for saving and loading
-    :return:
-    """
-    if cfg.experiment._name not in exp_registry:
-        raise ValueError(
-            f"Experiment {cfg.experiment._name} not found in registry {list(exp_registry.keys())}. "
-            "Make sure you register it correctly in 'experiments/__init__.py' under the same name as yaml file."
-        )
-
-    return exp_registry[cfg.experiment._name](cfg, logger, ckpt_path)
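For context, the deleted `main.py` further below consumes this factory directly: it calls `experiment = build_experiment(cfg, logger, checkpoint_path)` and then runs `experiment.exec_task(task)` for each task listed in `cfg.experiment.tasks`.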
experiments/exp_base.py
DELETED
@@ -1,473 +0,0 @@
-"""
-This repo is forked from [Boyuan Chen](https://boyuan.space/)'s research
-template [repo](https://github.com/buoyancy99/research-template).
-By its MIT license, you must keep the above sentence in `README.md`
-and the `LICENSE` file to credit the author.
-"""
-
-from abc import ABC, abstractmethod
-from typing import Optional, Union, Literal, List, Dict
-import pathlib
-import os
-
-import hydra
-import torch
-from lightning.pytorch.strategies.ddp import DDPStrategy
-
-import lightning.pytorch as pl
-from lightning.pytorch.loggers.wandb import WandbLogger
-from lightning.pytorch.utilities.types import TRAIN_DATALOADERS
-from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint
-from pytorch_lightning.utilities import rank_zero_info
-
-from omegaconf import DictConfig
-
-from utils.print_utils import cyan
-from utils.distributed_utils import is_rank_zero
-from safetensors.torch import load_model
-from pathlib import Path
-from huggingface_hub import hf_hub_download
-
-torch.set_float32_matmul_precision("high")
-
-def load_custom_checkpoint(algo, optimizer, checkpoint_path):
-    if not checkpoint_path:
-        rank_zero_info("No checkpoint path provided, skipping checkpoint loading.")
-        return None
-
-    if not isinstance(checkpoint_path, Path):
-        checkpoint_path = Path(checkpoint_path)
-
-    if "yslan" in str(checkpoint_path):
-        hf_ckpt = str(checkpoint_path).split('/')
-        repo_id = '/'.join(hf_ckpt[:2])
-        file_name = '/'.join(hf_ckpt[2:])
-        model_path = hf_hub_download(repo_id=repo_id,
-                                     filename=file_name)
-        ckpt = torch.load(model_path, map_location=torch.device('cpu'))
-        algo.load_state_dict(ckpt['state_dict'], strict=False)
-
-    elif checkpoint_path.suffix == ".pt":
-        ckpt = torch.load(checkpoint_path, weights_only=True)
-        algo.load_state_dict(ckpt, strict=False)
-    elif checkpoint_path.suffix == ".ckpt":
-        ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu'))
-        algo.load_state_dict(ckpt['state_dict'], strict=False)
-    elif checkpoint_path.suffix == ".safetensors":
-        load_model(algo, checkpoint_path, strict=False)
-    elif os.path.isdir(checkpoint_path):
-        ckpt_files = [f for f in os.listdir(checkpoint_path) if f.endswith('.ckpt')]
-        if not ckpt_files:
-            raise FileNotFoundError("No .ckpt file found in the specified folder!")
-        selected_ckpt = max(ckpt_files)
-        selected_ckpt_path = os.path.join(checkpoint_path, selected_ckpt)
-        print(f"Loading checkpoint file: {selected_ckpt_path}")
-
-        ckpt = torch.load(selected_ckpt_path, map_location=torch.device('cpu'))
-        algo.load_state_dict(ckpt['state_dict'], strict=False)
-
-    rank_zero_info("Model weights loaded.")
-
-class BaseExperiment(ABC):
-    """
-    Abstract class for an experiment. This generalizes the pytorch lightning Trainer & lightning Module to more
-    flexible experiments that don't fit in the typical ml loop, e.g. multi-stage reinforcement learning benchmarks.
-    """
-
-    # each key has to be a yaml file under '[project_root]/configurations/algorithm' without .yaml suffix
-    compatible_algorithms: Dict = NotImplementedError
-
-    def __init__(
-        self,
-        root_cfg: DictConfig,
-        logger: Optional[WandbLogger] = None,
-        ckpt_path: Optional[Union[str, pathlib.Path]] = None,
-    ) -> None:
-        """
-        Constructor
-
-        Args:
-            cfg: configuration file that contains everything about the experiment
-            logger: a pytorch-lightning WandbLogger instance
-            ckpt_path: an optional path to saved checkpoint
-        """
-        super().__init__()
-        self.root_cfg = root_cfg
-        self.cfg = root_cfg.experiment
-        self.debug = root_cfg.debug
-        self.logger = logger
-        self.ckpt_path = ckpt_path
-        self.algo = None
-        self.customized_load = self.cfg.customized_load
-        self.load_vae = self.cfg.load_vae
-        self.load_t_to_r = self.cfg.load_t_to_r
-        self.zero_init_gate = self.cfg.zero_init_gate
-        self.only_tune_refer = self.cfg.only_tune_refer
-        self.diffusion_path = self.cfg.diffusion_path
-        self.vae_path = self.cfg.vae_path  # "/mnt/xiaozeqi/.cache/huggingface/hub/models--Etched--oasis-500m/snapshots/4ca7d2d811f4f0c6fd1d5719bf83f14af3446c0c/vit-l-20.safetensors"
-        self.pose_predictor_path = self.cfg.pose_predictor_path  # "/mnt/xiaozeqi/diffusionforcing/outputs/2025-03-28/16-45-11/checkpoints/epoch0step595000.ckpt"
-
-    def _build_algo(self):
-        """
-        Build the lightning module
-        :return: a pytorch-lightning module to be launched
-        """
-        algo_name = self.root_cfg.algorithm._name
-        if algo_name not in self.compatible_algorithms:
-            raise ValueError(
-                f"Algorithm {algo_name} not found in compatible_algorithms for this Experiment class. "
-                "Make sure you define compatible_algorithms correctly and make sure that each key has "
-                "same name as yaml file under '[project_root]/configurations/algorithm' without .yaml suffix"
-            )
-        return self.compatible_algorithms[algo_name](self.root_cfg.algorithm)
-
-    def exec_task(self, task: str) -> None:
-        """
-        Executing a certain task specified by string. Each task should be a stage of experiment.
-        In most computer vision / nlp applications, tasks should be just train and test.
-        In reinforcement learning, you might have more stages such as collecting dataset etc
-
-        Args:
-            task: a string specifying a task implemented for this experiment
-        """
-        if hasattr(self, task) and callable(getattr(self, task)):
-            if is_rank_zero:
-                print(cyan("Executing task:"), f"{task} out of {self.cfg.tasks}")
-            getattr(self, task)()
-        else:
-            raise ValueError(
-                f"Specified task '{task}' not defined for class {self.__class__.__name__} or is not callable."
-            )
-
-    def exec_interactive(self, task: str) -> None:
-        """
-        Executing a certain task specified by string. Each task should be a stage of experiment.
-        In most computer vision / nlp applications, tasks should be just train and test.
-        In reinforcement learning, you might have more stages such as collecting dataset etc
-
-        Args:
-            task: a string specifying a task implemented for this experiment
-        """
-        if hasattr(self, task) and callable(getattr(self, task)):
-            if is_rank_zero:
-                print(cyan("Executing task:"), f"{task} out of {self.cfg.tasks}")
-            return getattr(self, task)()
-        else:
-            raise ValueError(
-                f"Specified task '{task}' not defined for class {self.__class__.__name__} or is not callable."
-            )
-
-class BaseLightningExperiment(BaseExperiment):
-    """
-    Abstract class for pytorch lightning experiments. Useful for computer vision & nlp where main components are
-    simply models, datasets and train loop.
-    """
-
-    # each key has to be a yaml file under '[project_root]/configurations/algorithm' without .yaml suffix
-    compatible_algorithms: Dict = NotImplementedError
-
-    # each key has to be a yaml file under '[project_root]/configurations/dataset' without .yaml suffix
-    compatible_datasets: Dict = NotImplementedError
-
-    def _build_trainer_callbacks(self):
-        callbacks = []
-        if self.logger:
-            callbacks.append(LearningRateMonitor("step", True))
-
-    def _build_training_loader(self) -> Optional[Union[TRAIN_DATALOADERS, pl.LightningDataModule]]:
-        train_dataset = self._build_dataset("training")
-        shuffle = (
-            False if isinstance(train_dataset, torch.utils.data.IterableDataset) else self.cfg.training.data.shuffle
-        )
-        if train_dataset:
-            return torch.utils.data.DataLoader(
-                train_dataset,
-                batch_size=self.cfg.training.batch_size,
-                num_workers=min(os.cpu_count(), self.cfg.training.data.num_workers),
-                shuffle=shuffle,
-                persistent_workers=True,
-            )
-        else:
-            return None
-
-    def _build_validation_loader(self) -> Optional[Union[TRAIN_DATALOADERS, pl.LightningDataModule]]:
-        validation_dataset = self._build_dataset("validation")
-        shuffle = (
-            False
-            if isinstance(validation_dataset, torch.utils.data.IterableDataset)
-            else self.cfg.validation.data.shuffle
-        )
-        if validation_dataset:
-            return torch.utils.data.DataLoader(
-                validation_dataset,
-                batch_size=self.cfg.validation.batch_size,
-                num_workers=min(os.cpu_count(), self.cfg.validation.data.num_workers),
-                shuffle=shuffle,
-                persistent_workers=True,
-            )
-        else:
-            return None
-
-    def _build_test_loader(self) -> Optional[Union[TRAIN_DATALOADERS, pl.LightningDataModule]]:
-        test_dataset = self._build_dataset("test")
-        shuffle = False if isinstance(test_dataset, torch.utils.data.IterableDataset) else self.cfg.test.data.shuffle
-        if test_dataset:
-            return torch.utils.data.DataLoader(
-                test_dataset,
-                batch_size=self.cfg.test.batch_size,
-                num_workers=min(os.cpu_count(), self.cfg.test.data.num_workers),
-                shuffle=shuffle,
-                persistent_workers=True,
-            )
-        else:
-            return None
-
-    def training(self) -> None:
-        """
-        All training happens here
-        """
-        if not self.algo:
-            self.algo = self._build_algo()
-        if self.cfg.training.compile:
-            self.algo = torch.compile(self.algo)
-
-        callbacks = []
-        if self.logger:
-            callbacks.append(LearningRateMonitor("step", True))
-        if "checkpointing" in self.cfg.training:
-            callbacks.append(
-                ModelCheckpoint(
-                    pathlib.Path(hydra.core.hydra_config.HydraConfig.get()["runtime"]["output_dir"]) / "checkpoints",
-                    **self.cfg.training.checkpointing,
-                )
-            )
-
-        # TODO do not upload checkpoint to wandb
-
-        # trainer = pl.Trainer(
-        #     accelerator="auto",
-        #     logger=self.logger if self.logger else False,
-        #     devices=torch.cuda.device_count(),
-        #     num_nodes=self.cfg.num_nodes,
-        #     strategy=DDPStrategy(find_unused_parameters=True) if torch.cuda.device_count() > 1 else "auto",
-        #     callbacks=callbacks,
-        #     gradient_clip_val=self.cfg.training.optim.gradient_clip_val,
-        #     val_check_interval=self.cfg.validation.val_every_n_step,
-        #     limit_val_batches=self.cfg.validation.limit_batch,
-        #     check_val_every_n_epoch=self.cfg.validation.val_every_n_epoch,
-        #     accumulate_grad_batches=self.cfg.training.optim.accumulate_grad_batches,
-        #     precision=self.cfg.training.precision,
-        #     detect_anomaly=False,  # self.cfg.debug,
-        #     num_sanity_val_steps=int(self.cfg.debug),
-        #     max_epochs=self.cfg.training.max_epochs,
-        #     max_steps=self.cfg.training.max_steps,
-        #     max_time=self.cfg.training.max_time,
-        # )
-
-        trainer = pl.Trainer(
-            accelerator="auto",
-            devices="auto",  # select devices automatically
-            strategy=DDPStrategy(find_unused_parameters=True) if torch.cuda.device_count() > 1 else "auto",
-            logger=self.logger or False,  # simplified form
-            callbacks=callbacks,
-            gradient_clip_val=self.cfg.training.optim.gradient_clip_val or 0.0,  # ensure a default value
-            val_check_interval=self.cfg.validation.val_every_n_step if self.cfg.validation.val_every_n_step else None,
-            limit_val_batches=self.cfg.validation.limit_batch,
-            check_val_every_n_epoch=self.cfg.validation.val_every_n_epoch if not self.cfg.validation.val_every_n_step else None,
-            accumulate_grad_batches=self.cfg.training.optim.accumulate_grad_batches or 1,  # default accumulation of 1
-            precision=self.cfg.training.precision or 32,  # default to 32-bit precision
-            detect_anomaly=False,  # anomaly detection disabled by default
-            num_sanity_val_steps=int(self.cfg.debug) if self.cfg.debug else 0,
-            max_epochs=self.cfg.training.max_epochs,
-            max_steps=self.cfg.training.max_steps,
-            max_time=self.cfg.training.max_time
-        )
-
-
-        if self.customized_load:
-            if self.load_vae:
-                load_custom_checkpoint(algo=self.algo.diffusion_model.model, optimizer=None, checkpoint_path=self.ckpt_path)
-                load_custom_checkpoint(algo=self.algo.vae, optimizer=None, checkpoint_path=self.vae_path)
-            else:
-                load_custom_checkpoint(algo=self.algo, optimizer=None, checkpoint_path=self.ckpt_path)
-
-            if self.load_t_to_r:
-                param_list = []
-                for name, para in self.algo.diffusion_model.named_parameters():
-                    if 't_' in name and 't_embedder' not in name:
-                        print(name)
-                        param_list.append(para)
-
-                it = 0
-                for name, para in self.algo.diffusion_model.named_parameters():
-                    if 'r_' in name:
-                        para.requires_grad_(False)
-                        try:
-                            para.copy_(param_list[it].detach().cpu())
-                        except:
-                            import pdb;pdb.set_trace()
-                        para.requires_grad_(True)
-                        it += 1
-
-            if self.zero_init_gate:
-                for name, para in self.algo.diffusion_model.named_parameters():
-                    if 'r_adaLN_modulation' in name:
-                        para.requires_grad_(False)
-                        para[2*1024:3*1024] = 0
-                        para[5*1024:6*1024] = 0
-                        para.requires_grad_(True)
-
-            if self.only_tune_refer:
-                for name, para in self.algo.diffusion_model.named_parameters():
-                    para.requires_grad_(False)
-                    if 'r_' in name or 'pose_embedder' in name or 'pose_cond_mlp' in name or 'lora_' in name:
-                        para.requires_grad_(True)
-
-            trainer.fit(
-                self.algo,
-                train_dataloaders=self._build_training_loader(),
-                val_dataloaders=self._build_validation_loader(),
-                ckpt_path=None,
-            )
-        else:
-
-            if self.only_tune_refer:
-                for name, para in self.algo.diffusion_model.named_parameters():
-                    para.requires_grad_(False)
-                    if 'r_' in name or 'pose_embedder' in name or 'pose_cond_mlp' in name or 'lora_' in name:
-                        para.requires_grad_(True)
-
-            trainer.fit(
-                self.algo,
-                train_dataloaders=self._build_training_loader(),
-                val_dataloaders=self._build_validation_loader(),
-                ckpt_path=self.ckpt_path,
-            )
-
-    def validation(self) -> None:
-        """
-        All validation happens here
-        """
-        if not self.algo:
-            self.algo = self._build_algo()
-        if self.cfg.validation.compile:
-            self.algo = torch.compile(self.algo)
-
-        callbacks = []
-
-        trainer = pl.Trainer(
-            accelerator="auto",
-            logger=self.logger,
-            devices="auto",
-            num_nodes=self.cfg.num_nodes,
-            strategy=DDPStrategy(find_unused_parameters=False) if torch.cuda.device_count() > 1 else "auto",
-            callbacks=callbacks,
-            # limit_val_batches=self.cfg.validation.limit_batch,
-            limit_val_batches=self.cfg.validation.limit_batch,
-            precision=self.cfg.validation.precision,
-            detect_anomaly=False,  # self.cfg.debug,
-            inference_mode=self.cfg.validation.inference_mode,
-        )
-
-        if self.customized_load:
-
-            if self.load_vae:
-                load_custom_checkpoint(algo=self.algo.diffusion_model.model, optimizer=None, checkpoint_path=self.ckpt_path)
-                load_custom_checkpoint(algo=self.algo.vae, optimizer=None, checkpoint_path=self.vae_path)
-            else:
-                load_custom_checkpoint(algo=self.algo, optimizer=None, checkpoint_path=self.ckpt_path)
-
-            if self.load_t_to_r:
-                param_list = []
-                for name, para in self.algo.diffusion_model.named_parameters():
-                    if 't_' in name and 't_embedder' not in name:
-                        print(name)
-                        param_list.append(para)
-
-                it = 0
-                for name, para in self.algo.diffusion_model.named_parameters():
-                    if 'r_' in name:
-                        para.requires_grad_(False)
-                        try:
-                            para.copy_(param_list[it].detach().cpu())
-                        except:
-                            import pdb;pdb.set_trace()
-                        para.requires_grad_(True)
-                        it += 1
-
-            if self.zero_init_gate:
-                for name, para in self.algo.diffusion_model.named_parameters():
-                    if 'r_adaLN_modulation' in name:
-                        para.requires_grad_(False)
-                        para[2*1024:3*1024] = 0
-                        para[5*1024:6*1024] = 0
-                        para.requires_grad_(True)
-
-            trainer.validate(
-                self.algo,
-                dataloaders=self._build_validation_loader(),
-                ckpt_path=None,
-            )
-        else:
-            trainer.validate(
-                self.algo,
-                dataloaders=self._build_validation_loader(),
-                ckpt_path=self.ckpt_path,
-            )
-
-    def test(self) -> None:
-        """
-        All testing happens here
-        """
-        if not self.algo:
-            self.algo = self._build_algo()
-        if self.cfg.test.compile:
-            self.algo = torch.compile(self.algo)
-
-        callbacks = []
-
-        trainer = pl.Trainer(
-            accelerator="auto",
-            logger=self.logger,
-            devices="auto",
-            num_nodes=self.cfg.num_nodes,
-            strategy=DDPStrategy(find_unused_parameters=False) if torch.cuda.device_count() > 1 else "auto",
-            callbacks=callbacks,
-            limit_test_batches=self.cfg.test.limit_batch,
-            precision=self.cfg.test.precision,
-            detect_anomaly=False,  # self.cfg.debug,
-        )
-
-        # Only load the checkpoint if only testing. Otherwise, it will have been loaded
-        # and further trained during train.
-        trainer.test(
-            self.algo,
-            dataloaders=self._build_test_loader(),
-            ckpt_path=self.ckpt_path,
-        )
-        if not self.algo:
-            self.algo = self._build_algo()
-        if self.cfg.validation.compile:
-            self.algo = torch.compile(self.algo)
-
-
-    def interactive(self):
-
-        if not self.algo:
-            self.algo = self._build_algo()
-        if self.cfg.validation.compile:
-            self.algo = torch.compile(self.algo)
-
-        if self.customized_load:
-            load_custom_checkpoint(algo=self.algo.diffusion_model, optimizer=None, checkpoint_path=self.diffusion_path)
-            load_custom_checkpoint(algo=self.algo.vae, optimizer=None, checkpoint_path=self.vae_path)
-            load_custom_checkpoint(algo=self.algo.pose_prediction_model, optimizer=None, checkpoint_path=self.pose_predictor_path)
-            return self.algo
-        else:
-            raise NotImplementedError
-
-    def _build_dataset(self, split: str) -> Optional[torch.utils.data.Dataset]:
-        if split in ["training", "test", "validation"]:
-            return self.compatible_datasets[self.root_cfg.dataset._name](self.root_cfg.dataset, split=split)
-        else:
-            raise NotImplementedError(f"split '{split}' is not implemented")
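The `load_custom_checkpoint` helper above dispatches on the checkpoint path. A condensed, standalone sketch of that dispatch (the `hf_hub_download` and `.safetensors` branches of the original are omitted; the name `load_weights` is illustrative):

```python
# Condensed sketch of the suffix-based dispatch in the deleted
# load_custom_checkpoint; load_weights is an illustrative name.
import os
from pathlib import Path

import torch

def load_weights(model: torch.nn.Module, checkpoint_path) -> None:
    path = Path(checkpoint_path)
    if path.suffix == ".pt":
        model.load_state_dict(torch.load(path, weights_only=True), strict=False)
    elif path.suffix == ".ckpt":
        ckpt = torch.load(path, map_location="cpu")
        model.load_state_dict(ckpt["state_dict"], strict=False)
    elif path.is_dir():
        # like the original, pick the lexicographically largest .ckpt via max()
        candidates = [f for f in os.listdir(path) if f.endswith(".ckpt")]
        if not candidates:
            raise FileNotFoundError("No .ckpt file found in the specified folder!")
        ckpt = torch.load(path / max(candidates), map_location="cpu")
        model.load_state_dict(ckpt["state_dict"], strict=False)
    else:
        raise ValueError(f"Unsupported checkpoint format: {path.suffix}")
```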
experiments/exp_pose.py
DELETED
@@ -1,310 +0,0 @@
-"""
-This repo is forked from [Boyuan Chen](https://boyuan.space/)'s research
-template [repo](https://github.com/buoyancy99/research-template).
-By its MIT license, you must keep the above sentence in `README.md`
-and the `LICENSE` file to credit the author.
-"""
-
-from abc import ABC, abstractmethod
-from typing import Optional, Union, Literal, List, Dict
-import pathlib
-import os
-
-import hydra
-import torch
-from lightning.pytorch.strategies.ddp import DDPStrategy
-
-import lightning.pytorch as pl
-from lightning.pytorch.loggers.wandb import WandbLogger
-from lightning.pytorch.utilities.types import TRAIN_DATALOADERS
-from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint
-from pytorch_lightning.utilities import rank_zero_info
-
-from omegaconf import DictConfig
-
-from utils.print_utils import cyan
-from utils.distributed_utils import is_rank_zero
-from safetensors.torch import load_model
-from pathlib import Path
-from algorithms.worldmem import PosePrediction
-from datasets.video import MinecraftVideoPoseDataset
-
-
-torch.set_float32_matmul_precision("high")
-
-def load_custom_checkpoint(algo, optimizer, checkpoint_path):
-    if not checkpoint_path:
-        rank_zero_info("No checkpoint path provided, skipping checkpoint loading.")
-        return None
-
-    if not isinstance(checkpoint_path, Path):
-        checkpoint_path = Path(checkpoint_path)
-
-    if checkpoint_path.suffix == ".pt":
-        ckpt = torch.load(checkpoint_path, weights_only=True)
-        algo.load_state_dict(ckpt, strict=False)
-    elif checkpoint_path.suffix == ".ckpt":
-        ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu'))
-        algo.load_state_dict(ckpt['state_dict'], strict=False)
-    elif checkpoint_path.suffix == ".safetensors":
-        load_model(algo, checkpoint_path, strict=False)
-    elif os.path.isdir(checkpoint_path):
-        ckpt_files = [f for f in os.listdir(checkpoint_path) if f.endswith('.ckpt')]
-        if not ckpt_files:
-            raise FileNotFoundError("No .ckpt file found in the specified folder!")
-        selected_ckpt = max(ckpt_files)
-        selected_ckpt_path = os.path.join(checkpoint_path, selected_ckpt)
-        print(f"Loading checkpoint file: {selected_ckpt_path}")
-
-        ckpt = torch.load(selected_ckpt_path, map_location=torch.device('cpu'))
-        algo.load_state_dict(ckpt['state_dict'], strict=False)
-
-    rank_zero_info("Model weights loaded.")
-
-class PoseExperiment(ABC):
-    """
-    Abstract class for an experiment. This generalizes the pytorch lightning Trainer & lightning Module to more
-    flexible experiments that don't fit in the typical ml loop, e.g. multi-stage reinforcement learning benchmarks.
-    """
-
-    # each key has to be a yaml file under '[project_root]/configurations/algorithm' without .yaml suffix
-    compatible_algorithms = dict(
-        pose_prediction=PosePrediction
-    )
-
-    compatible_datasets = dict(
-        video_minecraft_pose=MinecraftVideoPoseDataset
-    )
-
-    def __init__(
-        self,
-        root_cfg: DictConfig,
-        logger: Optional[WandbLogger] = None,
-        ckpt_path: Optional[Union[str, pathlib.Path]] = None,
-    ) -> None:
-        """
-        Constructor
-
-        Args:
-            cfg: configuration file that contains everything about the experiment
-            logger: a pytorch-lightning WandbLogger instance
-            ckpt_path: an optional path to saved checkpoint
-        """
-        super().__init__()
-        self.root_cfg = root_cfg
-        self.cfg = root_cfg.experiment
-        self.debug = root_cfg.debug
-        self.logger = logger
-        self.ckpt_path = ckpt_path
-        self.algo = None
-        self.vae_path = "/cpfs01/user/xiaozeqi/.cache/huggingface/hub/models--Etched--oasis-500m/snapshots/4ca7d2d811f4f0c6fd1d5719bf83f14af3446c0c/vit-l-20.safetensors"
-
-    def _build_algo(self):
-        """
-        Build the lightning module
-        :return: a pytorch-lightning module to be launched
-        """
-        algo_name = self.root_cfg.algorithm._name
-        if algo_name not in self.compatible_algorithms:
-            raise ValueError(
-                f"Algorithm {algo_name} not found in compatible_algorithms for this Experiment class. "
-                "Make sure you define compatible_algorithms correctly and make sure that each key has "
-                "same name as yaml file under '[project_root]/configurations/algorithm' without .yaml suffix"
-            )
-        return self.compatible_algorithms[algo_name](self.root_cfg.algorithm)
-
-    def exec_task(self, task: str) -> None:
-        """
-        Executing a certain task specified by string. Each task should be a stage of experiment.
-        In most computer vision / nlp applications, tasks should be just train and test.
-        In reinforcement learning, you might have more stages such as collecting dataset etc
-
-        Args:
-            task: a string specifying a task implemented for this experiment
-        """
-        if hasattr(self, task) and callable(getattr(self, task)):
-            if is_rank_zero:
-                print(cyan("Executing task:"), f"{task} out of {self.cfg.tasks}")
-            getattr(self, task)()
-        else:
-            raise ValueError(
-                f"Specified task '{task}' not defined for class {self.__class__.__name__} or is not callable."
-            )
-
-
-    def _build_trainer_callbacks(self):
-        callbacks = []
-        if self.logger:
-            callbacks.append(LearningRateMonitor("step", True))
-
-    def _build_training_loader(self) -> Optional[Union[TRAIN_DATALOADERS, pl.LightningDataModule]]:
-        train_dataset = self._build_dataset("training")
-        shuffle = (
-            False if isinstance(train_dataset, torch.utils.data.IterableDataset) else self.cfg.training.data.shuffle
-        )
-        if train_dataset:
-            return torch.utils.data.DataLoader(
-                train_dataset,
-                batch_size=self.cfg.training.batch_size,
-                num_workers=min(os.cpu_count(), self.cfg.training.data.num_workers),
-                shuffle=shuffle,
-                persistent_workers=True,
-            )
-        else:
-            return None
-
-    def _build_validation_loader(self) -> Optional[Union[TRAIN_DATALOADERS, pl.LightningDataModule]]:
-        validation_dataset = self._build_dataset("validation")
-        shuffle = (
-            False
-            if isinstance(validation_dataset, torch.utils.data.IterableDataset)
-            else self.cfg.validation.data.shuffle
-        )
-        if validation_dataset:
-            return torch.utils.data.DataLoader(
-                validation_dataset,
-                batch_size=self.cfg.validation.batch_size,
-                num_workers=min(os.cpu_count(), self.cfg.validation.data.num_workers),
-                shuffle=shuffle,
-                persistent_workers=True,
-            )
-        else:
-            return None
-
-    def _build_test_loader(self) -> Optional[Union[TRAIN_DATALOADERS, pl.LightningDataModule]]:
-        test_dataset = self._build_dataset("test")
-        shuffle = False if isinstance(test_dataset, torch.utils.data.IterableDataset) else self.cfg.test.data.shuffle
-        if test_dataset:
-            return torch.utils.data.DataLoader(
-                test_dataset,
-                batch_size=self.cfg.test.batch_size,
-                num_workers=min(os.cpu_count(), self.cfg.test.data.num_workers),
-                shuffle=shuffle,
-                persistent_workers=True,
-            )
-        else:
-            return None
-
-    def training(self) -> None:
-        """
-        All training happens here
-        """
-        if not self.algo:
-            self.algo = self._build_algo()
-        if self.cfg.training.compile:
-            self.algo = torch.compile(self.algo)
-
-        callbacks = []
-        if self.logger:
-            callbacks.append(LearningRateMonitor("step", True))
-        if "checkpointing" in self.cfg.training:
-            callbacks.append(
-                ModelCheckpoint(
-                    pathlib.Path(hydra.core.hydra_config.HydraConfig.get()["runtime"]["output_dir"]) / "checkpoints",
-                    **self.cfg.training.checkpointing,
-                )
-            )
-
-        trainer = pl.Trainer(
-            accelerator="auto",
-            devices="auto",  # select devices automatically
-            strategy=DDPStrategy(find_unused_parameters=True) if torch.cuda.device_count() > 1 else "auto",
-            logger=self.logger or False,  # simplified form
-            callbacks=callbacks,
-            gradient_clip_val=self.cfg.training.optim.gradient_clip_val or 0.0,  # ensure a default value
-            val_check_interval=self.cfg.validation.val_every_n_step if self.cfg.validation.val_every_n_step else None,
-            limit_val_batches=self.cfg.validation.limit_batch,
-            check_val_every_n_epoch=self.cfg.validation.val_every_n_epoch if not self.cfg.validation.val_every_n_step else None,
-            accumulate_grad_batches=self.cfg.training.optim.accumulate_grad_batches or 1,  # default accumulation of 1
-            precision=self.cfg.training.precision or 32,  # default to 32-bit precision
-            detect_anomaly=False,  # anomaly detection disabled by default
-            num_sanity_val_steps=int(self.cfg.debug) if self.cfg.debug else 0,
-            max_epochs=self.cfg.training.max_epochs,
-            max_steps=self.cfg.training.max_steps,
-            max_time=self.cfg.training.max_time
-        )
-
-        load_custom_checkpoint(algo=self.algo.vae, optimizer=None, checkpoint_path=self.vae_path)
-
-        trainer.fit(
-            self.algo,
-            train_dataloaders=self._build_training_loader(),
-            val_dataloaders=self._build_validation_loader(),
-            ckpt_path=self.ckpt_path,
-        )
-
-    def validation(self) -> None:
-        """
-        All validation happens here
-        """
-        if not self.algo:
-            self.algo = self._build_algo()
-        if self.cfg.validation.compile:
-            self.algo = torch.compile(self.algo)
-
-        callbacks = []
-
-        trainer = pl.Trainer(
-            accelerator="auto",
-            logger=self.logger,
-            devices="auto",
-            num_nodes=self.cfg.num_nodes,
-            strategy=DDPStrategy(find_unused_parameters=False) if torch.cuda.device_count() > 1 else "auto",
-            callbacks=callbacks,
-            # limit_val_batches=self.cfg.validation.limit_batch,
-            limit_val_batches=self.cfg.validation.limit_batch,
-            precision=self.cfg.validation.precision,
-            detect_anomaly=False,  # self.cfg.debug,
-            inference_mode=self.cfg.validation.inference_mode,
-        )
-
-        load_custom_checkpoint(algo=self.algo.vae, optimizer=None, checkpoint_path=self.vae_path)
-
-        trainer.validate(
-            self.algo,
-            dataloaders=self._build_validation_loader(),
-            ckpt_path=self.ckpt_path,
-        )
-
-    def test(self) -> None:
-        """
-        All testing happens here
-        """
-        if not self.algo:
-            self.algo = self._build_algo()
-        if self.cfg.test.compile:
-            self.algo = torch.compile(self.algo)
-
-        callbacks = []
-
-        trainer = pl.Trainer(
-            accelerator="auto",
-            logger=self.logger,
-            devices="auto",
-            num_nodes=self.cfg.num_nodes,
-            strategy=DDPStrategy(find_unused_parameters=False) if torch.cuda.device_count() > 1 else "auto",
-            callbacks=callbacks,
-            limit_test_batches=self.cfg.test.limit_batch,
-            precision=self.cfg.test.precision,
-            detect_anomaly=False,  # self.cfg.debug,
-        )
-
-        # Only load the checkpoint if only testing. Otherwise, it will have been loaded
-        # and further trained during train.
-        trainer.test(
-            self.algo,
-            dataloaders=self._build_test_loader(),
-            ckpt_path=self.ckpt_path,
-        )
-        if not self.algo:
-            self.algo = self._build_algo()
-        if self.cfg.validation.compile:
-            self.algo = torch.compile(self.algo)
-
-    def _build_dataset(self, split: str) -> Optional[torch.utils.data.Dataset]:
-        if split in ["training", "test", "validation"]:
-            return self.compatible_datasets[self.root_cfg.dataset._name](self.root_cfg.dataset, split=split)
-        else:
-            raise NotImplementedError(f"split '{split}' is not implemented")
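Both experiment classes dispatch pipeline stages by name through `exec_task`. A minimal runnable sketch of that pattern (the `Experiment` class here is illustrative, not from the repo):

```python
# Minimal sketch of the string-based task dispatch used by exec_task:
# each task name must correspond to a callable method on the experiment.
class Experiment:
    def training(self):
        print("running training stage")

    def exec_task(self, task: str) -> None:
        if hasattr(self, task) and callable(getattr(self, task)):
            getattr(self, task)()
        else:
            raise ValueError(f"Task '{task}' is not defined or not callable.")

Experiment().exec_task("training")  # prints "running training stage"
```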
experiments/exp_video.py
DELETED
@@ -1,25 +0,0 @@
-from datasets.video import (
-    MinecraftVideoDataset,
-    MinecraftVideoPoseDataset
-)
-
-from algorithms.worldmem import WorldMemMinecraft
-from algorithms.worldmem import PosePrediction
-from .exp_base import BaseLightningExperiment
-
-
-class VideoPredictionExperiment(BaseLightningExperiment):
-    """
-    A video prediction experiment
-    """
-
-    compatible_algorithms = dict(
-        df_video_worldmemminecraft=WorldMemMinecraft,
-        pose_prediction=PosePrediction
-    )
-
-    compatible_datasets = dict(
-        # video datasets
-        video_minecraft=MinecraftVideoDataset,
-        video_minecraft_pose=MinecraftVideoPoseDataset
-    )
main.py
DELETED
|
@@ -1,219 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
This repo is forked from [Boyuan Chen](https://boyuan.space/)'s research
|
| 3 |
-
template [repo](https://github.com/buoyancy99/research-template).
|
| 4 |
-
By its MIT license, you must keep the above sentence in `README.md`
|
| 5 |
-
and the `LICENSE` file to credit the author.
|
| 6 |
-
|
| 7 |
-
Main file for the project. This will create and run new experiments and load checkpoints from wandb.
|
| 8 |
-
Borrowed part of the code from David Charatan and wandb.
|
| 9 |
-
"""
|
| 10 |
-
|
| 11 |
-
import sys
|
| 12 |
-
import subprocess
|
| 13 |
-
import time
|
| 14 |
-
from pathlib import Path
|
| 15 |
-
|
| 16 |
-
import hydra
|
| 17 |
-
from omegaconf import DictConfig, OmegaConf
|
| 18 |
-
from omegaconf.omegaconf import open_dict
|
| 19 |
-
|
| 20 |
-
from utils.print_utils import cyan
|
| 21 |
-
from utils.ckpt_utils import download_latest_checkpoint, is_run_id
|
| 22 |
-
from utils.cluster_utils import submit_slurm_job
|
| 23 |
-
from utils.distributed_utils import is_rank_zero
|
| 24 |
-
|
| 25 |
-
def get_latest_checkpoint(checkpoint_folder: Path, pattern: str = '*.ckpt'):
|
| 26 |
-
# 获取文件夹中所有符合 pattern 的文件
|
| 27 |
-
checkpoint_files = list(checkpoint_folder.glob(pattern))
|
| 28 |
-
if not checkpoint_files:
|
| 29 |
-
return None # 如果没有找到 checkpoint 文件,返回 None
|
| 30 |
-
# 根据文件修改时间(st_mtime)选取最新的文件
|
| 31 |
-
latest_checkpoint = max(checkpoint_files, key=lambda f: f.stat().st_mtime)
|
| 32 |
-
return latest_checkpoint
|
| 33 |
-
|
| 34 |
-
-def run_local(cfg: DictConfig):
-    # delay some imports in case they are not needed in non-local envs for submission
-    from experiments import build_experiment
-    from utils.wandb_utils import OfflineWandbLogger, SpaceEfficientWandbLogger
-
-    # Get yaml names
-    hydra_cfg = hydra.core.hydra_config.HydraConfig.get()
-    cfg_choice = OmegaConf.to_container(hydra_cfg.runtime.choices)
-
-    with open_dict(cfg):
-        if cfg_choice["experiment"] is not None:
-            cfg.experiment._name = cfg_choice["experiment"]
-        if cfg_choice["dataset"] is not None:
-            cfg.dataset._name = cfg_choice["dataset"]
-        if cfg_choice["algorithm"] is not None:
-            cfg.algorithm._name = cfg_choice["algorithm"]
-
-    # Set up the output directory.
-    output_dir = getattr(cfg, "output_dir", None)
-    if output_dir is not None:
-        OmegaConf.set_readonly(hydra_cfg, False)
-        hydra_cfg.runtime.output_dir = output_dir
-        OmegaConf.set_readonly(hydra_cfg, True)
-
-    output_dir = Path(hydra_cfg.runtime.output_dir)
-
-    if is_rank_zero:
-        print(cyan("Outputs will be saved to:"), output_dir)
-        (output_dir.parents[1] / "latest-run").unlink(missing_ok=True)
-        (output_dir.parents[1] / "latest-run").symlink_to(output_dir, target_is_directory=True)
-
-    # Set up logging with wandb.
-    if cfg.wandb.mode != "disabled":
-        # If resuming, merge into the existing run on wandb.
-        resume = cfg.get("resume", None)
-        name = f"{cfg.name} ({output_dir.parent.name}/{output_dir.name})" if resume is None else None
-
-        if "_on_compute_node" in cfg and cfg.cluster.is_compute_node_offline:
-            logger_cls = OfflineWandbLogger
-        else:
-            logger_cls = SpaceEfficientWandbLogger
-
-        offline = cfg.wandb.mode != "online"
-        logger = logger_cls(
-            name=name,
-            save_dir=str(output_dir),
-            offline=offline,
-            entity=cfg.wandb.entity,
-            project=cfg.wandb.project,
-            log_model=False,
-            config=OmegaConf.to_container(cfg),
-            id=resume,
-            resume="auto",
-        )
-    else:
-        logger = None
-
-    # Load checkpoint
-    resume = cfg.get("resume", None)
-    load = cfg.get("load", None)
-    checkpoint_path = None
-    load_id = None
-    if load and not is_run_id(load):
-        checkpoint_path = load
-    if resume:
-        load_id = resume
-    elif load and is_run_id(load):
-        load_id = load
-    else:
-        load_id = None
-
-    if load_id:
-        run_path = f"{cfg.wandb.entity}/{cfg.wandb.project}/{load_id}"
-        checkpoint_path = Path("outputs/downloaded") / run_path / "model.ckpt"
-        checkpoint_path = output_dir / get_latest_checkpoint(output_dir / "checkpoints")
-
-    if checkpoint_path and is_rank_zero:
-        print(f"Will load checkpoint from {checkpoint_path}")
-
-    # launch experiment
-    experiment = build_experiment(cfg, logger, checkpoint_path)
-    for task in cfg.experiment.tasks:
-        experiment.exec_task(task)
-
-
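The resume/load branching above is easy to misread; note in particular that inside `if load_id:` the wandb download path is computed and then immediately shadowed by the newest local checkpoint. A condensed sketch of the same resolution rules; the function name and parameter list are ours, not an API from the repo:

    def resolve_checkpoint(cfg, output_dir, is_run_id, get_latest_checkpoint):
        # Sketch of the deleted logic above, for readability only.
        resume = cfg.get("resume", None)
        load = cfg.get("load", None)
        checkpoint_path = load if (load and not is_run_id(load)) else None
        load_id = resume or (load if (load and is_run_id(load)) else None)
        if load_id:
            # Mirrors the diff: the downloaded-model path is overwritten by
            # the latest checkpoint found under the local output directory.
            checkpoint_path = output_dir / get_latest_checkpoint(output_dir / "checkpoints")
        return checkpoint_path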
-def run_slurm(cfg: DictConfig):
-    python_args = " ".join(sys.argv[1:]) + " +_on_compute_node=True"
-    project_root = Path.cwd()
-    while not (project_root / ".git").exists():
-        project_root = project_root.parent
-        if project_root == Path("/"):
-            raise Exception("Could not find repo directory!")
-
-    slurm_log_dir = submit_slurm_job(
-        cfg,
-        python_args,
-        project_root,
-    )
-
-    if "cluster" in cfg and cfg.cluster.is_compute_node_offline and cfg.wandb.mode == "online":
-        print("Job submitted to a compute node without internet. This requires manual syncing on the login node.")
-        osh_command_dir = project_root / ".wandb_osh_command_dir"
-
-        osh_proc = subprocess.Popen(["wandb-osh", "--command-dir", osh_command_dir])
-        print(f"Running wandb-osh in background... PID: {osh_proc.pid}")
-        print(f"To kill the sync process, run 'kill {osh_proc.pid}' in the terminal.")
-        print(
-            "You can manually start a sync loop later by running the following:",
-            cyan(f"wandb-osh --command-dir {osh_command_dir}"),
-        )
-
-    print(
-        "Once the job gets allocated and starts running, we will print a command below "
-        "for you to trace the errors and outputs: (Ctrl + C to exit without waiting)"
-    )
-    msg = f"tail -f {slurm_log_dir}/* \n"
-    try:
-        while not list(slurm_log_dir.glob("*.out")) and not list(slurm_log_dir.glob("*.err")):
-            time.sleep(1)
-        print(cyan("To trace the outputs and errors, run the following command:"), msg)
-    except KeyboardInterrupt:
-        print("Keyboard interrupt detected. Exiting...")
-        print(
-            cyan("To trace the outputs and errors, manually wait for the job to start and run the following command:"),
-            msg,
-        )
-
-
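The sync loop above is left running in the background with no cleanup. A minimal sketch, reusing the same wandb-osh CLI invocation shown in the diff, that ties the sync process's lifetime to the submitting process; the function name is ours:

    import atexit
    import subprocess

    def start_osh_sync(command_dir):
        # Launch the wandb-osh sync loop and terminate it when this process exits.
        proc = subprocess.Popen(["wandb-osh", "--command-dir", str(command_dir)])
        atexit.register(proc.terminate)
        return proc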
-@hydra.main(
-    version_base=None,
-    config_path="configurations",
-    config_name="config",
-)
-def run(cfg: DictConfig):
-    if "_on_compute_node" in cfg and cfg.cluster.is_compute_node_offline:
-        with open_dict(cfg):
-            if cfg.cluster.is_compute_node_offline and cfg.wandb.mode == "online":
-                cfg.wandb.mode = "offline"
-
-    if "name" not in cfg:
-        raise ValueError("must specify a name for the run with command line argument '+name=[name]'")
-
-    if not cfg.wandb.get("entity", None):
-        raise ValueError(
-            "must specify wandb entity in 'configurations/config.yaml' or with command line"
-            " argument 'wandb.entity=[entity]'. \nAn entity is your wandb user name or group"
-            " name. This is used for logging. If you don't have a wandb account, please sign up at https://wandb.ai/"
-        )
-
-    if cfg.wandb.project is None:
-        cfg.wandb.project = str(Path(__file__).parent.name)
-
-    # If resuming or loading a wandb ckpt and not on a compute node, download the checkpoint.
-    resume = cfg.get("resume", None)
-    load = cfg.get("load", None)
-
-    if resume and load:
-        raise ValueError(
-            "When resuming a wandb run with `resume=[wandb id]`, the checkpoint will be loaded from the cloud"
-            " and `load` should not be specified."
-        )
-
-    if resume:
-        load_id = resume
-    elif load and is_run_id(load):
-        load_id = load
-    else:
-        load_id = None
-
-    # if load_id and "_on_compute_node" not in cfg:
-    #     run_path = f"{cfg.wandb.entity}/{cfg.wandb.project}/{load_id}"
-    #     download_latest_checkpoint(run_path, Path("outputs/downloaded"))
-
-    if "cluster" in cfg and "_on_compute_node" not in cfg:
-        print(cyan("Slurm detected, submitting to compute node instead of running locally..."))
-        run_slurm(cfg)
-    else:
-        run_local(cfg)
-
-
-if __name__ == "__main__":
-    run()  # pylint: disable=no-value-for-parameter
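The resume/load dispatch above hinges on `is_run_id`, whose definition lives in utils.ckpt_utils and is not part of this diff. A hypothetical stand-in, assuming wandb's usual 8-character lowercase alphanumeric run ids:

    import re

    def is_run_id(value: str) -> bool:
        # Assumption: a wandb run id is 8 lowercase alphanumeric characters.
        return bool(re.fullmatch(r"[a-z0-9]{8}", value))

A run would then be launched with Hydra-style overrides, e.g. `python main.py +name=my_run wandb.entity=my_entity` or `python main.py +name=my_run resume=<wandb id>` (placeholder values).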
scripts/README.md
DELETED
@@ -1,10 +0,0 @@
-# scripts
-
-The `scripts` folder contains bash scripts for scaling up your project on the cloud.
-Don't put your jupyter notebooks here! They belong to the `debug` folder.
-
-General scripts that are useful for all projects can be put directly in the `scripts` folder.
-
----
-
-This repo is forked from [Boyuan Chen](https://boyuan.space/)'s research template [repo](https://github.com/buoyancy99/research-template). By its MIT license, you must keep the above sentence in `README.md` and the `LICENSE` file to credit the author.
scripts/dummy_script.sh
DELETED
@@ -1 +0,0 @@
-echo 'hello world'
split_checkpoint.py
DELETED
@@ -1,9 +0,0 @@
-import torch
-
-ckpt_path = "/mnt/xiaozeqi/diffusionforcing/outputs/2025-03-28/16-45-11/checkpoints/epoch0step595000.ckpt"
-checkpoint = torch.load(ckpt_path, map_location="cpu")  # change map_location as needed
-
-state_dict = checkpoint['state_dict']
-pose_prediction_model_dict = {k.replace('pose_prediction_model.', ''): v for k, v in state_dict.items() if k.startswith('pose_prediction_model.')}
-
-torch.save({'state_dict': pose_prediction_model_dict}, "pose_prediction_model_only.ckpt")
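The deleted script hard-codes one checkpoint path and one key prefix. A generalized sketch of the same extraction, with function and argument names of our choosing:

    import torch

    def extract_submodule(ckpt_path: str, prefix: str, out_path: str) -> None:
        # Load the full Lightning checkpoint on CPU.
        checkpoint = torch.load(ckpt_path, map_location="cpu")
        state_dict = checkpoint["state_dict"]
        # Keep only keys under `prefix`, stripping the prefix itself.
        sub = {k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)}
        torch.save({"state_dict": sub}, out_path)

    # e.g. extract_submodule(ckpt_path, "pose_prediction_model.", "pose_prediction_model_only.ckpt")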