diff-storyboard / examples /HunyuanVideo /hunyuanvideo_i2v_24G.py

Upload folder using huggingface_hub

feb33a0 verified 11 days ago

1.65 kB

	import torch
	from diffsynth import ModelManager, HunyuanVideoPipeline, download_models, save_video
	from modelscope import dataset_snapshot_download
	from PIL import Image


	# Download the "HunyuanVideoI2V" model files, then create the manager that
	# the checkpoints below are loaded into.
	download_models(["HunyuanVideoI2V"])
	model_manager = ModelManager()

	# DiT checkpoint: loaded on the CPU in bfloat16.
	dit_checkpoints = [
	    "models/HunyuanVideoI2V/transformers/mp_rank_00_model_states.pt",
	]
	model_manager.load_models(dit_checkpoints, torch_dtype=torch.bfloat16, device="cpu")

	# Remaining modules (the two text encoders and the VAE): loaded on the CPU
	# in float16.
	auxiliary_checkpoints = [
	    "models/HunyuanVideoI2V/text_encoder/model.safetensors",
	    "models/HunyuanVideoI2V/text_encoder_2",
	    "models/HunyuanVideoI2V/vae/pytorch_model.pt",
	]
	model_manager.load_models(auxiliary_checkpoints, torch_dtype=torch.float16, device="cpu")
	# Build the pipeline from the loaded models; computation runs on "cuda"
	# with bfloat16 as the pipeline dtype.
	# NOTE(review): enable_vram_management presumably reduces peak VRAM use at
	# some cost in speed -- confirm against the diffsynth documentation.
	pipe = HunyuanVideoPipeline.from_model_manager(
	    model_manager,
	    torch_dtype=torch.bfloat16,
	    device="cuda",
	    enable_vram_management=True,
	)

	# Download the example input files for this script into the current directory.
	dataset_snapshot_download(
	    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
	    local_dir="./",
	    # Plain string literal: the pattern has no placeholders, so no f-prefix.
	    allow_file_pattern="data/examples/hunyuanvideo/*",
	)

	i2v_resolution = "720p"
	prompt = "An Asian man with short hair in black tactical uniform and white clothes waves a firework stick."

	# Open the input image inside a context manager so its file handle is closed
	# deterministically. convert() returns a separate RGB image, so the list
	# remains usable after the file has been closed.
	with Image.open("data/examples/hunyuanvideo/0.jpg") as source_image:
	    images = [source_image.convert("RGB")]

	# Run image-to-video generation with a fixed seed, then write the result to
	# an .mp4 whose name embeds the chosen resolution.
	video = pipe(prompt, input_images=images, num_inference_steps=50, seed=0, i2v_resolution=i2v_resolution)
	save_video(video, f"video_{i2v_resolution}_low_vram.mp4", fps=30, quality=6)