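# End-to-end AnimateLCM demo: build an SD 1.5 AnimationPipeline with the AnimateLCM
# motion module and LCM LoRA, render a 1440-frame video from the prompt-travel config,
# generate a matching music track with AudioLDM2, mux both into video.mp4 with PyAV,
# and finally launch a small Gradio interface.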
import gradio

import av, pathlib, diffusers, torch, transformers, builtins, numpy, re
from animatediff.generate import controlnet_preprocess, img2img_preprocess, wild_card_conversion, region_preprocess, unload_controlnet_models
from animatediff.settings import get_model_config, get_infer_config
from animatediff.utils.pipeline import send_to_device
from animatediff.utils.util import set_tensor_interpolation_method
from animatediff.pipelines import load_text_embeddings
from animatediff.pipelines.lora import load_lcm_lora
import huggingface_hub
import animatediff.models.unet, animatediff.pipelines

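# Target output: 1440 frames at 432x768 (~3 minutes at the 8 fps used when muxing below,
# matching the 180 s music track generated later). The prompt-travel config drives the
# prompts and the ControlNet / img2img preprocessing that follows.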
width=432
height=768
length=1440
model_config = get_model_config('config/prompts/prompt_travel.json')
is_sdxl = False
infer_config = get_infer_config(True, is_sdxl)
set_tensor_interpolation_method(model_config.tensor_interpolation_slerp)
device = torch.device('cuda')
save_dir = pathlib.Path('output')
controlnet_image_map, controlnet_type_map, controlnet_ref_map, controlnet_no_shrink = controlnet_preprocess(model_config.controlnet_map, width, height, length, save_dir, device, is_sdxl)
img2img_map = img2img_preprocess(model_config.img2img_map, width, height, length, save_dir)

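# Materialize the SD 1.5 base weights locally so the tokenizer, text encoder, UNet and
# feature extractor can be loaded from its subfolders.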
base_model = pathlib.Path('/tmp/base')
diffusers.StableDiffusionPipeline.from_pretrained('stable-diffusion-v1-5/stable-diffusion-v1-5').save_pretrained(base_model)

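# Assemble the individual components: CLIP tokenizer and text encoder, the
# vae-ft-mse fine-tuned VAE, and a motion-aware UNet built from the SD 1.5 weights
# plus the AnimateLCM motion module checkpoint.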
tokenizer = transformers.CLIPTokenizer.from_pretrained(base_model, subfolder='tokenizer')
text_encoder = transformers.CLIPTextModel.from_pretrained(base_model, subfolder='text_encoder')
vae = diffusers.AutoencoderKL.from_single_file('https://huggingface.co/chaowenguoback/pal/blob/main/vae-ft-mse-840000-ema-pruned.safetensors')
huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v.ckpt', local_dir=pathlib.Path.cwd())
unet = animatediff.models.unet.UNet2DConditionModel.from_pretrained_2d(
    pretrained_model_path=base_model,
    motion_module_path=pathlib.Path.cwd().joinpath('AnimateLCM_sd15_t2v.ckpt'),
    subfolder='unet',
    unet_additional_kwargs=infer_config.unet_additional_kwargs,
)
# The CLIP image processor is used below when assembling the AnimationPipeline, so keep it
# in its own variable rather than passing it to from_pretrained_2d.
feature_extractor = transformers.CLIPImageProcessor.from_pretrained(base_model, subfolder='feature_extractor')

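# Swap in the chilloutMix-Ni checkpoint: copy its UNet and text encoder weights into the
# motion UNet and text encoder (strict=False keeps the motion-module parameters intact),
# then drop the temporary pipeline.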
pipeline = diffusers.StableDiffusionPipeline.from_single_file('https://huggingface.co/chaowenguoback/15/blob/main/chilloutMix-Ni.safetensors', config='stable-diffusion-v1-5/stable-diffusion-v1-5', safety_checker=None, use_safetensors=True)
unet.load_state_dict(pipeline.unet.state_dict(), strict=False)
text_encoder.load_state_dict(pipeline.text_encoder.state_dict(), strict=False)
del pipeline

unet.enable_xformers_memory_efficient_attention()

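# Build the AnimateDiff AnimationPipeline around the motion UNet, using an LCM scheduler
# configured from the inference defaults.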
pipeline = animatediff.pipelines.AnimationPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=diffusers.LCMScheduler.from_config(infer_config.noise_scheduler_kwargs),
    feature_extractor=feature_extractor,
    controlnet_map=None,
)

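# Attach the AnimateLCM LCM LoRA (downloaded into the path expected by load_lcm_lora),
# plus several style/character LoRAs, each with its own weight.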
lcm_lora = pathlib.Path.cwd().joinpath('data/models/lcm_lora/sd15')
lcm_lora.mkdir(parents=True, exist_ok=True)
huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v_lora.safetensors', local_dir=lcm_lora)
load_lcm_lora(pipeline, {'start_scale':0.15, 'end_scale':0.75, 'gradient_start':0.2, 'gradient_end':0.75}, is_sdxl=is_sdxl)
pipeline.lora_map = None
pipeline.load_lora_weights('chaowenguoback/15', weight_name='add_detail.safetensors', adapter_name='detail')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='b1r1av5-000007.safetensors', adapter_name='bikini')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='btcstr.safetensors', adapter_name='c-string')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='蓝洁瑛.safetensors', adapter_name='character')
pipeline.set_adapters(['detail', 'bikini', 'c-string', 'character'], [1, 0.4, 0.2, 0.8])

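# Cast to fp16, load any extra text embeddings into the text encoder, then move the
# pipeline to the GPU and expand any wildcards in the prompts.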
pipeline.unet = pipeline.unet.half()
pipeline.text_encoder = pipeline.text_encoder.half()
pipeline.text_encoder = pipeline.text_encoder.to(device)
load_text_embeddings(pipeline)
pipeline.text_encoder = pipeline.text_encoder.to('cpu')
pipeline = send_to_device(pipeline, device, freeze=True, force_half=False, compile=False, is_sdxl=is_sdxl)
wild_card_conversion(model_config)

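# Resolve per-region prompts/conditions from the config, then remap each ControlNet's
# control_region_list from region names to indices, dropping entries that resolve to -1.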
is_init_img_exist = img2img_map is not None
region_condi_list, region_list, ip_adapter_config_map, region2index = region_preprocess(model_config, width, height, length, save_dir, is_init_img_exist, is_sdxl)

if controlnet_type_map:
    for c in controlnet_type_map:
        tmp_r = [region2index[r] for r in controlnet_type_map[c]["control_region_list"]]
        controlnet_type_map[c]["control_region_list"] = [r for r in tmp_r if r != -1]

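# Derive a short, filesystem-safe tag string from the first prompt (up to 6 tags, 50 chars).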
prompt_map = region_condi_list[0]["prompt_map"]
prompt_tags = [re.compile(r"[^\w\-, ]").sub("", tag).strip().replace(" ", "-") for tag in prompt_map[list(prompt_map.keys())[0]].split(",")]
prompt_str = "_".join((prompt_tags[:6]))[:50]

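# Run the sampler: 8 LCM steps, 16-frame sliding context windows with 4-frame overlap,
# plus the ControlNet / img2img / IP-Adapter / region inputs prepared above.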
output = pipeline(
    n_prompt='nipple, waistband, back view, monochrome, longbody, lowres, bad anatomy, bad hands, fused fingers, missing fingers, too many fingers, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic, extra hands and arms',
    num_inference_steps=8,
    guidance_scale=3,
    unet_batch_size=1,
    width=width,
    height=height,
    video_length=length,
    return_dict=False,
    context_frames=16,
    context_stride=1,
    context_overlap=16 // 4,
    context_schedule='composite',
    clip_skip=2,
    controlnet_type_map=controlnet_type_map,
    controlnet_image_map=controlnet_image_map,
    controlnet_ref_map=controlnet_ref_map,
    controlnet_no_shrink=controlnet_no_shrink,
    controlnet_max_samples_on_vram=model_config.controlnet_map.get("max_samples_on_vram", 999),
    controlnet_max_models_on_vram=model_config.controlnet_map.get("max_models_on_vram", 99),
    controlnet_is_loop=model_config.controlnet_map.get("is_loop", True),
    img2img_map=img2img_map,
    ip_adapter_config_map=ip_adapter_config_map,
    region_list=region_list,
    region_condi_list=region_condi_list,
    interpolation_factor=1,
    is_single_prompt_mode=model_config.is_single_prompt_mode,
    gradual_latent_map=model_config.gradual_latent_hires_fix_map,
    callback=None,
    callback_steps=None,
)

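# Release the ControlNet models, then convert the output tensor to uint8 HxWxC frames on
# the CPU and free the diffusion pipeline before loading the audio model.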
unload_controlnet_models(pipe=pipeline)
frames = output.permute(0, 2, 1, 3, 4).squeeze(0)
frames = frames.mul(255).add_(0.5).clamp_(0, 255).permute(0, 2, 3, 1).to("cpu", torch.uint8).numpy()
del pipeline
torch.cuda.empty_cache()
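# Generate a ~3 minute background track with AudioLDM2 (16 kHz mono), sampled for
# 20 steps with a DPM-Solver multistep scheduler.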
pipeline = diffusers.AudioLDM2Pipeline.from_pretrained('cvssp/audioldm2-music', torch_dtype=torch.float16).to('cuda')
pipeline.scheduler = diffusers.DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
music = pipeline(prompt='Light rhythm techno', negative_prompt='low quality, average quality', num_inference_steps=20, audio_length_in_s=180).audios[0]
del pipeline
torch.cuda.empty_cache()

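# Mux video and audio into an MP4 with PyAV: H.264 at 8 fps and 16 kHz AAC (AudioLDM2's
# native sample rate). Note the video stream is declared at 4x the sampling resolution,
# presumably to match frames upscaled by the gradual-latent hires fix.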
with av.open('video.mp4', mode='w') as writer:
    video = writer.add_stream('h264', rate=8)
    video.width = width * 4
    video.height = height * 4
    video.pix_fmt = 'yuv420p'
    audio = writer.add_stream('aac', rate=16000)
    for frame in frames: writer.mux(video.encode(av.VideoFrame.from_ndarray(frame)))
    writer.mux(video.encode())
    for _ in builtins.range(0, music.shape[0], audio.frame_size):
        frame = av.AudioFrame.from_ndarray(music[_:_ + audio.frame_size][None], format='fltp', layout='mono')
        frame.sample_rate = audio.sample_rate
        frame.pts = _
        writer.mux(audio.encode(frame))
    writer.mux(audio.encode())

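# Minimal Gradio interface: the standard greet demo, exposed as the 'predict' API endpoint.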
def greet(name, intensity):
    return "Hello, " + name + "!" * int(intensity)

demo = gradio.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
    api_name="predict"
)

demo.launch()