|
|
import gradio as gr |
|
|
import torch |
|
|
import torchaudio |
|
|
import logging |
|
|
import tempfile |
|
|
import os |
|
|
import sys |
|
|
from pathlib import Path |
|
|
import numpy as np |
|
|
from typing import Optional, Tuple |
|
|
import time |
|
|
import traceback |
|
|
|
|
|
|
|
|
# Directory containing this script; the bundled hf_AC checkout is expected
# to sit right next to it.
current_dir = Path(__file__).parent
hf_ac_path = current_dir.joinpath("hf_AC")

# Put the checkout at the front of the import path, but only when it exists.
if hf_ac_path.exists():
    sys.path.insert(0, str(hf_ac_path))
|
|
|
|
|
|
|
|
# Sample text prompts; the UI turns the first few into one-click buttons.
EXAMPLE_PROMPTS = [
    "Crackling fireplace with gentle flames",
    "Ocean waves crashing on rocky shore",
    "Forest ambience with bird songs",
    "Keyboard typing sounds",
    "Footsteps on wooden floor",
    "Rain on metal roof",
]
|
|
|
|
|
# Markdown shown in the "Detailed Information" accordion of the UI.
USAGE_TIPS = """
### 💡 Usage Tips

**Basic Settings:**
- **Video Quality**: Use clear, well-lit videos, recommended 1-15 seconds
- **Reference Audio**: Provide clear audio clips as timbre reference
- **CFG Strength**: Between 1-8, higher values follow description more closely

**Advanced Features:**
- **mask_away_clip**: Enable when video content differs significantly from desired audio
- **Fine-grained Control**: Use reference audio for precise timbre and style control
- **Zero-shot Generation**: Generate novel sound combinations without training

**Application Scenarios:**
- Film post-production audio
- Game sound effect creation
- Music composition assistance
- Sound design experimentation
"""
|
|
|
|
|
|
|
|
def check_dependencies(required_packages: Optional[list] = None) -> list:
    """Return the names of required packages that cannot be imported.

    Args:
        required_packages: Importable module names to probe. When omitted,
            the app's built-in dependency list is used.

    Returns:
        The subset of ``required_packages`` whose import raised
        ``ImportError``, in the order they were probed. An empty list means
        everything imported.
    """
    import importlib

    if required_packages is None:
        required_packages = [
            'torch', 'torchaudio', 'numpy', 'scipy', 'librosa',
            'torchdiffeq', 'einops', 'hydra', 'tensordict', 'av',
        ]

    missing_packages = []
    for package in required_packages:
        # Each module is actually imported (not just located) so that a
        # package which is present but fails to import is reported as well.
        try:
            importlib.import_module(package)
        except ImportError:
            missing_packages.append(package)
    return missing_packages
|
|
|
|
|
|
|
|
# Import the hf_AC pipeline. Any ImportError turns the app into a degraded
# mode where HF_AC_AVAILABLE is False and the model entry points refuse to run.
try:
    # Report (but do not abort on) packages that are not importable yet.
    missing_deps = check_dependencies()
    if missing_deps:
        print(f"Warning: Missing dependencies: {missing_deps}")
        print("Some dependencies may be installing in the background...")

    from hf_AC.mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video,
                                          setup_eval_logging)
    from hf_AC.mmaudio.model.flow_matching import FlowMatching
    from hf_AC.mmaudio.model.networks import MMAudio, get_my_mmaudio
    from hf_AC.mmaudio.model.utils.features_utils import FeaturesUtils
    from hf_AC.inf import Audio

    setup_eval_logging()
    log = logging.getLogger()
    HF_AC_AVAILABLE = True
    print("✅ hf_AC modules loaded successfully!")

except ImportError as e:
    print(f"Warning: hf_AC modules not available: {e}")
    print("This may be due to missing dependencies. Please wait for installation to complete.")
    # A logger is still needed: the rest of the module logs unconditionally.
    log = logging.getLogger()
    HF_AC_AVAILABLE = False
|
|
|
|
|
class AudioFoleyModel:
    """Holds the hf_AC network, flow-matching sampler and feature extractor.

    An instance starts empty; ``load_model`` populates ``model``, ``net``,
    ``fm`` and ``feature_utils``. ``generate_audio`` refuses to run until
    both ``net`` and ``feature_utils`` are set.
    """

    # Sample rate handed to the reference-audio loader.
    REFERENCE_SAMPLE_RATE = 44100
    # Value written to ``seq_cfg.audio_num_sample`` when no reference audio is used.
    DEFAULT_AUDIO_NUM_SAMPLE = 89088

    def __init__(self):
        # Prefer CUDA, then Apple MPS, and fall back to CPU.
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            # Some torch builds do not expose torch.backends.mps at all.
            mps_backend = getattr(torch.backends, 'mps', None)
            if mps_backend is not None and mps_backend.is_available():
                self.device = 'mps'

        self.dtype = torch.bfloat16
        self.model = None          # model configuration entry from all_model_cfg
        self.net = None            # network returned by get_my_mmaudio
        self.fm = None             # FlowMatching sampler
        self.feature_utils = None  # FeaturesUtils instance

    def _weights_file_exists(self) -> bool:
        """Return True when ``self.model.model_path`` points at an existing file."""
        weights_path = getattr(self.model, 'model_path', None)
        return bool(weights_path) and Path(weights_path).exists()

    def _download_main_weights(self) -> None:
        """Best-effort download of ``model.pth`` into ``./weights``.

        Failures are logged and swallowed so that ``load_model`` can report
        the missing weights itself.
        """
        try:
            from huggingface_hub import hf_hub_download
            log.info("Downloading main model weights from HuggingFace...")

            weights_dir = Path("weights")
            weights_dir.mkdir(exist_ok=True)

            model_file = hf_hub_download(
                repo_id="FF2416/AC-Foley",
                filename="model.pth",
                local_dir=str(weights_dir),
                local_dir_use_symlinks=False
            )
            self.model.model_path = Path(model_file)
            log.info(f"✅ Downloaded model weights to {model_file}")
        except Exception as e:
            log.warning(f"Could not download main model weights: {e}")
            log.info("Will proceed with available components only")

    def load_model(self, variant='large_44k', model_path=None):
        """Load the hf_AC model with progress updates.

        Progress and the final outcome are mirrored into the module-level
        ``model_loading_status`` string.

        Args:
            variant: Key into ``all_model_cfg`` selecting the configuration.
            model_path: Optional path to a weights file; when it exists it
                overrides the configured/downloaded weights path.

        Returns:
            A human-readable status string. Every failure is reported through
            this string; the method does not raise.
        """
        global model_loading_status

        try:
            if not HF_AC_AVAILABLE:
                return "❌ hf_AC modules not available. Please install the hf_AC package."

            if variant not in all_model_cfg:
                available_variants = list(all_model_cfg.keys()) if all_model_cfg else []
                return f"❌ Unknown model variant: {variant}. Available: {available_variants}"

            model_loading_status = "🔧 Initializing model configuration..."
            log.info(f"Loading model variant: {variant}")
            self.model = all_model_cfg[variant]

            model_loading_status = "📥 Downloading model components..."
            try:
                self.model.download_if_needed()
            except Exception as e:
                # Not fatal here: a missing component surfaces in a later step.
                log.warning(f"Could not download model components: {e}")

            model_loading_status = "📥 Downloading main model weights..."
            if not self._weights_file_exists():
                self._download_main_weights()

            # An explicit, existing model_path wins over anything found above.
            if model_path and os.path.exists(model_path):
                self.model.model_path = Path(model_path)
                log.info(f"Using custom model path: {model_path}")

            model_loading_status = "🔧 Loading neural network..."
            self.net = get_my_mmaudio(self.model.model_name).to(self.device, self.dtype).eval()

            model_loading_status = "⚙️ Loading model weights..."
            if self._weights_file_exists():
                try:
                    weights = torch.load(self.model.model_path, map_location=self.device, weights_only=True)
                    self.net.load_weights(weights['weights'])
                    log.info(f'✅ Loaded weights from {self.model.model_path}')
                except Exception as e:
                    log.error(f"Failed to load weights: {e}")
                    model_loading_status = f"❌ Failed to load model weights: {e}"
                    return model_loading_status
            else:
                # Stop here: fm and feature_utils stay None, so generate_audio
                # will keep reporting that the model is not loaded.
                log.warning('⚠️ No model weights found, using default initialization')
                model_loading_status = "⚠️ 模型组件已加载，但主权重不可用。某些功能可能受限。"
                return model_loading_status

            model_loading_status = "🌊 Initializing flow matching..."
            self.fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=25)

            model_loading_status = "🔧 Initializing feature utilities..."
            try:
                self.feature_utils = FeaturesUtils(
                    tod_vae_ckpt=self.model.vae_path,
                    synchformer_ckpt=self.model.synchformer_ckpt,
                    enable_conditions=True,
                    mode=self.model.mode,
                    bigvgan_vocoder_ckpt=self.model.bigvgan_16k_path,
                    need_vae_encoder=True
                )
                self.feature_utils = self.feature_utils.to(self.device, self.dtype).eval()
            except Exception as e:
                log.error(f"Failed to initialize feature utils: {e}")
                model_loading_status = f"❌ Failed to initialize feature utilities: {e}"
                return model_loading_status

            model_loading_status = "✅ Model loaded successfully! Ready to generate audio."
            return model_loading_status

        except Exception as e:
            error_msg = f"❌ Model loading error: {str(e)}"
            log.error(error_msg)
            model_loading_status = error_msg
            return error_msg

    def _load_reference_audio(self, reference_audio):
        """Load the optional reference clip.

        Returns the first item produced by the ``Audio`` loader, or ``None``
        when no path was given, the path does not exist, the loader returned
        nothing, or loading failed (failure is logged, not raised).
        """
        if not reference_audio or not os.path.exists(reference_audio):
            return None
        try:
            audio_processor = Audio([reference_audio], self.REFERENCE_SAMPLE_RATE)
            audio_list = audio_processor.load_audio()
            if not audio_list:
                return None
            reference_audio_tensor = audio_list[0]
            log.info(f'🎵 Reference audio loaded: {reference_audio_tensor.shape}')
            return reference_audio_tensor
        except Exception as e:
            log.warning(f"Failed to load reference audio: {e}")
            return None

    def _save_waveform(self, audio, path) -> Optional[str]:
        """Write ``audio`` to ``path`` as WAV.

        Tries ``torchaudio.save`` first, then ``soundfile``, then
        ``scipy.io.wavfile`` if ``soundfile`` is not installed.

        Returns:
            ``None`` on success, or an error message when the scipy fallback
            failed. Errors raised by ``soundfile.write`` itself propagate to
            the caller.
        """
        sample_rate = self.model.seq_cfg.sampling_rate
        try:
            torchaudio.save(path, audio, sample_rate)
            return None
        except Exception as e:
            log.warning(f"torchaudio.save failed: {e}, trying alternative method...")

        try:
            import soundfile as sf
            sf.write(path, audio.numpy().T, sample_rate)
            return None
        except ImportError:
            pass  # soundfile not installed; fall through to scipy

        try:
            from scipy.io.wavfile import write
            # scipy writes integer PCM: scale to the int16 range and clamp.
            audio_int16 = (audio * 32767).clamp(-32768, 32767).to(torch.int16)
            write(path, sample_rate, audio_int16.numpy().T)
            return None
        except Exception as e2:
            return f"❌ Audio saving failed: {str(e2)}"

    def _export_audio(self, audios, duration_sec, generation_time) -> Tuple[Optional[str], str]:
        """Save the first generated waveform to a unique temp file and build the status text."""
        output_path = None
        saved = False
        try:
            audio = audios.float().cpu()[0]

            # mkstemp yields a unique file, so requests arriving within the
            # same second cannot overwrite each other's output.
            fd, output_path = tempfile.mkstemp(prefix="generated_audio_", suffix=".wav")
            os.close(fd)

            save_error = self._save_waveform(audio, output_path)
            if save_error is not None:
                return None, save_error

            # The file already exists (mkstemp created it), so also require content.
            if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
                return None, "❌ Failed to save audio file"

            file_size = os.path.getsize(output_path) / 1024
            success_msg = "✅ Audio generated successfully!\n"
            success_msg += f"📊 Duration: {duration_sec:.2f}s | "
            success_msg += f"Size: {file_size:.1f}KB | "
            success_msg += f"Generation time: {generation_time:.2f}s"

            saved = True
            return output_path, success_msg

        except Exception as e:
            return None, f"❌ Failed to save audio: {str(e)}"
        finally:
            # Do not leave an empty or partial file behind on failure.
            if not saved and output_path and os.path.exists(output_path):
                try:
                    os.remove(output_path)
                except OSError:
                    pass

    def generate_audio(self, video_file, prompt: str, negative_prompt: str = "",
                       duration: float = 8.0, cfg_strength: float = 4.5,
                       seed: int = 42, reference_audio: Optional[str] = None,
                       mask_away_clip: bool = False) -> Tuple[Optional[str], str]:
        """Generate audio from video and text prompt.

        Args:
            video_file: Path to the input video.
            prompt: Text description; may be empty.
            negative_prompt: Text to steer away from; ignored when blank.
            duration: Duration passed to ``load_video``.
            cfg_strength: Guidance strength forwarded to ``generate``.
            seed: Seed for the torch random generator.
            reference_audio: Optional path to a reference audio clip.
            mask_away_clip: When True, the clip frames are replaced by ``None``.

        Returns:
            ``(path_to_wav, status_message)`` on success, or
            ``(None, error_message)`` on any failure. The method does not raise.
        """
        try:
            if not HF_AC_AVAILABLE:
                return None, "❌ hf_AC modules not available."

            if self.net is None or self.feature_utils is None:
                return None, "❌ Model not loaded. Please load the model first."

            if video_file is None:
                return None, "❌ Please upload a video file."

            # Tolerate None from callers; treat it like an empty prompt.
            prompt = prompt or ""
            negative_prompt = negative_prompt or ""

            log.info(f'🎬 Processing video: {video_file}')
            if prompt.strip():
                log.info(f'📝 Prompt: "{prompt}"')
            else:
                log.info('📝 No prompt provided - will generate based on video content')
            if reference_audio:
                log.info(f'🎵 Reference audio: {reference_audio}')

            reference_audio_tensor = self._load_reference_audio(reference_audio)

            # --- load video ---
            try:
                video_path = Path(video_file)
                if not video_path.exists():
                    return None, f"❌ Video file not found: {video_file}"

                video_info = load_video(video_path, duration)
                clip_frames = video_info.clip_frames
                sync_frames = video_info.sync_frames
                duration_sec = video_info.duration_sec

                log.info(f'📹 Video loaded: {duration_sec:.2f}s duration')
            except Exception as e:
                return None, f"❌ Failed to load video: {str(e)}"

            if mask_away_clip:
                clip_frames = None
                log.info("🎭 Using mask_away_clip: ignoring visual features")
            elif clip_frames is not None:
                clip_frames = clip_frames.unsqueeze(0)
            sync_frames = sync_frames.unsqueeze(0)

            # --- configure sequence lengths for this clip ---
            try:
                seq_cfg = self.model.seq_cfg
                seq_cfg.duration = duration_sec
                if reference_audio_tensor is not None:
                    seq_cfg.audio_num_sample = reference_audio_tensor.shape[0]
                else:
                    seq_cfg.audio_num_sample = self.DEFAULT_AUDIO_NUM_SAMPLE

                self.net.update_seq_lengths(
                    seq_cfg.latent_seq_len,
                    seq_cfg.clip_seq_len,
                    seq_cfg.sync_seq_len,
                    seq_cfg.audio_seq_len
                )
            except Exception as e:
                return None, f"❌ Failed to configure model: {str(e)}"

            # --- run generation ---
            try:
                log.info('🎵 Generating audio...')
                start_time = time.time()

                with torch.inference_mode():
                    audios = generate(
                        clip_frames,
                        sync_frames,
                        [prompt],
                        reference_audio_tensor,
                        negative_text=[negative_prompt] if negative_prompt.strip() else None,
                        feature_utils=self.feature_utils,
                        net=self.net,
                        fm=self.fm,
                        rng=torch.Generator(device=self.device).manual_seed(int(seed)),
                        cfg_strength=cfg_strength
                    )

                generation_time = time.time() - start_time
                log.info(f'⏱️ Generation completed in {generation_time:.2f}s')
            except Exception as e:
                return None, f"❌ Audio generation failed: {str(e)}"

            return self._export_audio(audios, duration_sec, generation_time)

        except Exception as e:
            error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()}"
            log.error(error_msg)
            return None, error_msg
|
|
|
|
|
|
|
|
# Module-wide state shared by the model loader and the UI callbacks.
audio_model: Optional["AudioFoleyModel"] = None  # created by initialize_model()
model_loading_status: str = "Not initialized"    # last human-readable loader status
|
|
|
|
|
def initialize_model():
    """Create and load the shared model instance the first time this is called.

    The first call builds ``audio_model`` and runs ``load_model``. Later calls
    do not reload: they return "Model already loaded" only when both ``net``
    and ``feature_utils`` are set, and otherwise return the status recorded by
    the earlier attempt so a failed load is never reported as success.

    Returns:
        A human-readable status string (also stored in ``model_loading_status``
        when a load is attempted).
    """
    global audio_model, model_loading_status

    if audio_model is not None:
        fully_loaded = audio_model.net is not None and audio_model.feature_utils is not None
        if fully_loaded:
            return "✅ Model already loaded"
        # An earlier attempt did not finish; surface what it reported.
        return model_loading_status

    try:
        model_loading_status = "Initializing model..."
        audio_model = AudioFoleyModel()
        load_result = audio_model.load_model()
        model_loading_status = load_result
        return load_result
    except Exception as e:
        model_loading_status = f"❌ Model initialization failed: {str(e)}"
        return model_loading_status
|
|
|
|
|
def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength, mask_away_clip):
    """Gradio callback: run generation with the shared model.

    Args:
        video_file: Path of the uploaded video.
        audio_file: Path of the optional reference audio.
        prompt: Text prompt from the UI.
        duration: Value of the duration slider.
        cfg_strength: Value of the CFG strength slider.
        mask_away_clip: Value of the "ignore visual features" checkbox.

    Returns:
        ``(audio_path, message)`` as produced by
        ``AudioFoleyModel.generate_audio``; ``audio_path`` is ``None`` on failure.
    """
    if audio_model is None or audio_model.net is None:
        return None, "❌ Model not loaded. Please wait for initialization to complete or refresh the page."

    # The UI exposes neither of these, so fixed values are used.
    seed = 42
    negative_prompt = ""

    audio_path, message = audio_model.generate_audio(
        video_file,
        prompt,
        negative_prompt=negative_prompt,
        duration=duration,
        cfg_strength=cfg_strength,
        seed=seed,
        reference_audio=audio_file,
        mask_away_clip=mask_away_clip,
    )
    return audio_path, message
|
|
|
|
|
def get_model_status():
    """Return the most recent model loading status string."""
    # Read-only access to the module-level variable; no `global` needed.
    return model_loading_status
|
|
|
|
|
|
|
|
# Gradio UI definition. NOTE(review): the nesting below was reconstructed from
# component semantics because the original indentation was lost; confirm the
# layout against the running app.
with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎵 AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis

    ## 📋 About
    AC-Foley is a reference-audio-guided video-to-audio synthesis model that enables precise fine-grained sound synthesis. Unlike traditional text-dependent methods, AC-Foley directly leverages reference audio to achieve precise control over generated sounds, addressing the ambiguity of textual descriptions in micro-acoustic features.

    ## ✨ Key Features
    - **Fine-grained Sound Synthesis**: Generate footsteps with distinct timbres (wood, marble, gravel, etc.)
    - **Timbre Transfer**: Transform violin melodies into bright, piercing suona tones
    - **Zero-shot Generation**: Create unique sound effects without specialized training
    - **Visual-Audio Alignment**: Automatically generate matching audio from video content

    *Based on paper: [AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer](https://openreview.net/forum?id=URPXhnWdBF)*
    """)

    # Read-only status line, refreshed on demand and on page load.
    model_status = gr.Textbox(
        label="Model Status",
        value=model_loading_status,
        interactive=False
    )

    refresh_status_btn = gr.Button("🔄 Refresh Status", size="sm")
    refresh_status_btn.click(
        fn=get_model_status,
        outputs=model_status
    )

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### 📹 Required Input")
            video_input = gr.Video(
                label="Video File - Upload video for audio generation",
                format="mp4"
            )

            gr.Markdown("### 🎛️ Optional Inputs")
            audio_input = gr.Audio(
                label="Reference Audio - Provide timbre, style, rhythm reference (fine-grained control)",
                type="filepath",
                sources=["upload"],
                format="wav"
            )

            prompt_input = gr.Textbox(
                label="Text Prompt - Describe desired audio type (leave empty for auto-generation from video)",
                placeholder="e.g., 'footsteps', 'metal clang', 'bird chirping'",
                lines=2
            )

            with gr.Accordion("🔧 Advanced Options", open=False):
                with gr.Row():
                    duration_slider = gr.Slider(
                        minimum=1.0,
                        maximum=15.0,
                        value=8.0,
                        step=0.5,
                        label="Duration (seconds)"
                    )

                    cfg_strength_slider = gr.Slider(
                        minimum=1.0,
                        maximum=8.0,
                        value=4.5,
                        step=0.1,
                        label="CFG Strength"
                    )

                mask_away_clip = gr.Checkbox(
                    label="Ignore Visual Features (mask_away_clip) - Enable when video and reference audio differ significantly",
                    value=False
                )

        with gr.Column(scale=1):
            gr.Markdown("### 📖 Usage Guide")
            gr.Markdown("""
            **Four Generation Modes:**

            1️⃣ **Video Only**: Upload video only
            - Auto-generate audio from visual content

            2️⃣ **Video + Reference Audio**: Upload video + audio
            - Use reference audio's timbre and style
            - Achieve fine-grained timbre control

            3️⃣ **Video + Text**: Upload video + text
            - Generate specified audio type from text description

            4️⃣ **Complete Mode**: Video + Audio + Text
            - Most precise control method
            - Combine visual, timbral, and semantic guidance
            """)

            gr.Markdown("### 🎯 Example Prompts")
            for prompt in EXAMPLE_PROMPTS[:4]:
                btn = gr.Button(prompt, size="sm")
                # Bind the current prompt as a default argument so each button
                # fills in its own text rather than the last loop value.
                btn.click(
                    fn=lambda p=prompt: p,
                    outputs=prompt_input
                )

    generate_btn = gr.Button("🎵 Generate Audio", variant="primary", size="lg")

    gr.Markdown("### 🎧 Generated Results")
    audio_output = gr.Audio(
        label="Generated Audio",
        type="filepath",
        format="wav",
        autoplay=False
    )

    generation_status = gr.Textbox(
        label="Generation Status",
        interactive=False,
        lines=2
    )

    # Input order must match generate_audio_interface's parameter order.
    generate_btn.click(
        fn=generate_audio_interface,
        inputs=[
            video_input, audio_input, prompt_input,
            duration_slider, cfg_strength_slider, mask_away_clip
        ],
        outputs=[audio_output, generation_status]
    )

    with gr.Accordion("💡 Detailed Information", open=False):
        gr.Markdown(USAGE_TIPS)

        gr.Markdown("""
        ### 🎬 Application Examples

        **Fine-grained Sound Synthesis:**
        - "Footsteps on wooden floor" + reference audio → Specific timbre footsteps
        - "Metal collision" + different reference audio → Iron vs. copper distinction

        **Timbre Transfer:**
        - Piano melody video + violin reference audio → Violin playing same melody
        - Human humming + instrument reference → Instrumental version

        **Creative Sound Effects:**
        - Sci-fi scene video + real sound reference → Unique sci-fi effects
        - Animation video + real sound effects → Cartoon-reality hybrid effects

        ### 📊 Technical Details
        - Model based on diffusion models and audio conditioning mechanisms
        - Supports 44.1kHz high-quality audio generation
        - Achieves visual-audio-text multimodal alignment
        """)

    # Runs on every page load and writes its result into the status box.
    demo.load(
        fn=initialize_model,
        outputs=[model_status]
    )
|
|
|
|
|
|
|
|
# Load the model eagerly at import time, but only when the hf_AC imports
# succeeded; otherwise the UI still starts and reports the failure.
if HF_AC_AVAILABLE:
    print("🚀 Starting model initialization...")
    initialize_model()
    print(f"📊 Model status: {model_loading_status}")

if __name__ == "__main__":
    demo.launch()