Convert interface to English and optimize audio performance
- Convert all UI text from Chinese to English
- Update model loading status messages to English
- Update error messages and user feedback to English
- Optimize audio component performance:
  - Add format='wav' to audio components
  - Set autoplay=False for better loading
  - Improve audio file handling
- Maintain all functionality while improving international accessibility
app.py
CHANGED
|
@@ -28,23 +28,23 @@ EXAMPLE_PROMPTS = [
|
|
| 28 |
]
|
| 29 |
|
| 30 |
USAGE_TIPS = """
|
| 31 |
-
### 💡
|
| 32 |
|
| 33 |
-
|
| 34 |
-
-
|
| 35 |
-
-
|
| 36 |
-
- **CFG
|
| 37 |
|
| 38 |
-
|
| 39 |
-
- **mask_away_clip**:
|
| 40 |
-
-
|
| 41 |
-
-
|
| 42 |
|
| 43 |
-
|
| 44 |
-
-
|
| 45 |
-
-
|
| 46 |
-
-
|
| 47 |
-
-
|
| 48 |
"""
|
| 49 |
|
| 50 |
# Check and install missing dependencies
|
|
@@ -123,19 +123,19 @@ class AudioFoleyModel:
|
|
| 123 |
return f"❌ Unknown model variant: {variant}. Available: {available_variants}"
|
| 124 |
|
| 125 |
# Step 1: Initialize model config
|
| 126 |
-
model_loading_status = "🔧
|
| 127 |
log.info(f"Loading model variant: {variant}")
|
| 128 |
self.model: ModelConfig = all_model_cfg[variant]
|
| 129 |
|
| 130 |
# Step 2: Download model components
|
| 131 |
-
model_loading_status = "📥
|
| 132 |
try:
|
| 133 |
self.model.download_if_needed()
|
| 134 |
except Exception as e:
|
| 135 |
log.warning(f"Could not download model components: {e}")
|
| 136 |
|
| 137 |
# Step 3: Download main model weights
|
| 138 |
-
model_loading_status = "📥
|
| 139 |
if not hasattr(self.model, 'model_path') or not self.model.model_path or not Path(self.model.model_path).exists():
|
| 140 |
try:
|
| 141 |
from huggingface_hub import hf_hub_download
|
|
@@ -165,11 +165,11 @@ class AudioFoleyModel:
|
|
| 165 |
log.info(f"Using custom model path: {model_path}")
|
| 166 |
|
| 167 |
# Step 4: Load neural network
|
| 168 |
-
model_loading_status = "🧠
|
| 169 |
self.net: MMAudio = get_my_mmaudio(self.model.model_name).to(self.device, self.dtype).eval()
|
| 170 |
|
| 171 |
# Step 5: Load weights
|
| 172 |
-
model_loading_status = "⚖️
|
| 173 |
if hasattr(self.model, 'model_path') and self.model.model_path and Path(self.model.model_path).exists():
|
| 174 |
try:
|
| 175 |
weights = torch.load(self.model.model_path, map_location=self.device, weights_only=True)
|
|
@@ -185,11 +185,11 @@ class AudioFoleyModel:
|
|
| 185 |
return model_loading_status
|
| 186 |
|
| 187 |
# Step 6: Initialize flow matching
|
| 188 |
-
model_loading_status = "🌊
|
| 189 |
self.fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=25)
|
| 190 |
|
| 191 |
# Step 7: Initialize feature utils
|
| 192 |
-
model_loading_status = "🔧
|
| 193 |
try:
|
| 194 |
self.feature_utils = FeaturesUtils(
|
| 195 |
tod_vae_ckpt=self.model.vae_path,
|
|
@@ -206,11 +206,11 @@ class AudioFoleyModel:
|
|
| 206 |
return model_loading_status
|
| 207 |
|
| 208 |
# Step 8: Complete
|
| 209 |
-
model_loading_status = "✅
|
| 210 |
return model_loading_status
|
| 211 |
|
| 212 |
except Exception as e:
|
| 213 |
-
error_msg = f"❌
|
| 214 |
log.error(error_msg)
|
| 215 |
model_loading_status = error_msg
|
| 216 |
return error_msg
|
|
@@ -228,7 +228,7 @@ class AudioFoleyModel:
|
|
| 228 |
return None, "❌ Model not loaded. Please load the model first."
|
| 229 |
|
| 230 |
if video_file is None:
|
| 231 |
-
return None, "❌
|
| 232 |
|
| 233 |
log.info(f'🎬 Processing video: {video_file}')
|
| 234 |
if prompt.strip():
|
|
@@ -347,7 +347,7 @@ class AudioFoleyModel:
|
|
| 347 |
audio_int16 = (audio * 32767).clamp(-32768, 32767).to(torch.int16)
|
| 348 |
write(permanent_path, self.model.seq_cfg.sampling_rate, audio_int16.numpy().T)
|
| 349 |
except Exception as e2:
|
| 350 |
-
return None, f"❌
|
| 351 |
|
| 352 |
# Verify file was created
|
| 353 |
if not os.path.exists(permanent_path):
|
|
@@ -357,7 +357,7 @@ class AudioFoleyModel:
|
|
| 357 |
success_msg = f"✅ Audio generated successfully!\n"
|
| 358 |
success_msg += f"📊 Duration: {duration_sec:.2f}s | "
|
| 359 |
success_msg += f"Size: {file_size:.1f}KB | "
|
| 360 |
-
success_msg += f"
|
| 361 |
|
| 362 |
return permanent_path, success_msg
|
| 363 |
|
|
@@ -371,7 +371,7 @@ class AudioFoleyModel:
|
|
| 371 |
|
| 372 |
# Global model instance - initialized once
|
| 373 |
audio_model = None
|
| 374 |
-
model_loading_status = "
|
| 375 |
|
| 376 |
def initialize_model():
|
| 377 |
"""Initialize model once at startup"""
|
|
@@ -379,16 +379,16 @@ def initialize_model():
|
|
| 379 |
|
| 380 |
if audio_model is None:
|
| 381 |
try:
|
| 382 |
-
model_loading_status = "
|
| 383 |
audio_model = AudioFoleyModel()
|
| 384 |
load_result = audio_model.load_model()
|
| 385 |
model_loading_status = load_result
|
| 386 |
return load_result
|
| 387 |
except Exception as e:
|
| 388 |
-
model_loading_status = f"❌
|
| 389 |
return model_loading_status
|
| 390 |
else:
|
| 391 |
-
return "✅
|
| 392 |
|
| 393 |
def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength, mask_away_clip):
|
| 394 |
"""Interface function for generating audio"""
|
|
@@ -396,7 +396,7 @@ def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_stren
|
|
| 396 |
|
| 397 |
# Check if model is loaded
|
| 398 |
if audio_model is None or audio_model.net is None:
|
| 399 |
-
return None, "❌
|
| 400 |
|
| 401 |
# Use fixed seed for consistency in HF Space
|
| 402 |
seed = 42
|
|
@@ -417,27 +417,27 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
|
|
| 417 |
gr.Markdown("""
|
| 418 |
# 🎵 AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis
|
| 419 |
|
| 420 |
-
## 📖
|
| 421 |
-
AC-Foley
|
| 422 |
|
| 423 |
-
## ✨
|
| 424 |
-
-
|
| 425 |
-
-
|
| 426 |
-
-
|
| 427 |
-
-
|
| 428 |
|
| 429 |
-
|
| 430 |
""")
|
| 431 |
|
| 432 |
# Model status display - will be updated automatically
|
| 433 |
model_status = gr.Textbox(
|
| 434 |
-
label="
|
| 435 |
value=model_loading_status,
|
| 436 |
interactive=False
|
| 437 |
)
|
| 438 |
|
| 439 |
# Add a refresh button for status
|
| 440 |
-
refresh_status_btn = gr.Button("🔄
|
| 441 |
refresh_status_btn.click(
|
| 442 |
fn=get_model_status,
|
| 443 |
outputs=model_status
|
|
@@ -445,36 +445,37 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
|
|
| 445 |
|
| 446 |
with gr.Row():
|
| 447 |
with gr.Column(scale=2):
|
| 448 |
-
#
|
| 449 |
-
gr.Markdown("### 📹
|
| 450 |
video_input = gr.Video(
|
| 451 |
-
label="
|
| 452 |
format="mp4"
|
| 453 |
)
|
| 454 |
|
| 455 |
-
#
|
| 456 |
-
gr.Markdown("### 🎛️
|
| 457 |
audio_input = gr.Audio(
|
| 458 |
-
label="
|
| 459 |
type="filepath",
|
| 460 |
-
sources=["upload"]
|
|
|
|
| 461 |
)
|
| 462 |
|
| 463 |
prompt_input = gr.Textbox(
|
| 464 |
-
label="
|
| 465 |
-
placeholder="
|
| 466 |
lines=2
|
| 467 |
)
|
| 468 |
|
| 469 |
-
#
|
| 470 |
-
with gr.Accordion("🔧
|
| 471 |
with gr.Row():
|
| 472 |
duration_slider = gr.Slider(
|
| 473 |
minimum=1.0,
|
| 474 |
maximum=15.0,
|
| 475 |
value=8.0,
|
| 476 |
step=0.5,
|
| 477 |
-
label="
|
| 478 |
)
|
| 479 |
|
| 480 |
cfg_strength_slider = gr.Slider(
|
|
@@ -482,37 +483,37 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
|
|
| 482 |
maximum=8.0,
|
| 483 |
value=4.5,
|
| 484 |
step=0.1,
|
| 485 |
-
label="CFG
|
| 486 |
)
|
| 487 |
|
| 488 |
mask_away_clip = gr.Checkbox(
|
| 489 |
-
label="
|
| 490 |
value=False
|
| 491 |
)
|
| 492 |
|
| 493 |
with gr.Column(scale=1):
|
| 494 |
-
#
|
| 495 |
-
gr.Markdown("### 📋
|
| 496 |
gr.Markdown("""
|
| 497 |
-
|
| 498 |
|
| 499 |
-
1️⃣
|
| 500 |
-
-
|
| 501 |
|
| 502 |
-
2️⃣
|
| 503 |
-
-
|
| 504 |
-
-
|
| 505 |
|
| 506 |
-
3️⃣
|
| 507 |
-
-
|
| 508 |
|
| 509 |
-
4️⃣
|
| 510 |
-
-
|
| 511 |
-
-
|
| 512 |
""")
|
| 513 |
|
| 514 |
-
#
|
| 515 |
-
gr.Markdown("### 🎯
|
| 516 |
example_buttons = []
|
| 517 |
for prompt in EXAMPLE_PROMPTS[:4]:
|
| 518 |
btn = gr.Button(prompt, size="sm")
|
|
@@ -522,18 +523,20 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
|
|
| 522 |
outputs=prompt_input
|
| 523 |
)
|
| 524 |
|
| 525 |
-
#
|
| 526 |
-
generate_btn = gr.Button("🎵
|
| 527 |
|
| 528 |
-
#
|
| 529 |
-
gr.Markdown("### 🎧
|
| 530 |
audio_output = gr.Audio(
|
| 531 |
-
label="
|
| 532 |
-
type="filepath"
|
|
|
|
|
|
|
| 533 |
)
|
| 534 |
|
| 535 |
generation_status = gr.Textbox(
|
| 536 |
-
label="
|
| 537 |
interactive=False,
|
| 538 |
lines=2
|
| 539 |
)
|
|
@@ -548,28 +551,28 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
|
|
| 548 |
outputs=[audio_output, generation_status]
|
| 549 |
)
|
| 550 |
|
| 551 |
-
with gr.Accordion("💡
|
| 552 |
gr.Markdown(USAGE_TIPS)
|
| 553 |
|
| 554 |
gr.Markdown("""
|
| 555 |
-
### 🎬
|
| 556 |
|
| 557 |
-
|
| 558 |
-
- "
|
| 559 |
-
- "
|
| 560 |
|
| 561 |
-
|
| 562 |
-
-
|
| 563 |
-
-
|
| 564 |
|
| 565 |
-
|
| 566 |
-
-
|
| 567 |
-
-
|
| 568 |
|
| 569 |
-
### 📚
|
| 570 |
-
-
|
| 571 |
-
-
|
| 572 |
-
-
|
| 573 |
""")
|
| 574 |
|
| 575 |
# Auto-initialize model on startup
|
|
|
|
| 28 |
]
|
| 29 |
|
| 30 |
USAGE_TIPS = """
|
| 31 |
+
### 💡 Usage Tips
|
| 32 |
|
| 33 |
+
**Basic Settings:**
|
| 34 |
+
- **Video Quality**: Use clear, well-lit videos, recommended 1-15 seconds
|
| 35 |
+
- **Reference Audio**: Provide clear audio clips as timbre reference
|
| 36 |
+
- **CFG Strength**: Between 1-8, higher values follow description more closely
|
| 37 |
|
| 38 |
+
**Advanced Features:**
|
| 39 |
+
- **mask_away_clip**: Enable when video content differs significantly from desired audio
|
| 40 |
+
- **Fine-grained Control**: Use reference audio for precise timbre and style control
|
| 41 |
+
- **Zero-shot Generation**: Generate novel sound combinations without training
|
| 42 |
|
| 43 |
+
**Application Scenarios:**
|
| 44 |
+
- Film post-production audio
|
| 45 |
+
- Game sound effect creation
|
| 46 |
+
- Music composition assistance
|
| 47 |
+
- Sound design experimentation
|
| 48 |
"""
|
| 49 |
|
| 50 |
# Check and install missing dependencies
|
|
|
|
| 123 |
return f"❌ Unknown model variant: {variant}. Available: {available_variants}"
|
| 124 |
|
| 125 |
# Step 1: Initialize model config
|
| 126 |
+
model_loading_status = "🔧 Initializing model configuration..."
|
| 127 |
log.info(f"Loading model variant: {variant}")
|
| 128 |
self.model: ModelConfig = all_model_cfg[variant]
|
| 129 |
|
| 130 |
# Step 2: Download model components
|
| 131 |
+
model_loading_status = "📥 Downloading model components..."
|
| 132 |
try:
|
| 133 |
self.model.download_if_needed()
|
| 134 |
except Exception as e:
|
| 135 |
log.warning(f"Could not download model components: {e}")
|
| 136 |
|
| 137 |
# Step 3: Download main model weights
|
| 138 |
+
model_loading_status = "📥 Downloading main model weights..."
|
| 139 |
if not hasattr(self.model, 'model_path') or not self.model.model_path or not Path(self.model.model_path).exists():
|
| 140 |
try:
|
| 141 |
from huggingface_hub import hf_hub_download
|
|
|
|
| 165 |
log.info(f"Using custom model path: {model_path}")
|
| 166 |
|
| 167 |
# Step 4: Load neural network
|
| 168 |
+
model_loading_status = "🧠 Loading neural network..."
|
| 169 |
self.net: MMAudio = get_my_mmaudio(self.model.model_name).to(self.device, self.dtype).eval()
|
| 170 |
|
| 171 |
# Step 5: Load weights
|
| 172 |
+
model_loading_status = "⚖️ Loading model weights..."
|
| 173 |
if hasattr(self.model, 'model_path') and self.model.model_path and Path(self.model.model_path).exists():
|
| 174 |
try:
|
| 175 |
weights = torch.load(self.model.model_path, map_location=self.device, weights_only=True)
|
|
|
|
| 185 |
return model_loading_status
|
| 186 |
|
| 187 |
# Step 6: Initialize flow matching
|
| 188 |
+
model_loading_status = "🌊 Initializing flow matching..."
|
| 189 |
self.fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=25)
|
| 190 |
|
| 191 |
# Step 7: Initialize feature utils
|
| 192 |
+
model_loading_status = "🔧 Initializing feature utilities..."
|
| 193 |
try:
|
| 194 |
self.feature_utils = FeaturesUtils(
|
| 195 |
tod_vae_ckpt=self.model.vae_path,
|
|
|
|
| 206 |
return model_loading_status
|
| 207 |
|
| 208 |
# Step 8: Complete
|
| 209 |
+
model_loading_status = "✅ Model loaded successfully! Ready to generate audio."
|
| 210 |
return model_loading_status
|
| 211 |
|
| 212 |
except Exception as e:
|
| 213 |
+
error_msg = f"❌ Model loading error: {str(e)}"
|
| 214 |
log.error(error_msg)
|
| 215 |
model_loading_status = error_msg
|
| 216 |
return error_msg
|
|
|
|
| 228 |
return None, "❌ Model not loaded. Please load the model first."
|
| 229 |
|
| 230 |
if video_file is None:
|
| 231 |
+
return None, "❌ Please upload a video file."
|
| 232 |
|
| 233 |
log.info(f'🎬 Processing video: {video_file}')
|
| 234 |
if prompt.strip():
|
|
|
|
| 347 |
audio_int16 = (audio * 32767).clamp(-32768, 32767).to(torch.int16)
|
| 348 |
write(permanent_path, self.model.seq_cfg.sampling_rate, audio_int16.numpy().T)
|
| 349 |
except Exception as e2:
|
| 350 |
+
return None, f"❌ Audio saving failed: {str(e2)}"
|
| 351 |
|
| 352 |
# Verify file was created
|
| 353 |
if not os.path.exists(permanent_path):
|
|
|
|
| 357 |
success_msg = f"✅ Audio generated successfully!\n"
|
| 358 |
success_msg += f"📊 Duration: {duration_sec:.2f}s | "
|
| 359 |
success_msg += f"Size: {file_size:.1f}KB | "
|
| 360 |
+
success_msg += f"Generation time: {generation_time:.2f}s"
|
| 361 |
|
| 362 |
return permanent_path, success_msg
|
| 363 |
|
|
|
|
| 371 |
|
| 372 |
# Global model instance - initialized once
|
| 373 |
audio_model = None
|
| 374 |
+
model_loading_status = "Not initialized"
|
| 375 |
|
| 376 |
def initialize_model():
|
| 377 |
"""Initialize model once at startup"""
|
|
|
|
| 379 |
|
| 380 |
if audio_model is None:
|
| 381 |
try:
|
| 382 |
+
model_loading_status = "Initializing model..."
|
| 383 |
audio_model = AudioFoleyModel()
|
| 384 |
load_result = audio_model.load_model()
|
| 385 |
model_loading_status = load_result
|
| 386 |
return load_result
|
| 387 |
except Exception as e:
|
| 388 |
+
model_loading_status = f"❌ Model initialization failed: {str(e)}"
|
| 389 |
return model_loading_status
|
| 390 |
else:
|
| 391 |
+
return "✅ Model already loaded"
|
| 392 |
|
| 393 |
def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength, mask_away_clip):
|
| 394 |
"""Interface function for generating audio"""
|
|
|
|
| 396 |
|
| 397 |
# Check if model is loaded
|
| 398 |
if audio_model is None or audio_model.net is None:
|
| 399 |
+
return None, "❌ Model not loaded. Please wait for initialization to complete or refresh the page."
|
| 400 |
|
| 401 |
# Use fixed seed for consistency in HF Space
|
| 402 |
seed = 42
|
|
|
|
| 417 |
gr.Markdown("""
|
| 418 |
# 🎵 AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis
|
| 419 |
|
| 420 |
+
## 📖 About
|
| 421 |
+
AC-Foley is a reference-audio-guided video-to-audio synthesis model that enables precise fine-grained sound synthesis. Unlike traditional text-dependent methods, AC-Foley directly leverages reference audio to achieve precise control over generated sounds, addressing the ambiguity of textual descriptions in micro-acoustic features.
|
| 422 |
|
| 423 |
+
## ✨ Key Features
|
| 424 |
+
- **Fine-grained Sound Synthesis**: Generate footsteps with distinct timbres (wood, marble, gravel, etc.)
|
| 425 |
+
- **Timbre Transfer**: Transform violin melodies into bright, piercing suona tones
|
| 426 |
+
- **Zero-shot Generation**: Create unique sound effects without specialized training
|
| 427 |
+
- **Visual-Audio Alignment**: Automatically generate matching audio from video content
|
| 428 |
|
| 429 |
+
*Based on paper: [AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer](https://openreview.net/forum?id=URPXhnWdBF)*
|
| 430 |
""")
|
| 431 |
|
| 432 |
# Model status display - will be updated automatically
|
| 433 |
model_status = gr.Textbox(
|
| 434 |
+
label="Model Status",
|
| 435 |
value=model_loading_status,
|
| 436 |
interactive=False
|
| 437 |
)
|
| 438 |
|
| 439 |
# Add a refresh button for status
|
| 440 |
+
refresh_status_btn = gr.Button("🔄 Refresh Status", size="sm")
|
| 441 |
refresh_status_btn.click(
|
| 442 |
fn=get_model_status,
|
| 443 |
outputs=model_status
|
|
|
|
| 445 |
|
| 446 |
with gr.Row():
|
| 447 |
with gr.Column(scale=2):
|
| 448 |
+
# Required inputs
|
| 449 |
+
gr.Markdown("### 📹 Required Input")
|
| 450 |
video_input = gr.Video(
|
| 451 |
+
label="Video File - Upload video for audio generation",
|
| 452 |
format="mp4"
|
| 453 |
)
|
| 454 |
|
| 455 |
+
# Optional inputs
|
| 456 |
+
gr.Markdown("### 🎛️ Optional Inputs")
|
| 457 |
audio_input = gr.Audio(
|
| 458 |
+
label="Reference Audio - Provide timbre, style, rhythm reference (fine-grained control)",
|
| 459 |
type="filepath",
|
| 460 |
+
sources=["upload"],
|
| 461 |
+
format="wav"
|
| 462 |
)
|
| 463 |
|
| 464 |
prompt_input = gr.Textbox(
|
| 465 |
+
label="Text Prompt - Describe desired audio type (leave empty for auto-generation from video)",
|
| 466 |
+
placeholder="e.g., 'footsteps', 'metal clang', 'bird chirping'",
|
| 467 |
lines=2
|
| 468 |
)
|
| 469 |
|
| 470 |
+
# Advanced options
|
| 471 |
+
with gr.Accordion("🔧 Advanced Options", open=False):
|
| 472 |
with gr.Row():
|
| 473 |
duration_slider = gr.Slider(
|
| 474 |
minimum=1.0,
|
| 475 |
maximum=15.0,
|
| 476 |
value=8.0,
|
| 477 |
step=0.5,
|
| 478 |
+
label="Duration (seconds)"
|
| 479 |
)
|
| 480 |
|
| 481 |
cfg_strength_slider = gr.Slider(
|
|
|
|
| 483 |
maximum=8.0,
|
| 484 |
value=4.5,
|
| 485 |
step=0.1,
|
| 486 |
+
label="CFG Strength"
|
| 487 |
)
|
| 488 |
|
| 489 |
mask_away_clip = gr.Checkbox(
|
| 490 |
+
label="Ignore Visual Features (mask_away_clip) - Enable when video and reference audio differ significantly",
|
| 491 |
value=False
|
| 492 |
)
|
| 493 |
|
| 494 |
with gr.Column(scale=1):
|
| 495 |
+
# Usage guide
|
| 496 |
+
gr.Markdown("### 📋 Usage Guide")
|
| 497 |
gr.Markdown("""
|
| 498 |
+
**Four Generation Modes:**
|
| 499 |
|
| 500 |
+
1️⃣ **Video Only**: Upload video only
|
| 501 |
+
- Auto-generate audio from visual content
|
| 502 |
|
| 503 |
+
2️⃣ **Video + Reference Audio**: Upload video + audio
|
| 504 |
+
- Use reference audio's timbre and style
|
| 505 |
+
- Achieve fine-grained timbre control
|
| 506 |
|
| 507 |
+
3️⃣ **Video + Text**: Upload video + text
|
| 508 |
+
- Generate specified audio type from text description
|
| 509 |
|
| 510 |
+
4️⃣ **Complete Mode**: Video + Audio + Text
|
| 511 |
+
- Most precise control method
|
| 512 |
+
- Combine visual, timbral, and semantic guidance
|
| 513 |
""")
|
| 514 |
|
| 515 |
+
# Example prompts
|
| 516 |
+
gr.Markdown("### 🎯 Example Prompts")
|
| 517 |
example_buttons = []
|
| 518 |
for prompt in EXAMPLE_PROMPTS[:4]:
|
| 519 |
btn = gr.Button(prompt, size="sm")
|
|
|
|
| 523 |
outputs=prompt_input
|
| 524 |
)
|
| 525 |
|
| 526 |
+
# Generate button
|
| 527 |
+
generate_btn = gr.Button("🎵 Generate Audio", variant="primary", size="lg")
|
| 528 |
|
| 529 |
+
# Output area
|
| 530 |
+
gr.Markdown("### 🎧 Generated Results")
|
| 531 |
audio_output = gr.Audio(
|
| 532 |
+
label="Generated Audio",
|
| 533 |
+
type="filepath",
|
| 534 |
+
format="wav",
|
| 535 |
+
autoplay=False
|
| 536 |
)
|
| 537 |
|
| 538 |
generation_status = gr.Textbox(
|
| 539 |
+
label="Generation Status",
|
| 540 |
interactive=False,
|
| 541 |
lines=2
|
| 542 |
)
|
|
|
|
| 551 |
outputs=[audio_output, generation_status]
|
| 552 |
)
|
| 553 |
|
| 554 |
+
with gr.Accordion("💡 Detailed Information", open=False):
|
| 555 |
gr.Markdown(USAGE_TIPS)
|
| 556 |
|
| 557 |
gr.Markdown("""
|
| 558 |
+
### 🎬 Application Examples
|
| 559 |
|
| 560 |
+
**Fine-grained Sound Synthesis:**
|
| 561 |
+
- "Footsteps on wooden floor" + reference audio → Specific timbre footsteps
|
| 562 |
+
- "Metal collision" + different reference audio → Iron vs. copper distinction
|
| 563 |
|
| 564 |
+
**Timbre Transfer:**
|
| 565 |
+
- Piano melody video + violin reference audio → Violin playing same melody
|
| 566 |
+
- Human humming + instrument reference → Instrumental version
|
| 567 |
|
| 568 |
+
**Creative Sound Effects:**
|
| 569 |
+
- Sci-fi scene video + real sound reference → Unique sci-fi effects
|
| 570 |
+
- Animation video + real sound effects → Cartoon-reality hybrid effects
|
| 571 |
|
| 572 |
+
### 📚 Technical Details
|
| 573 |
+
- Model based on diffusion models and audio conditioning mechanisms
|
| 574 |
+
- Supports 44.1kHz high-quality audio generation
|
| 575 |
+
- Achieves visual-audio-text multimodal alignment
|
| 576 |
""")
|
| 577 |
|
| 578 |
# Auto-initialize model on startup
|