learnmlf commited on
Commit
dd2db22
·
1 Parent(s): ca6cc37

Convert interface to English and optimize audio performance

Browse files

- Convert all UI text from Chinese to English
- Update model loading status messages to English
- Update error messages and user feedback to English
- Optimize audio component performance:
- Add format='wav' to audio components
- Set autoplay=False for better loading
- Improve audio file handling
- Maintain all functionality while improving international accessibility

Files changed (1) hide show
  1. app.py +95 -92
app.py CHANGED
@@ -28,23 +28,23 @@ EXAMPLE_PROMPTS = [
28
  ]
29
 
30
  USAGE_TIPS = """
31
- ### 💡 使用技巧
32
 
33
- **基础设置:**
34
- - **视频质量**: 使用清晰、光线良好的视频,建议1-15
35
- - **参考音频**: 提供清晰的音频片段作为音色参考
36
- - **CFG强度**: 1-8之间,数值越高越贴合描述
37
 
38
- **高级功能:**
39
- - **mask_away_clip**: 当视频内容与期望音频差异很大时启用
40
- - **细粒度控制**: 使用参考音频实现精确的音色和风格控制
41
- - **零样本生成**: 无需训练即可生成新颖的音效组合
42
 
43
- **应用场景:**
44
- - 影视后期配音
45
- - 游戏音效制作
46
- - 音乐创作辅助
47
- - 声音设计实验
48
  """
49
 
50
  # Check and install missing dependencies
@@ -123,19 +123,19 @@ class AudioFoleyModel:
123
  return f"❌ Unknown model variant: {variant}. Available: {available_variants}"
124
 
125
  # Step 1: Initialize model config
126
- model_loading_status = "🔧 初始化模型配置..."
127
  log.info(f"Loading model variant: {variant}")
128
  self.model: ModelConfig = all_model_cfg[variant]
129
 
130
  # Step 2: Download model components
131
- model_loading_status = "📥 下载模型组件..."
132
  try:
133
  self.model.download_if_needed()
134
  except Exception as e:
135
  log.warning(f"Could not download model components: {e}")
136
 
137
  # Step 3: Download main model weights
138
- model_loading_status = "📥 下载主模型权重..."
139
  if not hasattr(self.model, 'model_path') or not self.model.model_path or not Path(self.model.model_path).exists():
140
  try:
141
  from huggingface_hub import hf_hub_download
@@ -165,11 +165,11 @@ class AudioFoleyModel:
165
  log.info(f"Using custom model path: {model_path}")
166
 
167
  # Step 4: Load neural network
168
- model_loading_status = "🧠 加载神经网络..."
169
  self.net: MMAudio = get_my_mmaudio(self.model.model_name).to(self.device, self.dtype).eval()
170
 
171
  # Step 5: Load weights
172
- model_loading_status = "⚖️ 加载模型权重..."
173
  if hasattr(self.model, 'model_path') and self.model.model_path and Path(self.model.model_path).exists():
174
  try:
175
  weights = torch.load(self.model.model_path, map_location=self.device, weights_only=True)
@@ -185,11 +185,11 @@ class AudioFoleyModel:
185
  return model_loading_status
186
 
187
  # Step 6: Initialize flow matching
188
- model_loading_status = "🌊 初始化流匹配..."
189
  self.fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=25)
190
 
191
  # Step 7: Initialize feature utils
192
- model_loading_status = "🔧 初始化特征工具..."
193
  try:
194
  self.feature_utils = FeaturesUtils(
195
  tod_vae_ckpt=self.model.vae_path,
@@ -206,11 +206,11 @@ class AudioFoleyModel:
206
  return model_loading_status
207
 
208
  # Step 8: Complete
209
- model_loading_status = "✅ 模型加载完成!可以开始生成音频。"
210
  return model_loading_status
211
 
212
  except Exception as e:
213
- error_msg = f"❌ 模型加载错误: {str(e)}"
214
  log.error(error_msg)
215
  model_loading_status = error_msg
216
  return error_msg
@@ -228,7 +228,7 @@ class AudioFoleyModel:
228
  return None, "❌ Model not loaded. Please load the model first."
229
 
230
  if video_file is None:
231
- return None, "❌ 请上传视频文件。"
232
 
233
  log.info(f'🎬 Processing video: {video_file}')
234
  if prompt.strip():
@@ -347,7 +347,7 @@ class AudioFoleyModel:
347
  audio_int16 = (audio * 32767).clamp(-32768, 32767).to(torch.int16)
348
  write(permanent_path, self.model.seq_cfg.sampling_rate, audio_int16.numpy().T)
349
  except Exception as e2:
350
- return None, f"❌ 音频保存失败: {str(e2)}"
351
 
352
  # Verify file was created
353
  if not os.path.exists(permanent_path):
@@ -357,7 +357,7 @@ class AudioFoleyModel:
357
  success_msg = f"✅ Audio generated successfully!\n"
358
  success_msg += f"📊 Duration: {duration_sec:.2f}s | "
359
  success_msg += f"Size: {file_size:.1f}KB | "
360
- success_msg += f"Time: {generation_time:.2f}s"
361
 
362
  return permanent_path, success_msg
363
 
@@ -371,7 +371,7 @@ class AudioFoleyModel:
371
 
372
  # Global model instance - initialized once
373
  audio_model = None
374
- model_loading_status = "未初始化"
375
 
376
  def initialize_model():
377
  """Initialize model once at startup"""
@@ -379,16 +379,16 @@ def initialize_model():
379
 
380
  if audio_model is None:
381
  try:
382
- model_loading_status = "正在初始化模型..."
383
  audio_model = AudioFoleyModel()
384
  load_result = audio_model.load_model()
385
  model_loading_status = load_result
386
  return load_result
387
  except Exception as e:
388
- model_loading_status = f"❌ 模型初始化失败: {str(e)}"
389
  return model_loading_status
390
  else:
391
- return "✅ 模型已加载"
392
 
393
  def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength, mask_away_clip):
394
  """Interface function for generating audio"""
@@ -396,7 +396,7 @@ def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_stren
396
 
397
  # Check if model is loaded
398
  if audio_model is None or audio_model.net is None:
399
- return None, "❌ 模型未加载,请等待初始化完成或刷新页面"
400
 
401
  # Use fixed seed for consistency in HF Space
402
  seed = 42
@@ -417,27 +417,27 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
417
  gr.Markdown("""
418
  # 🎵 AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis
419
 
420
- ## 📖 模型简介
421
- AC-Foley是一个基于参考音频引导的视频到音频合成模型,能够实现精确的细粒度声音合成。与传统依赖文本描述的方法不同,AC-Foley直接利用参考音频来实现对生成声音的精确控制,解决了文本描述在微观声学特征方面的模糊性问题。
422
 
423
- ## ✨ 功能要点
424
- - **细粒度声音合成**: 生成具有特定音色的脚步声(木板、大理石、砾石等)
425
- - **音色转换**: 将小提琴的旋律转换为唢呐的明亮刺耳音色
426
- - **零样本生成**: 创建独特的音效而无需专门训练
427
- - **视觉-音频对齐**: 根据视频内容自动生成匹配的音频
428
 
429
- *基于论文: [AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer](https://openreview.net/forum?id=URPXhnWdBF)*
430
  """)
431
 
432
  # Model status display - will be updated automatically
433
  model_status = gr.Textbox(
434
- label="模型状态",
435
  value=model_loading_status,
436
  interactive=False
437
  )
438
 
439
  # Add a refresh button for status
440
- refresh_status_btn = gr.Button("🔄 刷新状态", size="sm")
441
  refresh_status_btn.click(
442
  fn=get_model_status,
443
  outputs=model_status
@@ -445,36 +445,37 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
445
 
446
  with gr.Row():
447
  with gr.Column(scale=2):
448
- # 必需输入
449
- gr.Markdown("### 📹 必需输入")
450
  video_input = gr.Video(
451
- label="视频文件 - 上传需要生成音频的视频文件",
452
  format="mp4"
453
  )
454
 
455
- # 可选输入
456
- gr.Markdown("### 🎛️ 可选输入")
457
  audio_input = gr.Audio(
458
- label="参考音频 - 提供音色、风格、节奏参考(支持细粒度控制)",
459
  type="filepath",
460
- sources=["upload"]
 
461
  )
462
 
463
  prompt_input = gr.Textbox(
464
- label="文本提示 - 描述想要的音频类型(留空则根据视频自动生成)",
465
- placeholder="例如: '脚步声', '金属碰撞声', '鸟叫声'",
466
  lines=2
467
  )
468
 
469
- # 高级选项
470
- with gr.Accordion("🔧 高级选项", open=False):
471
  with gr.Row():
472
  duration_slider = gr.Slider(
473
  minimum=1.0,
474
  maximum=15.0,
475
  value=8.0,
476
  step=0.5,
477
- label="时长 ()"
478
  )
479
 
480
  cfg_strength_slider = gr.Slider(
@@ -482,37 +483,37 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
482
  maximum=8.0,
483
  value=4.5,
484
  step=0.1,
485
- label="CFG强度"
486
  )
487
 
488
  mask_away_clip = gr.Checkbox(
489
- label="忽略视觉特征 (mask_away_clip) - 当视频和参考音频差异较大且生成效果不佳时启用",
490
  value=False
491
  )
492
 
493
  with gr.Column(scale=1):
494
- # 使用指南
495
- gr.Markdown("### 📋 使用指南")
496
  gr.Markdown("""
497
- **四种生成模式:**
498
 
499
- 1️⃣ **纯视频**: 仅上传视频
500
- - 根据视觉内容自动生成音频
501
 
502
- 2️⃣ **视频+参考音频**: 上传视频+音频
503
- - 使用参考音频的音色和风格
504
- - 实现细粒度音色控制
505
 
506
- 3️⃣ **视频+文本**: 上传视频+文本
507
- - 根据文本描述生成指定类型音频
508
 
509
- 4️⃣ **完整模式**: 视频+音频+文本
510
- - 最精确的控制方式
511
- - 结合视觉、音色和语义指导
512
  """)
513
 
514
- # 示例提示词
515
- gr.Markdown("### 🎯 示例提示词")
516
  example_buttons = []
517
  for prompt in EXAMPLE_PROMPTS[:4]:
518
  btn = gr.Button(prompt, size="sm")
@@ -522,18 +523,20 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
522
  outputs=prompt_input
523
  )
524
 
525
- # 生成按钮
526
- generate_btn = gr.Button("🎵 开始生成音频", variant="primary", size="lg")
527
 
528
- # 输出区域
529
- gr.Markdown("### 🎧 生成结果")
530
  audio_output = gr.Audio(
531
- label="生成的音频",
532
- type="filepath"
 
 
533
  )
534
 
535
  generation_status = gr.Textbox(
536
- label="生成状态",
537
  interactive=False,
538
  lines=2
539
  )
@@ -548,28 +551,28 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
548
  outputs=[audio_output, generation_status]
549
  )
550
 
551
- with gr.Accordion("💡 详细说明", open=False):
552
  gr.Markdown(USAGE_TIPS)
553
 
554
  gr.Markdown("""
555
- ### 🎬 应用示例
556
 
557
- **细粒度声音合成:**
558
- - "木地板上的脚步声" + 参考音频特定音色的脚步声
559
- - "金属碰撞" + 不同参考音频铁器vs铜器的区别
560
 
561
- **音色转换:**
562
- - 钢琴旋律视频 + 小提琴参考音频小提琴演奏同样旋律
563
- - 人声哼唱 + 乐器参考乐器演奏版本
564
 
565
- **创意音效:**
566
- - 科幻场景视频 + 现实音效参考独特的科幻音效
567
- - 动画视频 + 真实音效卡通与现实结合的音效
568
 
569
- ### 📚 技术细节
570
- - 模型基于扩散模型和音频条件机制
571
- - 支持44.1kHz高质量音频生成
572
- - 实现了视觉-音频-文本的多模态对齐
573
  """)
574
 
575
  # Auto-initialize model on startup
 
28
  ]
29
 
30
  USAGE_TIPS = """
31
+ ### 💡 Usage Tips
32
 
33
+ **Basic Settings:**
34
+ - **Video Quality**: Use clear, well-lit videos, recommended 1-15 seconds
35
+ - **Reference Audio**: Provide clear audio clips as timbre reference
36
+ - **CFG Strength**: Between 1-8, higher values follow description more closely
37
 
38
+ **Advanced Features:**
39
+ - **mask_away_clip**: Enable when video content differs significantly from desired audio
40
+ - **Fine-grained Control**: Use reference audio for precise timbre and style control
41
+ - **Zero-shot Generation**: Generate novel sound combinations without training
42
 
43
+ **Application Scenarios:**
44
+ - Film post-production audio
45
+ - Game sound effect creation
46
+ - Music composition assistance
47
+ - Sound design experimentation
48
  """
49
 
50
  # Check and install missing dependencies
 
123
  return f"❌ Unknown model variant: {variant}. Available: {available_variants}"
124
 
125
  # Step 1: Initialize model config
126
+ model_loading_status = "🔧 Initializing model configuration..."
127
  log.info(f"Loading model variant: {variant}")
128
  self.model: ModelConfig = all_model_cfg[variant]
129
 
130
  # Step 2: Download model components
131
+ model_loading_status = "📥 Downloading model components..."
132
  try:
133
  self.model.download_if_needed()
134
  except Exception as e:
135
  log.warning(f"Could not download model components: {e}")
136
 
137
  # Step 3: Download main model weights
138
+ model_loading_status = "📥 Downloading main model weights..."
139
  if not hasattr(self.model, 'model_path') or not self.model.model_path or not Path(self.model.model_path).exists():
140
  try:
141
  from huggingface_hub import hf_hub_download
 
165
  log.info(f"Using custom model path: {model_path}")
166
 
167
  # Step 4: Load neural network
168
+ model_loading_status = "🧠 Loading neural network..."
169
  self.net: MMAudio = get_my_mmaudio(self.model.model_name).to(self.device, self.dtype).eval()
170
 
171
  # Step 5: Load weights
172
+ model_loading_status = "⚖️ Loading model weights..."
173
  if hasattr(self.model, 'model_path') and self.model.model_path and Path(self.model.model_path).exists():
174
  try:
175
  weights = torch.load(self.model.model_path, map_location=self.device, weights_only=True)
 
185
  return model_loading_status
186
 
187
  # Step 6: Initialize flow matching
188
+ model_loading_status = "🌊 Initializing flow matching..."
189
  self.fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=25)
190
 
191
  # Step 7: Initialize feature utils
192
+ model_loading_status = "🔧 Initializing feature utilities..."
193
  try:
194
  self.feature_utils = FeaturesUtils(
195
  tod_vae_ckpt=self.model.vae_path,
 
206
  return model_loading_status
207
 
208
  # Step 8: Complete
209
+ model_loading_status = "✅ Model loaded successfully! Ready to generate audio."
210
  return model_loading_status
211
 
212
  except Exception as e:
213
+ error_msg = f"❌ Model loading error: {str(e)}"
214
  log.error(error_msg)
215
  model_loading_status = error_msg
216
  return error_msg
 
228
  return None, "❌ Model not loaded. Please load the model first."
229
 
230
  if video_file is None:
231
+ return None, "❌ Please upload a video file."
232
 
233
  log.info(f'🎬 Processing video: {video_file}')
234
  if prompt.strip():
 
347
  audio_int16 = (audio * 32767).clamp(-32768, 32767).to(torch.int16)
348
  write(permanent_path, self.model.seq_cfg.sampling_rate, audio_int16.numpy().T)
349
  except Exception as e2:
350
+ return None, f"❌ Audio saving failed: {str(e2)}"
351
 
352
  # Verify file was created
353
  if not os.path.exists(permanent_path):
 
357
  success_msg = f"✅ Audio generated successfully!\n"
358
  success_msg += f"📊 Duration: {duration_sec:.2f}s | "
359
  success_msg += f"Size: {file_size:.1f}KB | "
360
+ success_msg += f"Generation time: {generation_time:.2f}s"
361
 
362
  return permanent_path, success_msg
363
 
 
371
 
372
  # Global model instance - initialized once
373
  audio_model = None
374
+ model_loading_status = "Not initialized"
375
 
376
  def initialize_model():
377
  """Initialize model once at startup"""
 
379
 
380
  if audio_model is None:
381
  try:
382
+ model_loading_status = "Initializing model..."
383
  audio_model = AudioFoleyModel()
384
  load_result = audio_model.load_model()
385
  model_loading_status = load_result
386
  return load_result
387
  except Exception as e:
388
+ model_loading_status = f"❌ Model initialization failed: {str(e)}"
389
  return model_loading_status
390
  else:
391
+ return "✅ Model already loaded"
392
 
393
  def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength, mask_away_clip):
394
  """Interface function for generating audio"""
 
396
 
397
  # Check if model is loaded
398
  if audio_model is None or audio_model.net is None:
399
+ return None, "❌ Model not loaded. Please wait for initialization to complete or refresh the page."
400
 
401
  # Use fixed seed for consistency in HF Space
402
  seed = 42
 
417
  gr.Markdown("""
418
  # 🎵 AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis
419
 
420
+ ## 📖 About
421
+ AC-Foley is a reference-audio-guided video-to-audio synthesis model that enables precise fine-grained sound synthesis. Unlike traditional text-dependent methods, AC-Foley directly leverages reference audio to achieve precise control over generated sounds, addressing the ambiguity of textual descriptions in micro-acoustic features.
422
 
423
+ ## ✨ Key Features
424
+ - **Fine-grained Sound Synthesis**: Generate footsteps with distinct timbres (wood, marble, gravel, etc.)
425
+ - **Timbre Transfer**: Transform violin melodies into bright, piercing suona tones
426
+ - **Zero-shot Generation**: Create unique sound effects without specialized training
427
+ - **Visual-Audio Alignment**: Automatically generate matching audio from video content
428
 
429
+ *Based on paper: [AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer](https://openreview.net/forum?id=URPXhnWdBF)*
430
  """)
431
 
432
  # Model status display - will be updated automatically
433
  model_status = gr.Textbox(
434
+ label="Model Status",
435
  value=model_loading_status,
436
  interactive=False
437
  )
438
 
439
  # Add a refresh button for status
440
+ refresh_status_btn = gr.Button("🔄 Refresh Status", size="sm")
441
  refresh_status_btn.click(
442
  fn=get_model_status,
443
  outputs=model_status
 
445
 
446
  with gr.Row():
447
  with gr.Column(scale=2):
448
+ # Required inputs
449
+ gr.Markdown("### 📹 Required Input")
450
  video_input = gr.Video(
451
+ label="Video File - Upload video for audio generation",
452
  format="mp4"
453
  )
454
 
455
+ # Optional inputs
456
+ gr.Markdown("### 🎛️ Optional Inputs")
457
  audio_input = gr.Audio(
458
+ label="Reference Audio - Provide timbre, style, rhythm reference (fine-grained control)",
459
  type="filepath",
460
+ sources=["upload"],
461
+ format="wav"
462
  )
463
 
464
  prompt_input = gr.Textbox(
465
+ label="Text Prompt - Describe desired audio type (leave empty for auto-generation from video)",
466
+ placeholder="e.g., 'footsteps', 'metal clang', 'bird chirping'",
467
  lines=2
468
  )
469
 
470
+ # Advanced options
471
+ with gr.Accordion("🔧 Advanced Options", open=False):
472
  with gr.Row():
473
  duration_slider = gr.Slider(
474
  minimum=1.0,
475
  maximum=15.0,
476
  value=8.0,
477
  step=0.5,
478
+ label="Duration (seconds)"
479
  )
480
 
481
  cfg_strength_slider = gr.Slider(
 
483
  maximum=8.0,
484
  value=4.5,
485
  step=0.1,
486
+ label="CFG Strength"
487
  )
488
 
489
  mask_away_clip = gr.Checkbox(
490
+ label="Ignore Visual Features (mask_away_clip) - Enable when video and reference audio differ significantly",
491
  value=False
492
  )
493
 
494
  with gr.Column(scale=1):
495
+ # Usage guide
496
+ gr.Markdown("### 📋 Usage Guide")
497
  gr.Markdown("""
498
+ **Four Generation Modes:**
499
 
500
+ 1️⃣ **Video Only**: Upload video only
501
+ - Auto-generate audio from visual content
502
 
503
+ 2️⃣ **Video + Reference Audio**: Upload video + audio
504
+ - Use reference audio's timbre and style
505
+ - Achieve fine-grained timbre control
506
 
507
+ 3️⃣ **Video + Text**: Upload video + text
508
+ - Generate specified audio type from text description
509
 
510
+ 4️⃣ **Complete Mode**: Video + Audio + Text
511
+ - Most precise control method
512
+ - Combine visual, timbral, and semantic guidance
513
  """)
514
 
515
+ # Example prompts
516
+ gr.Markdown("### 🎯 Example Prompts")
517
  example_buttons = []
518
  for prompt in EXAMPLE_PROMPTS[:4]:
519
  btn = gr.Button(prompt, size="sm")
 
523
  outputs=prompt_input
524
  )
525
 
526
+ # Generate button
527
+ generate_btn = gr.Button("🎵 Generate Audio", variant="primary", size="lg")
528
 
529
+ # Output area
530
+ gr.Markdown("### 🎧 Generated Results")
531
  audio_output = gr.Audio(
532
+ label="Generated Audio",
533
+ type="filepath",
534
+ format="wav",
535
+ autoplay=False
536
  )
537
 
538
  generation_status = gr.Textbox(
539
+ label="Generation Status",
540
  interactive=False,
541
  lines=2
542
  )
 
551
  outputs=[audio_output, generation_status]
552
  )
553
 
554
+ with gr.Accordion("💡 Detailed Information", open=False):
555
  gr.Markdown(USAGE_TIPS)
556
 
557
  gr.Markdown("""
558
+ ### 🎬 Application Examples
559
 
560
+ **Fine-grained Sound Synthesis:**
561
+ - "Footsteps on wooden floor" + reference audio Specific timbre footsteps
562
+ - "Metal collision" + different reference audio Iron vs. copper distinction
563
 
564
+ **Timbre Transfer:**
565
+ - Piano melody video + violin reference audio → violin playing the same melody
566
+ - Human humming + instrument reference → instrumental version
567
 
568
+ **Creative Sound Effects:**
569
+ - Sci-fi scene video + real sound reference → unique sci-fi effects
570
+ - Animation video + real sound effects → cartoon-reality hybrid effects
571
 
572
+ ### 📚 Technical Details
573
+ - Model based on diffusion models and audio conditioning mechanisms
574
+ - Supports 44.1kHz high-quality audio generation
575
+ - Achieves visual-audio-text multimodal alignment
576
  """)
577
 
578
  # Auto-initialize model on startup