learnmlf commited on
Commit
dd2db22
·
1 Parent(s): ca6cc37

Convert interface to English and optimize audio performance

Browse files

- Convert all UI text from Chinese to English
- Update model loading status messages to English
- Update error messages and user feedback to English
- Optimize audio component performance:
- Add format='wav' to audio components
- Set autoplay=False for better loading
- Improve audio file handling
- Maintain all functionality while improving international accessibility

Files changed (1) hide show
  1. app.py +95 -92
app.py CHANGED
@@ -28,23 +28,23 @@ EXAMPLE_PROMPTS = [
28
  ]
29
 
30
  USAGE_TIPS = """
31
- ### 💡 使用技巧
32
 
33
- **基础设置:**
34
- - **视频质量**: 使用清晰、光线良好的视频,建议1-15
35
- - **参考音频**: 提供清晰的音频片段作为音色参考
36
- - **CFG强度**: 1-8之间,数值越高越贴合描述
37
 
38
- **高级功能:**
39
- - **mask_away_clip**: 当视频内容与期望音频差异很大时启用
40
- - **细粒度控制**: 使用参考音频实现精确的音色和风格控制
41
- - **零样本生成**: 无需训练即可生成新颖的音效组合
42
 
43
- **应用场景:**
44
- - 影视后期配音
45
- - 游戏音效制作
46
- - 音乐创作辅助
47
- - 声音设计实验
48
  """
49
 
50
  # Check and install missing dependencies
@@ -123,19 +123,19 @@ class AudioFoleyModel:
123
  return f"❌ Unknown model variant: {variant}. Available: {available_variants}"
124
 
125
  # Step 1: Initialize model config
126
- model_loading_status = "🔧 初始化模型配置..."
127
  log.info(f"Loading model variant: {variant}")
128
  self.model: ModelConfig = all_model_cfg[variant]
129
 
130
  # Step 2: Download model components
131
- model_loading_status = "📥 下载模型组件..."
132
  try:
133
  self.model.download_if_needed()
134
  except Exception as e:
135
  log.warning(f"Could not download model components: {e}")
136
 
137
  # Step 3: Download main model weights
138
- model_loading_status = "📥 下载主模型权重..."
139
  if not hasattr(self.model, 'model_path') or not self.model.model_path or not Path(self.model.model_path).exists():
140
  try:
141
  from huggingface_hub import hf_hub_download
@@ -165,11 +165,11 @@ class AudioFoleyModel:
165
  log.info(f"Using custom model path: {model_path}")
166
 
167
  # Step 4: Load neural network
168
- model_loading_status = "🧠 加载神经网络..."
169
  self.net: MMAudio = get_my_mmaudio(self.model.model_name).to(self.device, self.dtype).eval()
170
 
171
  # Step 5: Load weights
172
- model_loading_status = "⚖️ 加载模型权重..."
173
  if hasattr(self.model, 'model_path') and self.model.model_path and Path(self.model.model_path).exists():
174
  try:
175
  weights = torch.load(self.model.model_path, map_location=self.device, weights_only=True)
@@ -185,11 +185,11 @@ class AudioFoleyModel:
185
  return model_loading_status
186
 
187
  # Step 6: Initialize flow matching
188
- model_loading_status = "🌊 初始化流匹配..."
189
  self.fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=25)
190
 
191
  # Step 7: Initialize feature utils
192
- model_loading_status = "🔧 初始化特征工具..."
193
  try:
194
  self.feature_utils = FeaturesUtils(
195
  tod_vae_ckpt=self.model.vae_path,
@@ -206,11 +206,11 @@ class AudioFoleyModel:
206
  return model_loading_status
207
 
208
  # Step 8: Complete
209
- model_loading_status = "✅ 模型加载完成!可以开始生成音频。"
210
  return model_loading_status
211
 
212
  except Exception as e:
213
- error_msg = f"❌ 模型加载错误: {str(e)}"
214
  log.error(error_msg)
215
  model_loading_status = error_msg
216
  return error_msg
@@ -228,7 +228,7 @@ class AudioFoleyModel:
228
  return None, "❌ Model not loaded. Please load the model first."
229
 
230
  if video_file is None:
231
- return None, "❌ 请上传视频文件。"
232
 
233
  log.info(f'🎬 Processing video: {video_file}')
234
  if prompt.strip():
@@ -347,7 +347,7 @@ class AudioFoleyModel:
347
  audio_int16 = (audio * 32767).clamp(-32768, 32767).to(torch.int16)
348
  write(permanent_path, self.model.seq_cfg.sampling_rate, audio_int16.numpy().T)
349
  except Exception as e2:
350
- return None, f"❌ 音频保存失败: {str(e2)}"
351
 
352
  # Verify file was created
353
  if not os.path.exists(permanent_path):
@@ -357,7 +357,7 @@ class AudioFoleyModel:
357
  success_msg = f"✅ Audio generated successfully!\n"
358
  success_msg += f"📊 Duration: {duration_sec:.2f}s | "
359
  success_msg += f"Size: {file_size:.1f}KB | "
360
- success_msg += f"Time: {generation_time:.2f}s"
361
 
362
  return permanent_path, success_msg
363
 
@@ -371,7 +371,7 @@ class AudioFoleyModel:
371
 
372
  # Global model instance - initialized once
373
  audio_model = None
374
- model_loading_status = "未初始化"
375
 
376
  def initialize_model():
377
  """Initialize model once at startup"""
@@ -379,16 +379,16 @@ def initialize_model():
379
 
380
  if audio_model is None:
381
  try:
382
- model_loading_status = "正在初始化模型..."
383
  audio_model = AudioFoleyModel()
384
  load_result = audio_model.load_model()
385
  model_loading_status = load_result
386
  return load_result
387
  except Exception as e:
388
- model_loading_status = f"❌ 模型初始化失败: {str(e)}"
389
  return model_loading_status
390
  else:
391
- return "✅ 模型已加载"
392
 
393
  def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength, mask_away_clip):
394
  """Interface function for generating audio"""
@@ -396,7 +396,7 @@ def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_stren
396
 
397
  # Check if model is loaded
398
  if audio_model is None or audio_model.net is None:
399
- return None, "❌ 模型未加载,请等待初始化完成或刷新页面"
400
 
401
  # Use fixed seed for consistency in HF Space
402
  seed = 42
@@ -417,27 +417,27 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
417
  gr.Markdown("""
418
  # 🎵 AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis
419
 
420
- ## 📖 模型简介
421
- AC-Foley是一个基于参考音频引导的视频到音频合成模型,能够实现精确的细粒度声音合成。与传统依赖文本描述的方法不同,AC-Foley直接利用参考音频来实现对生成声音的精确控制,解决了文本描述在微观声学特征方面的模糊性问题。
422
 
423
- ## ✨ 功能要点
424
- - **细粒度声音合成**: 生成具有特定音色的脚步声(木板、大理石、砾石等)
425
- - **音色转换**: 将小提琴的旋律转换为唢呐的明亮刺耳音色
426
- - **零样本生成**: 创建独特的音效而无需专门训练
427
- - **视觉-音频对齐**: 根据视频内容自动生成匹配的音频
428
 
429
- *基于论文: [AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer](https://openreview.net/forum?id=URPXhnWdBF)*
430
  """)
431
 
432
  # Model status display - will be updated automatically
433
  model_status = gr.Textbox(
434
- label="模型状态",
435
  value=model_loading_status,
436
  interactive=False
437
  )
438
 
439
  # Add a refresh button for status
440
- refresh_status_btn = gr.Button("🔄 刷新状态", size="sm")
441
  refresh_status_btn.click(
442
  fn=get_model_status,
443
  outputs=model_status
@@ -445,36 +445,37 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
445
 
446
  with gr.Row():
447
  with gr.Column(scale=2):
448
- # 必需输入
449
- gr.Markdown("### 📹 必需输入")
450
  video_input = gr.Video(
451
- label="视频文件 - 上传需要生成音频的视频文件",
452
  format="mp4"
453
  )
454
 
455
- # 可选输入
456
- gr.Markdown("### 🎛️ 可选输入")
457
  audio_input = gr.Audio(
458
- label="参考音频 - 提供音色、风格、节奏参考(支持细粒度控制)",
459
  type="filepath",
460
- sources=["upload"]
 
461
  )
462
 
463
  prompt_input = gr.Textbox(
464
- label="文本提示 - 描述想要的音频类型(留空则根据视频自动生成)",
465
- placeholder="例如: '脚步声', '金属碰撞声', '鸟叫声'",
466
  lines=2
467
  )
468
 
469
- # 高级选项
470
- with gr.Accordion("🔧 高级选项", open=False):
471
  with gr.Row():
472
  duration_slider = gr.Slider(
473
  minimum=1.0,
474
  maximum=15.0,
475
  value=8.0,
476
  step=0.5,
477
- label="时长 ()"
478
  )
479
 
480
  cfg_strength_slider = gr.Slider(
@@ -482,37 +483,37 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
482
  maximum=8.0,
483
  value=4.5,
484
  step=0.1,
485
- label="CFG强度"
486
  )
487
 
488
  mask_away_clip = gr.Checkbox(
489
- label="忽略视觉特征 (mask_away_clip) - 当视频和参考音频差异较大且生成效果不佳时启用",
490
  value=False
491
  )
492
 
493
  with gr.Column(scale=1):
494
- # 使用指南
495
- gr.Markdown("### 📋 使用指南")
496
  gr.Markdown("""
497
- **四种生成模式:**
498
 
499
- 1️⃣ **纯视频**: 仅上传视频
500
- - 根据视觉内容自动生成音频
501
 
502
- 2️⃣ **视频+参考音频**: 上传视频+音频
503
- - 使用参考音频的音色和风格
504
- - 实现细粒度音色控制
505
 
506
- 3️⃣ **视频+文本**: 上传视频+文本
507
- - 根据文本描述生成指定类型音频
508
 
509
- 4️⃣ **完整模式**: 视频+音频+文本
510
- - 最精确的控制方式
511
- - 结合视觉、音色和语义指导
512
  """)
513
 
514
- # 示例提示词
515
- gr.Markdown("### 🎯 示例提示词")
516
  example_buttons = []
517
  for prompt in EXAMPLE_PROMPTS[:4]:
518
  btn = gr.Button(prompt, size="sm")
@@ -522,18 +523,20 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
522
  outputs=prompt_input
523
  )
524
 
525
- # 生成按钮
526
- generate_btn = gr.Button("🎵 开始生成音频", variant="primary", size="lg")
527
 
528
- # 输出区域
529
- gr.Markdown("### 🎧 生成结果")
530
  audio_output = gr.Audio(
531
- label="生成的音频",
532
- type="filepath"
 
 
533
  )
534
 
535
  generation_status = gr.Textbox(
536
- label="生成状态",
537
  interactive=False,
538
  lines=2
539
  )
@@ -548,28 +551,28 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
548
  outputs=[audio_output, generation_status]
549
  )
550
 
551
- with gr.Accordion("💡 详细说明", open=False):
552
  gr.Markdown(USAGE_TIPS)
553
 
554
  gr.Markdown("""
555
- ### 🎬 应用示例
556
 
557
- **细粒度声音合成:**
558
- - "木地板上的脚步声" + 参考音频特定音色的脚步声
559
- - "金属碰撞" + 不同参考音频铁器vs铜器的区别
560
 
561
- **音色转换:**
562
- - 钢琴旋律视频 + 小提琴参考音频小提琴演奏同样旋律
563
- - 人声哼唱 + 乐器参考乐器演奏版本
564
 
565
- **创意音效:**
566
- - 科幻场景视频 + 现实音效参考独特的科幻音效
567
- - 动画视频 + 真实音效卡通与现实结合的音效
568
 
569
- ### 📚 技术细节
570
- - 模型基于扩散模型和音频条件机制
571
- - 支持44.1kHz高质量音频生成
572
- - 实现了视觉-音频-文本的多模态对齐
573
  """)
574
 
575
  # Auto-initialize model on startup
 
28
  ]
29
 
30
  USAGE_TIPS = """
31
+ ### 💡 Usage Tips
32
 
33
+ **Basic Settings:**
34
+ - **Video Quality**: Use clear, well-lit videos, recommended 1-15 seconds
35
+ - **Reference Audio**: Provide clear audio clips as timbre reference
36
+ - **CFG Strength**: Between 1-8, higher values follow description more closely
37
 
38
+ **Advanced Features:**
39
+ - **mask_away_clip**: Enable when video content differs significantly from desired audio
40
+ - **Fine-grained Control**: Use reference audio for precise timbre and style control
41
+ - **Zero-shot Generation**: Generate novel sound combinations without training
42
 
43
+ **Application Scenarios:**
44
+ - Film post-production audio
45
+ - Game sound effect creation
46
+ - Music composition assistance
47
+ - Sound design experimentation
48
  """
49
 
50
  # Check and install missing dependencies
 
123
  return f"❌ Unknown model variant: {variant}. Available: {available_variants}"
124
 
125
  # Step 1: Initialize model config
126
+ model_loading_status = "🔧 Initializing model configuration..."
127
  log.info(f"Loading model variant: {variant}")
128
  self.model: ModelConfig = all_model_cfg[variant]
129
 
130
  # Step 2: Download model components
131
+ model_loading_status = "📥 Downloading model components..."
132
  try:
133
  self.model.download_if_needed()
134
  except Exception as e:
135
  log.warning(f"Could not download model components: {e}")
136
 
137
  # Step 3: Download main model weights
138
+ model_loading_status = "📥 Downloading main model weights..."
139
  if not hasattr(self.model, 'model_path') or not self.model.model_path or not Path(self.model.model_path).exists():
140
  try:
141
  from huggingface_hub import hf_hub_download
 
165
  log.info(f"Using custom model path: {model_path}")
166
 
167
  # Step 4: Load neural network
168
+ model_loading_status = "🧠 Loading neural network..."
169
  self.net: MMAudio = get_my_mmaudio(self.model.model_name).to(self.device, self.dtype).eval()
170
 
171
  # Step 5: Load weights
172
+ model_loading_status = "⚖️ Loading model weights..."
173
  if hasattr(self.model, 'model_path') and self.model.model_path and Path(self.model.model_path).exists():
174
  try:
175
  weights = torch.load(self.model.model_path, map_location=self.device, weights_only=True)
 
185
  return model_loading_status
186
 
187
  # Step 6: Initialize flow matching
188
+ model_loading_status = "🌊 Initializing flow matching..."
189
  self.fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=25)
190
 
191
  # Step 7: Initialize feature utils
192
+ model_loading_status = "🔧 Initializing feature utilities..."
193
  try:
194
  self.feature_utils = FeaturesUtils(
195
  tod_vae_ckpt=self.model.vae_path,
 
206
  return model_loading_status
207
 
208
  # Step 8: Complete
209
+ model_loading_status = "✅ Model loaded successfully! Ready to generate audio."
210
  return model_loading_status
211
 
212
  except Exception as e:
213
+ error_msg = f"❌ Model loading error: {str(e)}"
214
  log.error(error_msg)
215
  model_loading_status = error_msg
216
  return error_msg
 
228
  return None, "❌ Model not loaded. Please load the model first."
229
 
230
  if video_file is None:
231
+ return None, "❌ Please upload a video file."
232
 
233
  log.info(f'🎬 Processing video: {video_file}')
234
  if prompt.strip():
 
347
  audio_int16 = (audio * 32767).clamp(-32768, 32767).to(torch.int16)
348
  write(permanent_path, self.model.seq_cfg.sampling_rate, audio_int16.numpy().T)
349
  except Exception as e2:
350
+ return None, f"❌ Audio saving failed: {str(e2)}"
351
 
352
  # Verify file was created
353
  if not os.path.exists(permanent_path):
 
357
  success_msg = f"✅ Audio generated successfully!\n"
358
  success_msg += f"📊 Duration: {duration_sec:.2f}s | "
359
  success_msg += f"Size: {file_size:.1f}KB | "
360
+ success_msg += f"Generation time: {generation_time:.2f}s"
361
 
362
  return permanent_path, success_msg
363
 
 
371
 
372
  # Global model instance - initialized once
373
  audio_model = None
374
+ model_loading_status = "Not initialized"
375
 
376
  def initialize_model():
377
  """Initialize model once at startup"""
 
379
 
380
  if audio_model is None:
381
  try:
382
+ model_loading_status = "Initializing model..."
383
  audio_model = AudioFoleyModel()
384
  load_result = audio_model.load_model()
385
  model_loading_status = load_result
386
  return load_result
387
  except Exception as e:
388
+ model_loading_status = f"❌ Model initialization failed: {str(e)}"
389
  return model_loading_status
390
  else:
391
+ return "✅ Model already loaded"
392
 
393
  def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength, mask_away_clip):
394
  """Interface function for generating audio"""
 
396
 
397
  # Check if model is loaded
398
  if audio_model is None or audio_model.net is None:
399
+ return None, "❌ Model not loaded. Please wait for initialization to complete or refresh the page."
400
 
401
  # Use fixed seed for consistency in HF Space
402
  seed = 42
 
417
  gr.Markdown("""
418
  # 🎵 AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis
419
 
420
+ ## 📖 About
421
+ AC-Foley is a reference-audio-guided video-to-audio synthesis model that enables precise fine-grained sound synthesis. Unlike traditional text-dependent methods, AC-Foley directly leverages reference audio to achieve precise control over generated sounds, addressing the ambiguity of textual descriptions in micro-acoustic features.
422
 
423
+ ## ✨ Key Features
424
+ - **Fine-grained Sound Synthesis**: Generate footsteps with distinct timbres (wood, marble, gravel, etc.)
425
+ - **Timbre Transfer**: Transform violin melodies into bright, piercing suona tones
426
+ - **Zero-shot Generation**: Create unique sound effects without specialized training
427
+ - **Visual-Audio Alignment**: Automatically generate matching audio from video content
428
 
429
+ *Based on paper: [AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer](https://openreview.net/forum?id=URPXhnWdBF)*
430
  """)
431
 
432
  # Model status display - will be updated automatically
433
  model_status = gr.Textbox(
434
+ label="Model Status",
435
  value=model_loading_status,
436
  interactive=False
437
  )
438
 
439
  # Add a refresh button for status
440
+ refresh_status_btn = gr.Button("🔄 Refresh Status", size="sm")
441
  refresh_status_btn.click(
442
  fn=get_model_status,
443
  outputs=model_status
 
445
 
446
  with gr.Row():
447
  with gr.Column(scale=2):
448
+ # Required inputs
449
+ gr.Markdown("### 📹 Required Input")
450
  video_input = gr.Video(
451
+ label="Video File - Upload video for audio generation",
452
  format="mp4"
453
  )
454
 
455
+ # Optional inputs
456
+ gr.Markdown("### 🎛️ Optional Inputs")
457
  audio_input = gr.Audio(
458
+ label="Reference Audio - Provide timbre, style, rhythm reference (fine-grained control)",
459
  type="filepath",
460
+ sources=["upload"],
461
+ format="wav"
462
  )
463
 
464
  prompt_input = gr.Textbox(
465
+ label="Text Prompt - Describe desired audio type (leave empty for auto-generation from video)",
466
+ placeholder="e.g., 'footsteps', 'metal clang', 'bird chirping'",
467
  lines=2
468
  )
469
 
470
+ # Advanced options
471
+ with gr.Accordion("🔧 Advanced Options", open=False):
472
  with gr.Row():
473
  duration_slider = gr.Slider(
474
  minimum=1.0,
475
  maximum=15.0,
476
  value=8.0,
477
  step=0.5,
478
+ label="Duration (seconds)"
479
  )
480
 
481
  cfg_strength_slider = gr.Slider(
 
483
  maximum=8.0,
484
  value=4.5,
485
  step=0.1,
486
+ label="CFG Strength"
487
  )
488
 
489
  mask_away_clip = gr.Checkbox(
490
+ label="Ignore Visual Features (mask_away_clip) - Enable when video and reference audio differ significantly",
491
  value=False
492
  )
493
 
494
  with gr.Column(scale=1):
495
+ # Usage guide
496
+ gr.Markdown("### 📋 Usage Guide")
497
  gr.Markdown("""
498
+ **Four Generation Modes:**
499
 
500
+ 1️⃣ **Video Only**: Upload video only
501
+ - Auto-generate audio from visual content
502
 
503
+ 2️⃣ **Video + Reference Audio**: Upload video + audio
504
+ - Use reference audio's timbre and style
505
+ - Achieve fine-grained timbre control
506
 
507
+ 3️⃣ **Video + Text**: Upload video + text
508
+ - Generate specified audio type from text description
509
 
510
+ 4️⃣ **Complete Mode**: Video + Audio + Text
511
+ - Most precise control method
512
+ - Combine visual, timbral, and semantic guidance
513
  """)
514
 
515
+ # Example prompts
516
+ gr.Markdown("### 🎯 Example Prompts")
517
  example_buttons = []
518
  for prompt in EXAMPLE_PROMPTS[:4]:
519
  btn = gr.Button(prompt, size="sm")
 
523
  outputs=prompt_input
524
  )
525
 
526
+ # Generate button
527
+ generate_btn = gr.Button("🎵 Generate Audio", variant="primary", size="lg")
528
 
529
+ # Output area
530
+ gr.Markdown("### 🎧 Generated Results")
531
  audio_output = gr.Audio(
532
+ label="Generated Audio",
533
+ type="filepath",
534
+ format="wav",
535
+ autoplay=False
536
  )
537
 
538
  generation_status = gr.Textbox(
539
+ label="Generation Status",
540
  interactive=False,
541
  lines=2
542
  )
 
551
  outputs=[audio_output, generation_status]
552
  )
553
 
554
+ with gr.Accordion("💡 Detailed Information", open=False):
555
  gr.Markdown(USAGE_TIPS)
556
 
557
  gr.Markdown("""
558
+ ### 🎬 Application Examples
559
 
560
+ **Fine-grained Sound Synthesis:**
561
+ - "Footsteps on wooden floor" + reference audio Specific timbre footsteps
562
+ - "Metal collision" + different reference audio Iron vs. copper distinction
563
 
564
+ **Timbre Transfer:**
565
+ - Piano melody video + violin reference audio → violin playing the same melody
566
+ - Human humming + instrument reference → instrumental version
567
 
568
+ **Creative Sound Effects:**
569
+ - Sci-fi scene video + real sound reference → unique sci-fi effects
570
+ - Animation video + real sound effects → cartoon-reality hybrid effects
571
 
572
+ ### 📚 Technical Details
573
+ - Model based on diffusion models and audio conditioning mechanisms
574
+ - Supports 44.1kHz high-quality audio generation
575
+ - Achieves visual-audio-text multimodal alignment
576
  """)
577
 
578
  # Auto-initialize model on startup