Add audio input support and make prompt optional
Browse files
app.py
CHANGED
|
@@ -31,9 +31,13 @@ USAGE_TIPS = """
|
|
| 31 |
### 💡 使用技巧
|
| 32 |
|
| 33 |
1. **视频质量**: 使用清晰、光线良好的视频
|
| 34 |
-
2.
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
| 36 |
4. **CFG强度**: 数值越高越贴合提示词,但可能降低质量
|
|
|
|
| 37 |
"""
|
| 38 |
|
| 39 |
# Check and install missing dependencies
|
|
@@ -205,7 +209,7 @@ class AudioFoleyModel:
|
|
| 205 |
|
| 206 |
def generate_audio(self, video_file, prompt: str, negative_prompt: str = "",
|
| 207 |
duration: float = 8.0, cfg_strength: float = 4.5,
|
| 208 |
-
seed: int = 42) -> Tuple[Optional[str], str]:
|
| 209 |
"""Generate audio from video and text prompt"""
|
| 210 |
try:
|
| 211 |
# Validation checks
|
|
@@ -216,13 +220,30 @@ class AudioFoleyModel:
|
|
| 216 |
return None, "❌ Model not loaded. Please load the model first."
|
| 217 |
|
| 218 |
if video_file is None:
|
| 219 |
-
return None, "❌
|
| 220 |
-
|
| 221 |
-
if not prompt.strip():
|
| 222 |
-
return None, "❌ Please provide a text prompt describing the desired audio."
|
| 223 |
|
| 224 |
log.info(f'🎬 Processing video: {video_file}')
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
# Load and process video
|
| 228 |
try:
|
|
@@ -247,7 +268,12 @@ class AudioFoleyModel:
|
|
| 247 |
# Update model sequence configuration
|
| 248 |
try:
|
| 249 |
self.model.seq_cfg.duration = duration_sec
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
self.net.update_seq_lengths(
|
| 252 |
self.model.seq_cfg.latent_seq_len,
|
| 253 |
self.model.seq_cfg.clip_seq_len,
|
|
@@ -267,7 +293,7 @@ class AudioFoleyModel:
|
|
| 267 |
clip_frames,
|
| 268 |
sync_frames,
|
| 269 |
[prompt],
|
| 270 |
-
|
| 271 |
negative_text=[negative_prompt] if negative_prompt.strip() else None,
|
| 272 |
feature_utils=self.feature_utils,
|
| 273 |
net=self.net,
|
|
@@ -352,7 +378,7 @@ def initialize_model():
|
|
| 352 |
else:
|
| 353 |
return "✅ 模型已加载"
|
| 354 |
|
| 355 |
-
def generate_audio_interface(video_file, prompt, duration, cfg_strength):
|
| 356 |
"""Interface function for generating audio"""
|
| 357 |
global audio_model, model_loading_status
|
| 358 |
|
|
@@ -365,7 +391,7 @@ def generate_audio_interface(video_file, prompt, duration, cfg_strength):
|
|
| 365 |
negative_prompt = "" # Simplified interface
|
| 366 |
|
| 367 |
audio_path, message = audio_model.generate_audio(
|
| 368 |
-
video_file, prompt, negative_prompt, duration, cfg_strength, seed
|
| 369 |
)
|
| 370 |
return audio_path, message
|
| 371 |
|
|
@@ -379,7 +405,11 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
|
|
| 379 |
gr.Markdown("""
|
| 380 |
# 🎵 hf_AC Audio Foley Generator
|
| 381 |
|
| 382 |
-
基于AI
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
**注意**: 模型会在启动时自动加载,首次使用需要下载约3GB的模型文件。
|
| 385 |
""")
|
|
@@ -401,14 +431,21 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
|
|
| 401 |
with gr.Row():
|
| 402 |
with gr.Column():
|
| 403 |
video_input = gr.Video(
|
| 404 |
-
label="上传视频",
|
| 405 |
format="mp4"
|
| 406 |
)
|
| 407 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
prompt_input = gr.Textbox(
|
| 409 |
-
label="音频描述",
|
| 410 |
-
placeholder="
|
| 411 |
-
lines=
|
|
|
|
| 412 |
)
|
| 413 |
|
| 414 |
with gr.Row():
|
|
@@ -452,7 +489,7 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
|
|
| 452 |
generate_btn.click(
|
| 453 |
fn=generate_audio_interface,
|
| 454 |
inputs=[
|
| 455 |
-
video_input, prompt_input, duration_slider, cfg_strength_slider
|
| 456 |
],
|
| 457 |
outputs=[audio_output, generation_status]
|
| 458 |
)
|
|
|
|
| 31 |
### 💡 使用技巧
|
| 32 |
|
| 33 |
1. **视频质量**: 使用清晰、光线良好的视频
|
| 34 |
+
2. **三种模式**:
|
| 35 |
+
- 纯视频:让AI根据画面自动生成音频
|
| 36 |
+
- 视频+文本:指定想要的音频类型
|
| 37 |
+
- 视频+音频+文本:使用参考音频的音色风格
|
| 38 |
+
3. **时长**: 建议1-15秒效果最佳
|
| 39 |
4. **CFG强度**: 数值越高越贴合提示词,但可能降低质量
|
| 40 |
+
5. **参考音频**: 可提供音色、节奏、风格参考
|
| 41 |
"""
|
| 42 |
|
| 43 |
# Check and install missing dependencies
|
|
|
|
| 209 |
|
| 210 |
def generate_audio(self, video_file, prompt: str, negative_prompt: str = "",
|
| 211 |
duration: float = 8.0, cfg_strength: float = 4.5,
|
| 212 |
+
seed: int = 42, reference_audio: str = None) -> Tuple[Optional[str], str]:
|
| 213 |
"""Generate audio from video and text prompt"""
|
| 214 |
try:
|
| 215 |
# Validation checks
|
|
|
|
| 220 |
return None, "❌ Model not loaded. Please load the model first."
|
| 221 |
|
| 222 |
if video_file is None:
|
| 223 |
+
return None, "❌ 请上传视频文件。"
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
log.info(f'🎬 Processing video: {video_file}')
|
| 226 |
+
if prompt.strip():
|
| 227 |
+
log.info(f'📝 Prompt: "{prompt}"')
|
| 228 |
+
else:
|
| 229 |
+
log.info('📝 No prompt provided - will generate based on video content')
|
| 230 |
+
if reference_audio:
|
| 231 |
+
log.info(f'🎵 Reference audio: {reference_audio}')
|
| 232 |
+
|
| 233 |
+
# Load and process reference audio if provided
|
| 234 |
+
reference_audio_tensor = None
|
| 235 |
+
if reference_audio and os.path.exists(reference_audio):
|
| 236 |
+
try:
|
| 237 |
+
# Use the same Audio class from hf_AC
|
| 238 |
+
SAMPLE_RATE = 44100
|
| 239 |
+
audio_processor = Audio([reference_audio], SAMPLE_RATE)
|
| 240 |
+
audio_list = audio_processor.load_audio()
|
| 241 |
+
if audio_list:
|
| 242 |
+
reference_audio_tensor = audio_list[0]
|
| 243 |
+
log.info(f'🎵 Reference audio loaded: {reference_audio_tensor.shape}')
|
| 244 |
+
except Exception as e:
|
| 245 |
+
log.warning(f"Failed to load reference audio: {e}")
|
| 246 |
+
reference_audio_tensor = None
|
| 247 |
|
| 248 |
# Load and process video
|
| 249 |
try:
|
|
|
|
| 268 |
# Update model sequence configuration
|
| 269 |
try:
|
| 270 |
self.model.seq_cfg.duration = duration_sec
|
| 271 |
+
# Set audio sample count based on reference audio or default
|
| 272 |
+
if reference_audio_tensor is not None:
|
| 273 |
+
self.model.seq_cfg.audio_num_sample = reference_audio_tensor.shape[0]
|
| 274 |
+
else:
|
| 275 |
+
self.model.seq_cfg.audio_num_sample = 89088 # Default for 44kHz
|
| 276 |
+
|
| 277 |
self.net.update_seq_lengths(
|
| 278 |
self.model.seq_cfg.latent_seq_len,
|
| 279 |
self.model.seq_cfg.clip_seq_len,
|
|
|
|
| 293 |
clip_frames,
|
| 294 |
sync_frames,
|
| 295 |
[prompt],
|
| 296 |
+
reference_audio_tensor, # Use reference audio if provided
|
| 297 |
negative_text=[negative_prompt] if negative_prompt.strip() else None,
|
| 298 |
feature_utils=self.feature_utils,
|
| 299 |
net=self.net,
|
|
|
|
| 378 |
else:
|
| 379 |
return "✅ 模型已加载"
|
| 380 |
|
| 381 |
+
def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength):
|
| 382 |
"""Interface function for generating audio"""
|
| 383 |
global audio_model, model_loading_status
|
| 384 |
|
|
|
|
| 391 |
negative_prompt = "" # Simplified interface
|
| 392 |
|
| 393 |
audio_path, message = audio_model.generate_audio(
|
| 394 |
+
video_file, prompt, negative_prompt, duration, cfg_strength, seed, audio_file
|
| 395 |
)
|
| 396 |
return audio_path, message
|
| 397 |
|
|
|
|
| 405 |
gr.Markdown("""
|
| 406 |
# 🎵 hf_AC Audio Foley Generator
|
| 407 |
|
| 408 |
+
基于AI的视频音频生成工具,支持三种生成模式:
|
| 409 |
+
|
| 410 |
+
1. **纯视频模式**: 仅上传视频,根据视觉内容自动生成匹配音频
|
| 411 |
+
2. **视频+文本模式**: 上传视频 + 文本描述,生成指定类型的音频
|
| 412 |
+
3. **视频+音频+文本模式**: 上传视频 + 参考音频 + 文本,生成具有特定音色风格的音频
|
| 413 |
|
| 414 |
**注意**: 模型会在启动时自动加载,首次使用需要下载约3GB的模型文件。
|
| 415 |
""")
|
|
|
|
| 431 |
with gr.Row():
|
| 432 |
with gr.Column():
|
| 433 |
video_input = gr.Video(
|
| 434 |
+
label="上传视频 (必需)",
|
| 435 |
format="mp4"
|
| 436 |
)
|
| 437 |
|
| 438 |
+
audio_input = gr.Audio(
|
| 439 |
+
label="参考音频 (可选) - 提供音色/风格参考",
|
| 440 |
+
type="filepath",
|
| 441 |
+
sources=["upload"]
|
| 442 |
+
)
|
| 443 |
+
|
| 444 |
prompt_input = gr.Textbox(
|
| 445 |
+
label="音频描述 (可选) - 留空则根据视频内容自动生成",
|
| 446 |
+
placeholder="可选:描述想要的音频类型 (例如: '脚步声', '鸟叫声', '汽车引擎声')",
|
| 447 |
+
lines=2,
|
| 448 |
+
value=""
|
| 449 |
)
|
| 450 |
|
| 451 |
with gr.Row():
|
|
|
|
| 489 |
generate_btn.click(
|
| 490 |
fn=generate_audio_interface,
|
| 491 |
inputs=[
|
| 492 |
+
video_input, audio_input, prompt_input, duration_slider, cfg_strength_slider
|
| 493 |
],
|
| 494 |
outputs=[audio_output, generation_status]
|
| 495 |
)
|