learnmlf committed on
Commit
ebc0a66
·
1 Parent(s): d4263af

Add audio input support and make prompt optional

Browse files
Files changed (1) hide show
  1. app.py +55 -18
app.py CHANGED
@@ -31,9 +31,13 @@ USAGE_TIPS = """
31
  ### 💡 使用技巧
32
 
33
  1. **视频质量**: 使用清晰、光线良好的视频
34
- 2. **提示词**: 具体描述想要的音频类型
35
- 3. **时长**: 建议1-30秒效果最佳
 
 
 
36
  4. **CFG强度**: 数值越高越贴合提示词,但可能降低质量
 
37
  """
38
 
39
  # Check and install missing dependencies
@@ -205,7 +209,7 @@ class AudioFoleyModel:
205
 
206
  def generate_audio(self, video_file, prompt: str, negative_prompt: str = "",
207
  duration: float = 8.0, cfg_strength: float = 4.5,
208
- seed: int = 42) -> Tuple[Optional[str], str]:
209
  """Generate audio from video and text prompt"""
210
  try:
211
  # Validation checks
@@ -216,13 +220,30 @@ class AudioFoleyModel:
216
  return None, "❌ Model not loaded. Please load the model first."
217
 
218
  if video_file is None:
219
- return None, "❌ Please upload a video file."
220
-
221
- if not prompt.strip():
222
- return None, "❌ Please provide a text prompt describing the desired audio."
223
 
224
  log.info(f'🎬 Processing video: {video_file}')
225
- log.info(f'📝 Prompt: "{prompt}"')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  # Load and process video
228
  try:
@@ -247,7 +268,12 @@ class AudioFoleyModel:
247
  # Update model sequence configuration
248
  try:
249
  self.model.seq_cfg.duration = duration_sec
250
- self.model.seq_cfg.audio_num_sample = 89088 # Default for 44kHz
 
 
 
 
 
251
  self.net.update_seq_lengths(
252
  self.model.seq_cfg.latent_seq_len,
253
  self.model.seq_cfg.clip_seq_len,
@@ -267,7 +293,7 @@ class AudioFoleyModel:
267
  clip_frames,
268
  sync_frames,
269
  [prompt],
270
- None, # No reference audio
271
  negative_text=[negative_prompt] if negative_prompt.strip() else None,
272
  feature_utils=self.feature_utils,
273
  net=self.net,
@@ -352,7 +378,7 @@ def initialize_model():
352
  else:
353
  return "✅ 模型已加载"
354
 
355
- def generate_audio_interface(video_file, prompt, duration, cfg_strength):
356
  """Interface function for generating audio"""
357
  global audio_model, model_loading_status
358
 
@@ -365,7 +391,7 @@ def generate_audio_interface(video_file, prompt, duration, cfg_strength):
365
  negative_prompt = "" # Simplified interface
366
 
367
  audio_path, message = audio_model.generate_audio(
368
- video_file, prompt, negative_prompt, duration, cfg_strength, seed
369
  )
370
  return audio_path, message
371
 
@@ -379,7 +405,11 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
379
  gr.Markdown("""
380
  # 🎵 hf_AC Audio Foley Generator
381
 
382
- 基于AI的视频音频生成工具。上传视频并提供文本描述,模型将生成匹配的音频内容。
 
 
 
 
383
 
384
  **注意**: 模型会在启动时自动加载,首次使用需要下载约3GB的模型文件。
385
  """)
@@ -401,14 +431,21 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
401
  with gr.Row():
402
  with gr.Column():
403
  video_input = gr.Video(
404
- label="上传视频",
405
  format="mp4"
406
  )
407
 
 
 
 
 
 
 
408
  prompt_input = gr.Textbox(
409
- label="音频描述",
410
- placeholder="描述你想要生成的音频 (例如: '脚步声', '鸟叫声', '汽车引擎声')",
411
- lines=3
 
412
  )
413
 
414
  with gr.Row():
@@ -452,7 +489,7 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
452
  generate_btn.click(
453
  fn=generate_audio_interface,
454
  inputs=[
455
- video_input, prompt_input, duration_slider, cfg_strength_slider
456
  ],
457
  outputs=[audio_output, generation_status]
458
  )
 
31
  ### 💡 使用技巧
32
 
33
  1. **视频质量**: 使用清晰、光线良好的视频
34
+ 2. **三种模式**:
35
+ - 纯视频:让AI根据画面自动生成音频
36
+ - 视频+文本:指定想要的音频类型
37
+ - 视频+音频+文本:使用参考音频的音色风格
38
+ 3. **时长**: 建议1-15秒效果最佳
39
  4. **CFG强度**: 数值越高越贴合提示词,但可能降低质量
40
+ 5. **参考音频**: 可提供音色、节奏、风格参考
41
  """
42
 
43
  # Check and install missing dependencies
 
209
 
210
  def generate_audio(self, video_file, prompt: str, negative_prompt: str = "",
211
  duration: float = 8.0, cfg_strength: float = 4.5,
212
+ seed: int = 42, reference_audio: str = None) -> Tuple[Optional[str], str]:
213
  """Generate audio from video and text prompt"""
214
  try:
215
  # Validation checks
 
220
  return None, "❌ Model not loaded. Please load the model first."
221
 
222
  if video_file is None:
223
+ return None, "❌ 请上传视频文件。"
 
 
 
224
 
225
  log.info(f'🎬 Processing video: {video_file}')
226
+ if prompt.strip():
227
+ log.info(f'📝 Prompt: "{prompt}"')
228
+ else:
229
+ log.info('📝 No prompt provided - will generate based on video content')
230
+ if reference_audio:
231
+ log.info(f'🎵 Reference audio: {reference_audio}')
232
+
233
+ # Load and process reference audio if provided
234
+ reference_audio_tensor = None
235
+ if reference_audio and os.path.exists(reference_audio):
236
+ try:
237
+ # Use the same Audio class from hf_AC
238
+ SAMPLE_RATE = 44100
239
+ audio_processor = Audio([reference_audio], SAMPLE_RATE)
240
+ audio_list = audio_processor.load_audio()
241
+ if audio_list:
242
+ reference_audio_tensor = audio_list[0]
243
+ log.info(f'🎵 Reference audio loaded: {reference_audio_tensor.shape}')
244
+ except Exception as e:
245
+ log.warning(f"Failed to load reference audio: {e}")
246
+ reference_audio_tensor = None
247
 
248
  # Load and process video
249
  try:
 
268
  # Update model sequence configuration
269
  try:
270
  self.model.seq_cfg.duration = duration_sec
271
+ # Set audio sample count based on reference audio or default
272
+ if reference_audio_tensor is not None:
273
+ self.model.seq_cfg.audio_num_sample = reference_audio_tensor.shape[0]
274
+ else:
275
+ self.model.seq_cfg.audio_num_sample = 89088 # Default for 44kHz
276
+
277
  self.net.update_seq_lengths(
278
  self.model.seq_cfg.latent_seq_len,
279
  self.model.seq_cfg.clip_seq_len,
 
293
  clip_frames,
294
  sync_frames,
295
  [prompt],
296
+ reference_audio_tensor, # Use reference audio if provided
297
  negative_text=[negative_prompt] if negative_prompt.strip() else None,
298
  feature_utils=self.feature_utils,
299
  net=self.net,
 
378
  else:
379
  return "✅ 模型已加载"
380
 
381
+ def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength):
382
  """Interface function for generating audio"""
383
  global audio_model, model_loading_status
384
 
 
391
  negative_prompt = "" # Simplified interface
392
 
393
  audio_path, message = audio_model.generate_audio(
394
+ video_file, prompt, negative_prompt, duration, cfg_strength, seed, audio_file
395
  )
396
  return audio_path, message
397
 
 
405
  gr.Markdown("""
406
  # 🎵 hf_AC Audio Foley Generator
407
 
408
+ 基于AI的视频音频生成工具,支持三种生成模式:
409
+
410
+ 1. **纯视频模式**: 仅上传视频,根据视觉内容自动生成匹配音频
411
+ 2. **视频+文本模式**: 上传视频 + 文本描述,生成指定类型的音频
412
+ 3. **视频+音频+文本模式**: 上传视频 + 参考音频 + 文本,生成具有特定音色风格的音频
413
 
414
  **注意**: 模型会在启动时自动加载,首次使用需要下载约3GB的模型文件。
415
  """)
 
431
  with gr.Row():
432
  with gr.Column():
433
  video_input = gr.Video(
434
+ label="上传视频 (必需)",
435
  format="mp4"
436
  )
437
 
438
+ audio_input = gr.Audio(
439
+ label="参考音频 (可选) - 提供音色/风格参考",
440
+ type="filepath",
441
+ sources=["upload"]
442
+ )
443
+
444
  prompt_input = gr.Textbox(
445
+ label="音频描述 (可选) - 留空则根据视频内容自动生成",
446
+ placeholder="可选:描述想要的音频类型 (例如: '脚步声', '鸟叫声', '汽车引擎声')",
447
+ lines=2,
448
+ value=""
449
  )
450
 
451
  with gr.Row():
 
489
  generate_btn.click(
490
  fn=generate_audio_interface,
491
  inputs=[
492
+ video_input, audio_input, prompt_input, duration_slider, cfg_strength_slider
493
  ],
494
  outputs=[audio_output, generation_status]
495
  )