learnmlf committed on
Commit
d4263af
·
1 Parent(s): 2c1dff6

Fix audio saving with torchcodec dependency

Browse files

- Add torchcodec to requirements.txt for proper audio encoding
- Add soundfile as backup audio saving library
- Implement fallback audio saving methods:
  1. torchaudio.save (primary)
  2. soundfile.write (backup, used when torchaudio.save raises)
  3. scipy.io.wavfile.write (last resort, used when soundfile cannot be imported)
- Improve error handling for audio saving failures
- Ensure robust audio output regardless of codec availability

Files changed (2) hide show
  1. app.py +19 -2
  2. requirements.txt +2 -0
app.py CHANGED
@@ -291,8 +291,25 @@ class AudioFoleyModel:
291
  output_filename = f"generated_audio_{timestamp}.wav"
292
  permanent_path = f"/tmp/{output_filename}"
293
 
294
- # Save audio file
295
- torchaudio.save(permanent_path, audio, self.model.seq_cfg.sampling_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
  # Verify file was created
298
  if not os.path.exists(permanent_path):
 
291
  output_filename = f"generated_audio_{timestamp}.wav"
292
  permanent_path = f"/tmp/{output_filename}"
293
 
294
+ # Save audio file with fallback methods
295
+ try:
296
+ # Try with torchaudio first
297
+ torchaudio.save(permanent_path, audio, self.model.seq_cfg.sampling_rate)
298
+ except Exception as e:
299
+ log.warning(f"torchaudio.save failed: {e}, trying alternative method...")
300
+ try:
301
+ # Fallback: use soundfile if available
302
+ import soundfile as sf
303
+ sf.write(permanent_path, audio.numpy().T, self.model.seq_cfg.sampling_rate)
304
+ except ImportError:
305
+ try:
306
+ # Fallback: use scipy.io.wavfile
307
+ from scipy.io.wavfile import write
308
+ # Convert to int16 for wav format
309
+ audio_int16 = (audio * 32767).clamp(-32768, 32767).to(torch.int16)
310
+ write(permanent_path, self.model.seq_cfg.sampling_rate, audio_int16.numpy().T)
311
+ except Exception as e2:
312
+ return None, f"❌ 音频保存失败: {str(e2)}"
313
 
314
  # Verify file was created
315
  if not os.path.exists(permanent_path):
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  torch>=2.0.0
2
  torchvision
3
  torchaudio
 
4
  gradio>=4.0.0
5
  huggingface_hub>=0.26.0
6
  numpy>=1.21.0,<2.1
@@ -11,6 +12,7 @@ tqdm>=4.66.1
11
  einops>=0.6.0
12
  requests
13
  librosa>=0.8.1
 
14
  av>=14.0.1
15
  timm>=1.0.12
16
  open_clip_torch>=2.29.0
 
1
  torch>=2.0.0
2
  torchvision
3
  torchaudio
4
+ torchcodec
5
  gradio>=4.0.0
6
  huggingface_hub>=0.26.0
7
  numpy>=1.21.0,<2.1
 
12
  einops>=0.6.0
13
  requests
14
  librosa>=0.8.1
15
+ soundfile>=0.12.1
16
  av>=14.0.1
17
  timm>=1.0.12
18
  open_clip_torch>=2.29.0