import sys
import os
import torch
import torchaudio

# Make the vendored CosyVoice checkout importable; this must run before the
# `cosyvoice` imports below.
sys.path.insert(0, './CosyVoice')
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav


class CosyVoice2TTS:
    """Thin wrapper around a CosyVoice2 model for zero-shot speech synthesis."""

    def __init__(self, model_dir, device="cuda"):
        """Load the CosyVoice2 model found in ``model_dir``.

        Args:
            model_dir: Path handed straight to ``CosyVoice2``.
            device: Accepted for interface compatibility; it is not currently
                forwarded to the model or used anywhere in this class.
        """
        print(f"[TTS] Loading CosyVoice2 model from {model_dir}...")
        # Initialise the model with the optional JIT / TensorRT / vLLM
        # backends disabled and fp16 enabled.
        self.model = CosyVoice2(
            model_dir,
            load_jit=False,
            load_trt=False,
            load_vllm=False,
            fp16=True,
        )
        print("[TTS] CosyVoice2 Model loaded successfully.")

    def synthesize(self, text, prompt_text, prompt_speech_path,
                   output_path=None, stream=False):
        """Synthesize ``text`` in the voice of a reference recording.

        Args:
            text: Text to speak. A falsy value (``None`` or ``""``) short-circuits
                and nothing is synthesized.
            prompt_text: Text passed as ``prompt_text`` to the model alongside
                the reference audio.
            prompt_speech_path: Path of the reference audio; loaded via
                ``load_wav`` with a target rate of 16000.
            output_path: Optional file path. When given, the concatenated audio
                is written there with ``torchaudio.save``.
            stream: Forwarded to ``inference_zero_shot``. All yielded chunks are
                still collected and concatenated before returning.

        Returns:
            ``(sample_rate, audio)`` where ``audio`` is a NumPy array, or
            ``(None, None)`` when ``text`` is falsy or the model yields no
            chunks.
        """
        if not text:
            return None, None

        # Load the reference (prompt) audio.
        prompt_speech_16k = load_wav(prompt_speech_path, 16000)

        # Run zero-shot inference; the result is iterated chunk by chunk.
        output = self.model.inference_zero_shot(
            tts_text=text,
            prompt_text=prompt_text,
            prompt_speech_16k=prompt_speech_16k,
            stream=stream,
        )

        # Use the model's sample rate, falling back to 24000 when the model
        # object does not expose a `sample_rate` attribute.
        sample_rate = getattr(self.model, 'sample_rate', 24000)

        chunks = [piece['tts_speech'] for piece in output]
        if not chunks:
            return None, None

        # Chunks are joined along dim 1 (presumably the sample axis of a
        # (channels, samples) tensor -- TODO confirm). Moving to CPU is a no-op
        # when the tensor already lives there and keeps both the save and the
        # NumPy conversion safe if it does not.
        full_audio_tensor = torch.cat(chunks, dim=1).cpu()

        if output_path:
            # os.path.dirname() returns "" for a bare filename, and
            # os.makedirs("") raises FileNotFoundError, so only create the
            # directory when the path actually has one.
            out_dir = os.path.dirname(output_path)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            torchaudio.save(output_path, full_audio_tensor, sample_rate)
            print(f"[TTS] Audio saved to {output_path}")

        return sample_rate, full_audio_tensor.numpy()