Upload 4 files
- Echox_copy_stream.py +68 -8
- app.py +3 -0
- requirements.txt +17 -1
- tts_wrapper.py +57 -0
Echox_copy_stream.py
CHANGED

@@ -12,6 +12,7 @@ import librosa
 from text_to_speech import *
 import torch.nn.functional as F
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from tts_wrapper import CosyVoice2TTS
 
 from transformers import logging as hf_logging
 hf_logging.set_verbosity_error()

@@ -36,8 +37,8 @@ def load_model(args, device):
         quantization_config=quantization_config,
         token=hf_token,
     ).eval().to(device)
-    for module in model.model.audio_tower:
-        module = module.to(device)
+    # for module in model.model.audio_tower:
+    #     module = module.to(device)
 
    if args.peft_model_id:
        lora_config = PeftConfig.from_pretrained(args.peft_model_id)

@@ -96,10 +97,43 @@ class EchoxAssistant():
         self.base_model_path = "FreedomIntelligence/EchoX-8B"
         self.peft_model_id = None
         self.audio_tower = "openai/whisper-large-v3"
+        self.cosyvoice_model_path = "FunAudioLLM/CosyVoice2-0.5B"
+        self.cosyvoice_ref_audio = "show_case/ref.wav"
+        self.cosyvoice_ref_text = "It's always a good idea to research and compare prices from different sources to get a more accurate idea of the average price of a used car in the United States for different years."
+
         self.args = BasicSetting()
         self.device = "cuda"
-        self.vocoder, self.voc_cfg = load_speech_model(self.device)
-        self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+
+        if self.args.cosyvoice_model_path:
+            print(f"[EchoxAssistant] Initializing CosyVoice2 TTS from {self.args.cosyvoice_model_path} ...")
+            try:
+                self.cosyvoice_tts = CosyVoice2TTS(model_dir=self.args.cosyvoice_model_path, device=self.device)
+                # reference prompt for zero-shot voice cloning
+                self.cosyvoice_ref_audio = self.args.cosyvoice_ref_audio
+                self.cosyvoice_ref_text = self.args.cosyvoice_ref_text or ""
+
+                self.vocoder = None
+                self.voc_cfg = None
+                self.audio_executor = ThreadPoolExecutor(max_workers=2)
+                print("[EchoxAssistant] CosyVoice2 TTS ready.")
+            except Exception as e:
+                print(f"[EchoxAssistant] Failed to init CosyVoice2TTS: {e}. Falling back to original vocoder.")
+                self.cosyvoice_tts = None
+                self.vocoder, self.voc_cfg = load_speech_model(self.device)
+                self.audio_executor = ThreadPoolExecutor(max_workers=2)
+        else:
+            self.vocoder, self.voc_cfg = load_speech_model(self.device)
+            self.cosyvoice_tts = None
+            self.cosyvoice_ref_audio = None
+            self.cosyvoice_ref_text = ""
+            self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+            self.audio_executor = ThreadPoolExecutor(max_workers=2)
+
+        if not hasattr(self, "model"):
+            self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+
+        # self.vocoder, self.voc_cfg = load_speech_model(self.device)
+        # self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
         self.audio_executor = ThreadPoolExecutor(max_workers=2)
         # self.specAug = SpecAugmentTransform()
         # special_token

@@ -375,8 +409,22 @@ class EchoxAssistant():
                     accumulated_hidden_states[segment_start_idx:segment_end_idx], dim=0
                 ).unsqueeze(0)
 
-                future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
-                audio_futures.append(future)
+                if self.cosyvoice_tts:
+                    segment_token_ids = accumulated_tokens[segment_start_idx:segment_end_idx]
+                    segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
+                    future = self.audio_executor.submit(
+                        lambda txt=segment_text: self.cosyvoice_tts.synthesize(
+                            text=txt,
+                            prompt_text=self.cosyvoice_ref_text,
+                            prompt_speech_path=self.cosyvoice_ref_audio,
+                            output_path=None,
+                            stream=False
+                        )
+                    )
+                    audio_futures.append(future)
+                else:
+                    future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
+                    audio_futures.append(future)
 
                 segment_start_idx = segment_end_idx
 
@@ -384,11 +432,23 @@ class EchoxAssistant():
             current_attention_mask = torch.ones_like(next_token)
 
             if segment_start_idx < len(accumulated_hidden_states):
-                print(f"Processing final segment from {segment_start_idx} to {len(accumulated_hidden_states)}")
                 segment_hidden_states = torch.stack(
                     accumulated_hidden_states[segment_start_idx:], dim=0
                 ).unsqueeze(0)
-                future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
+                if self.cosyvoice_tts:
+                    segment_token_ids = accumulated_tokens[segment_start_idx:]
+                    segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
+                    future = self.audio_executor.submit(
+                        lambda txt=segment_text: self.cosyvoice_tts.synthesize(
+                            text=txt,
+                            prompt_text=self.cosyvoice_ref_text,
+                            prompt_speech_path=self.cosyvoice_ref_audio,
+                            output_path=None,
+                            stream=False
+                        )
+                    )
+                else:
+                    future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
                 audio_futures.append(future)
 
             for future in audio_futures:
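Note on the executor calls above: each submission binds the segment text through a lambda default argument (lambda txt=segment_text: ...), so every queued job keeps a snapshot of its own segment instead of closing over a variable that the loop will overwrite before the worker runs. A minimal, self-contained sketch of that pattern; the synthesize stub and segment strings are hypothetical stand-ins:

from concurrent.futures import ThreadPoolExecutor

def synthesize(text):
    # hypothetical stand-in for CosyVoice2TTS.synthesize
    return f"audio<{text}>"

executor = ThreadPoolExecutor(max_workers=2)
futures = []
for segment_text in ["seg one", "seg two", "seg three"]:
    # the default argument captures the current value at submit time;
    # a bare `lambda: synthesize(segment_text)` would read the variable
    # only when the worker runs, racing with the loop
    futures.append(executor.submit(lambda txt=segment_text: synthesize(txt)))

print([f.result() for f in futures])
# ['audio<seg one>', 'audio<seg two>', 'audio<seg three>']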
app.py
CHANGED

@@ -10,6 +10,7 @@ import torch
 import librosa
 import soundfile as sf
 import tempfile
+import numpy as np
 import spaces  # ZeroGPU requirement
 
 # import your modules

@@ -98,6 +99,8 @@ def process_audio_text(text, audio):
 
         if audio_data is not None:
             sr, audio_array = audio_data
+            if isinstance(audio_array, np.ndarray) and audio_array.ndim == 2 and audio_array.shape[0] == 1:
+                audio_array = audio_array.squeeze(0)
             yield (sr, audio_array), accumulated_text
         else:
             yield None, accumulated_text
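The added guard normalizes the audio shape for Gradio, whose audio component expects a 1-D array (or samples-first 2-D); the CosyVoice path returns a (1, samples) array (see tts_wrapper.py below), which would otherwise likely be misread as a one-sample multichannel clip. A small sketch of the same normalization, assuming a mono (1, N) float array:

import numpy as np

def to_gradio_audio(sr, audio_array):
    # collapse a channel-first (1, N) array to the (N,) shape Gradio expects
    if isinstance(audio_array, np.ndarray) and audio_array.ndim == 2 and audio_array.shape[0] == 1:
        audio_array = audio_array.squeeze(0)
    return sr, audio_array

sr, audio = to_gradio_audio(24000, np.zeros((1, 24000), dtype=np.float32))
print(audio.shape)  # (24000,)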
requirements.txt
CHANGED

@@ -6,4 +6,20 @@ sentencepiece==0.2.0
 soundfile==0.12.1
 torch==2.2.0
 tqdm==4.66.5
-transformers==4.49.0
+transformers==4.49.0
+modelscope
+funasr
+hyperpyyaml
+onnxruntime-gpu
+inflect
+jieba
+pypinyin
+g2p_en
+librosa
+soundfile
+matcha-tts
+openai-whisper
+wetext
+pyarrow
+pyworld
+torchcodec
tts_wrapper.py
ADDED

@@ -0,0 +1,57 @@
+import sys
+import os
+import torch
+import torchaudio
+
+sys.path.insert(0, './CosyVoice')
+
+from cosyvoice.cli.cosyvoice import CosyVoice2
+from cosyvoice.utils.file_utils import load_wav
+
+class CosyVoice2TTS:
+    def __init__(self, model_dir, device="cuda"):
+        print(f"[TTS] Loading CosyVoice2 model from {model_dir}...")
+
+        # initialize the model
+        self.model = CosyVoice2(
+            model_dir,
+            load_jit=False,
+            load_trt=False,
+            load_vllm=False,
+            fp16=True
+        )
+        print("[TTS] CosyVoice2 Model loaded successfully.")
+
+    def synthesize(self, text, prompt_text, prompt_speech_path, output_path=None, stream=False):
+        if not text:
+            return None, None
+
+        # load the reference audio
+        prompt_speech_16k = load_wav(prompt_speech_path, 16000)
+
+        # run zero-shot inference
+        output = self.model.inference_zero_shot(
+            tts_text=text,
+            prompt_text=prompt_text,
+            prompt_speech_16k=prompt_speech_16k,
+            stream=stream
+        )
+
+        final_audio = []
+        # get the sample rate
+        sample_rate = getattr(self.model, 'sample_rate', 24000)
+
+        for i in output:
+            final_audio.append(i['tts_speech'])
+
+        if not final_audio:
+            return None, None
+
+        full_audio_tensor = torch.cat(final_audio, dim=1)
+
+        if output_path:
+            os.makedirs(os.path.dirname(output_path), exist_ok=True)
+            torchaudio.save(output_path, full_audio_tensor, sample_rate)
+            print(f"[TTS] Audio saved to {output_path}")
+
+        return sample_rate, full_audio_tensor.cpu().numpy()
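For reference, a usage sketch of the wrapper on its own, assuming the CosyVoice repo is checked out at ./CosyVoice, the model weights are available at the configured path, and show_case/ref.wav plus its transcript serve as the zero-shot prompt (these mirror the defaults wired into EchoxAssistant above):

import soundfile as sf
from tts_wrapper import CosyVoice2TTS

# transcript of the reference clip, as configured in EchoxAssistant
REF_TEXT = ("It's always a good idea to research and compare prices from "
            "different sources to get a more accurate idea of the average "
            "price of a used car in the United States for different years.")

tts = CosyVoice2TTS(model_dir="FunAudioLLM/CosyVoice2-0.5B", device="cuda")
sr, audio = tts.synthesize(
    text="Hello from EchoX.",
    prompt_text=REF_TEXT,
    prompt_speech_path="show_case/ref.wav",
    output_path=None,
    stream=False,
)
if audio is not None:
    # synthesize returns (sample_rate, (1, samples) array); squeeze to mono
    sf.write("cloned.wav", audio.squeeze(0), sr)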