tzzte committed
Commit 4ff5a32 · verified · 1 Parent(s): 7819b34

Upload 4 files

Files changed (4)
  1. Echox_copy_stream.py +68 -8
  2. app.py +3 -0
  3. requirements.txt +17 -1
  4. tts_wrapper.py +57 -0
Echox_copy_stream.py CHANGED

@@ -12,6 +12,7 @@ import librosa
 from text_to_speech import *
 import torch.nn.functional as F
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from tts_wrapper import CosyVoice2TTS
 
 from transformers import logging as hf_logging
 hf_logging.set_verbosity_error()
@@ -36,8 +37,8 @@ def load_model(args, device):
         quantization_config=quantization_config,
         token=hf_token,
     ).eval().to(device)
-    for module in model.model.audio_tower:
-        module = module.to(device)
+    # for module in model.model.audio_tower:
+    #     module = module.to(device)
 
     if args.peft_model_id:
         lora_config = PeftConfig.from_pretrained(args.peft_model_id)
@@ -96,10 +97,43 @@ class EchoxAssistant():
         self.base_model_path = "FreedomIntelligence/EchoX-8B"
         self.peft_model_id = None
         self.audio_tower = "openai/whisper-large-v3"
+        self.cosyvoice_model_path = "FunAudioLLM/CosyVoice2-0.5B"
+        self.cosyvoice_ref_audio = "show_case/ref.wav"
+        self.cosyvoice_ref_text = "It's always a good idea to research and compare prices from different sources to get a more accurate idea of the average price of a used car in the United States for different years."
+
         self.args = BasicSetting()
         self.device = "cuda"
-        self.vocoder, self.voc_cfg= load_speech_model(self.device)
-        self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+
+        if self.args.cosyvoice_model_path:
+            print(f"[EchoxAssistant] Initializing CosyVoice2 TTS from {self.args.cosyvoice_model_path} ...")
+            try:
+                self.cosyvoice_tts = CosyVoice2TTS(model_dir=self.args.cosyvoice_model_path, device=self.device)
+                # reference prompt for zero-shot voice cloning
+                self.cosyvoice_ref_audio = self.args.cosyvoice_ref_audio
+                self.cosyvoice_ref_text = self.args.cosyvoice_ref_text or ""
+
+                self.vocoder = None
+                self.voc_cfg = None
+                self.audio_executor = ThreadPoolExecutor(max_workers=2)
+                print("[EchoxAssistant] CosyVoice2 TTS ready.")
+            except Exception as e:
+                print(f"[EchoxAssistant] Failed to init CosyVoice2TTS: {e}. Falling back to original vocoder.")
+                self.cosyvoice_tts = None
+                self.vocoder, self.voc_cfg = load_speech_model(self.device)
+                self.audio_executor = ThreadPoolExecutor(max_workers=2)
+        else:
+            self.vocoder, self.voc_cfg = load_speech_model(self.device)
+            self.cosyvoice_tts = None
+            self.cosyvoice_ref_audio = None
+            self.cosyvoice_ref_text = ""
+            self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+            self.audio_executor = ThreadPoolExecutor(max_workers=2)
+
+        if not hasattr(self, "model"):
+            self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+
+        # self.vocoder, self.voc_cfg= load_speech_model(self.device)
+        # self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
         self.audio_executor = ThreadPoolExecutor(max_workers=2)
         # self.specAug = SpecAugmentTransform()
         # special_token
@@ -375,8 +409,22 @@ class EchoxAssistant():
                     accumulated_hidden_states[segment_start_idx:segment_end_idx], dim=0
                 ).unsqueeze(0)
 
-                future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
-                audio_futures.append(future)
+                if self.cosyvoice_tts:
+                    segment_token_ids = accumulated_tokens[segment_start_idx:segment_end_idx]
+                    segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
+                    future = self.audio_executor.submit(
+                        lambda txt=segment_text: self.cosyvoice_tts.synthesize(
+                            text=txt,
+                            prompt_text=self.cosyvoice_ref_text,
+                            prompt_speech_path=self.cosyvoice_ref_audio,
+                            output_path=None,
+                            stream=False
+                        )
+                    )
+                    audio_futures.append(future)
+                else:
+                    future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
+                    audio_futures.append(future)
 
                 segment_start_idx = segment_end_idx
 
@@ -384,11 +432,23 @@ class EchoxAssistant():
             current_attention_mask = torch.ones_like(next_token)
 
         if segment_start_idx < len(accumulated_hidden_states):
-            print(f"Processing final segment from {segment_start_idx} to {len(accumulated_hidden_states)}")
            segment_hidden_states = torch.stack(
                 accumulated_hidden_states[segment_start_idx:], dim=0
             ).unsqueeze(0)
-            future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
+            if self.cosyvoice_tts:
+                segment_token_ids = accumulated_tokens[segment_start_idx:]
+                segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
+                future = self.audio_executor.submit(
+                    lambda txt=segment_text: self.cosyvoice_tts.synthesize(
+                        text=txt,
+                        prompt_text=self.cosyvoice_ref_text,
+                        prompt_speech_path=self.cosyvoice_ref_audio,
+                        output_path=None,
+                        stream=False
+                    )
+                )
+            else:
+                future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
             audio_futures.append(future)
 
         for future in audio_futures:
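The two hunks above route each decoded text segment to the TTS backend through the existing ThreadPoolExecutor. Below is a minimal, self-contained sketch of that dispatch pattern; fake_synthesize is a hypothetical stand-in for CosyVoice2TTS.synthesize and is not part of this commit. The default-argument lambda freezes the segment text at submission time (avoiding Python's late-binding closure pitfall), and iterating the futures in submission order returns the audio segments in order.

from concurrent.futures import ThreadPoolExecutor

def fake_synthesize(text):
    # hypothetical stand-in for CosyVoice2TTS.synthesize(); returns (sample_rate, waveform)
    return 24000, f"<waveform for: {text}>"

executor = ThreadPoolExecutor(max_workers=2)
segments = ["Hello there.", "How can I help you today?"]

audio_futures = []
for segment_text in segments:
    # the default-argument lambda captures the current text by value,
    # mirroring `lambda txt=segment_text: ...` in the hunks above
    audio_futures.append(executor.submit(lambda txt=segment_text: fake_synthesize(txt)))

for future in audio_futures:
    sr, audio = future.result()  # blocks until that segment's audio is ready
    print(sr, audio)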
app.py CHANGED

@@ -10,6 +10,7 @@ import torch
 import librosa
 import soundfile as sf
 import tempfile
+import numpy as np
 import spaces # ZeroGPU requirement
 
 # import your modules
@@ -98,6 +99,8 @@ def process_audio_text(text, audio):
 
         if audio_data is not None:
             sr, audio_array = audio_data
+            if isinstance(audio_array, np.ndarray) and audio_array.ndim == 2 and audio_array.shape[0] == 1:
+                audio_array = audio_array.squeeze(0)
             yield (sr, audio_array), accumulated_text
         else:
             yield None, accumulated_text
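The added check normalizes channel-first mono arrays before they are yielded to the Gradio audio output, which expects a 1-D (samples,) array or (samples, channels); a (1, N) array can otherwise be misread. A small standalone sketch of the same normalization (the helper name and the 16 kHz buffer are illustrative only, not from this commit):

import numpy as np

def to_mono_1d(audio_array):
    # Collapse a channel-first mono array of shape (1, N) to 1-D (N,),
    # matching the check added in process_audio_text above.
    if isinstance(audio_array, np.ndarray) and audio_array.ndim == 2 and audio_array.shape[0] == 1:
        return audio_array.squeeze(0)
    return audio_array

chunk = np.zeros((1, 16000), dtype=np.float32)  # e.g. one second of mono audio at 16 kHz
print(to_mono_1d(chunk).shape)                  # -> (16000,)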
requirements.txt CHANGED

@@ -6,4 +6,20 @@ sentencepiece==0.2.0
 soundfile==0.12.1
 torch==2.2.0
 tqdm==4.66.5
-transformers==4.49.0
+transformers==4.49.0
+modelscope
+funasr
+hyperpyyaml
+onnxruntime-gpu
+inflect
+jieba
+pypinyin
+g2p_en
+librosa
+soundfile
+matcha-tts
+openai-whisper
+wetext
+pyarrow
+pyworld
+torchcodec
tts_wrapper.py ADDED

@@ -0,0 +1,57 @@
+import sys
+import os
+import torch
+import torchaudio
+
+sys.path.insert(0, './CosyVoice')
+
+from cosyvoice.cli.cosyvoice import CosyVoice2
+from cosyvoice.utils.file_utils import load_wav
+
+class CosyVoice2TTS:
+    def __init__(self, model_dir, device="cuda"):
+        print(f"[TTS] Loading CosyVoice2 model from {model_dir}...")
+
+        # initialize the model
+        self.model = CosyVoice2(
+            model_dir,
+            load_jit=False,
+            load_trt=False,
+            load_vllm=False,
+            fp16=True
+        )
+        print("[TTS] CosyVoice2 Model loaded successfully.")
+
+    def synthesize(self, text, prompt_text, prompt_speech_path, output_path=None, stream=False):
+        if not text:
+            return None, None
+
+        # load the prompt (reference) audio
+        prompt_speech_16k = load_wav(prompt_speech_path, 16000)
+
+        # run zero_shot inference
+        output = self.model.inference_zero_shot(
+            tts_text=text,
+            prompt_text=prompt_text,
+            prompt_speech_16k=prompt_speech_16k,
+            stream=stream
+        )
+
+        final_audio = []
+        # get the sample rate
+        sample_rate = getattr(self.model, 'sample_rate', 24000)
+
+        for i in output:
+            final_audio.append(i['tts_speech'])
+
+        if not final_audio:
+            return None, None
+
+        full_audio_tensor = torch.cat(final_audio, dim=1)
+
+        if output_path:
+            os.makedirs(os.path.dirname(output_path), exist_ok=True)
+            torchaudio.save(output_path, full_audio_tensor, sample_rate)
+            print(f"[TTS] Audio saved to {output_path}")
+
+        return sample_rate, full_audio_tensor.cpu().numpy()
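For reference, a hypothetical call site for this wrapper is sketched below. The model path, prompt text, and reference wav are the defaults added in Echox_copy_stream.py; the output path is a placeholder, and the CosyVoice repository is assumed to be checked out at ./CosyVoice so the imports above resolve.

from tts_wrapper import CosyVoice2TTS

tts = CosyVoice2TTS(model_dir="FunAudioLLM/CosyVoice2-0.5B", device="cuda")

sr, audio = tts.synthesize(
    text="Hello from EchoX.",
    prompt_text="It's always a good idea to research and compare prices from different sources ...",
    prompt_speech_path="show_case/ref.wav",
    output_path="outputs/demo.wav",  # optional; the call also returns (sample_rate, numpy waveform)
    stream=False,
)
if audio is not None:
    print(sr, audio.shape)  # waveform is (1, num_samples), as concatenated along dim=1 above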