Upload 4 files
- Echox_copy_stream.py +68 -8
- app.py +3 -0
- requirements.txt +17 -1
- tts_wrapper.py +57 -0
Echox_copy_stream.py
CHANGED

@@ -12,6 +12,7 @@ import librosa
 from text_to_speech import *
 import torch.nn.functional as F
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from tts_wrapper import CosyVoice2TTS
 
 from transformers import logging as hf_logging
 hf_logging.set_verbosity_error()

@@ -36,8 +37,8 @@ def load_model(args, device):
         quantization_config=quantization_config,
         token=hf_token,
     ).eval().to(device)
-    for module in model.model.audio_tower:
-        module = module.to(device)
+    # for module in model.model.audio_tower:
+    #     module = module.to(device)
 
    if args.peft_model_id:
        lora_config = PeftConfig.from_pretrained(args.peft_model_id)

@@ -96,10 +97,43 @@ class EchoxAssistant():
         self.base_model_path = "FreedomIntelligence/EchoX-8B"
         self.peft_model_id = None
         self.audio_tower = "openai/whisper-large-v3"
+        self.cosyvoice_model_path = "FunAudioLLM/CosyVoice2-0.5B"
+        self.cosyvoice_ref_audio = "show_case/ref.wav"
+        self.cosyvoice_ref_text = "It's always a good idea to research and compare prices from different sources to get a more accurate idea of the average price of a used car in the United States for different years."
+
         self.args = BasicSetting()
         self.device = "cuda"
-        self.vocoder, self.voc_cfg = load_speech_model(self.device)
-        self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+
+        if self.args.cosyvoice_model_path:
+            print(f"[EchoxAssistant] Initializing CosyVoice2 TTS from {self.args.cosyvoice_model_path} ...")
+            try:
+                self.cosyvoice_tts = CosyVoice2TTS(model_dir=self.args.cosyvoice_model_path, device=self.device)
+                # reference prompt for zero-shot voice cloning
+                self.cosyvoice_ref_audio = self.args.cosyvoice_ref_audio
+                self.cosyvoice_ref_text = self.args.cosyvoice_ref_text or ""
+
+                self.vocoder = None
+                self.voc_cfg = None
+                self.audio_executor = ThreadPoolExecutor(max_workers=2)
+                print("[EchoxAssistant] CosyVoice2 TTS ready.")
+            except Exception as e:
+                print(f"[EchoxAssistant] Failed to init CosyVoice2TTS: {e}. Falling back to original vocoder.")
+                self.cosyvoice_tts = None
+                self.vocoder, self.voc_cfg = load_speech_model(self.device)
+                self.audio_executor = ThreadPoolExecutor(max_workers=2)
+        else:
+            self.vocoder, self.voc_cfg = load_speech_model(self.device)
+            self.cosyvoice_tts = None
+            self.cosyvoice_ref_audio = None
+            self.cosyvoice_ref_text = ""
+            self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+            self.audio_executor = ThreadPoolExecutor(max_workers=2)
+
+        if not hasattr(self, "model"):
+            self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+
+        # self.vocoder, self.voc_cfg = load_speech_model(self.device)
+        # self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
         self.audio_executor = ThreadPoolExecutor(max_workers=2)
         # self.specAug = SpecAugmentTransform()
         # special_token

@@ -375,8 +409,22 @@ class EchoxAssistant():
                     accumulated_hidden_states[segment_start_idx:segment_end_idx], dim=0
                 ).unsqueeze(0)
 
-                future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
-                audio_futures.append(future)
+                if self.cosyvoice_tts:
+                    segment_token_ids = accumulated_tokens[segment_start_idx:segment_end_idx]
+                    segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
+                    future = self.audio_executor.submit(
+                        lambda txt=segment_text: self.cosyvoice_tts.synthesize(
+                            text=txt,
+                            prompt_text=self.cosyvoice_ref_text,
+                            prompt_speech_path=self.cosyvoice_ref_audio,
+                            output_path=None,
+                            stream=False
+                        )
+                    )
+                    audio_futures.append(future)
+                else:
+                    future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
+                    audio_futures.append(future)
 
                 segment_start_idx = segment_end_idx
 
@@ -384,11 +432,23 @@ class EchoxAssistant():
             current_attention_mask = torch.ones_like(next_token)
 
             if segment_start_idx < len(accumulated_hidden_states):
-                print(f"Processing final segment from {segment_start_idx} to {len(accumulated_hidden_states)}")
                 segment_hidden_states = torch.stack(
                     accumulated_hidden_states[segment_start_idx:], dim=0
                 ).unsqueeze(0)
-                future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
+                if self.cosyvoice_tts:
+                    segment_token_ids = accumulated_tokens[segment_start_idx:]
+                    segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
+                    future = self.audio_executor.submit(
+                        lambda txt=segment_text: self.cosyvoice_tts.synthesize(
+                            text=txt,
+                            prompt_text=self.cosyvoice_ref_text,
+                            prompt_speech_path=self.cosyvoice_ref_audio,
+                            output_path=None,
+                            stream=False
+                        )
+                    )
+                else:
+                    future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
                 audio_futures.append(future)
 
             for future in audio_futures:
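Note on the executor calls above: each submission binds the segment text through a lambda default argument (lambda txt=segment_text: ...), so every queued job keeps a snapshot of its own segment instead of closing over a variable that the loop will overwrite before the worker runs. A minimal, self-contained sketch of that pattern; the synthesize stub and segment strings are hypothetical stand-ins:

from concurrent.futures import ThreadPoolExecutor

def synthesize(text):
    # hypothetical stand-in for CosyVoice2TTS.synthesize
    return f"audio<{text}>"

executor = ThreadPoolExecutor(max_workers=2)
futures = []
for segment_text in ["seg one", "seg two", "seg three"]:
    # the default argument captures the current value at submit time;
    # a bare `lambda: synthesize(segment_text)` would read the variable
    # only when the worker runs, racing with the loop
    futures.append(executor.submit(lambda txt=segment_text: synthesize(txt)))

print([f.result() for f in futures])
# ['audio<seg one>', 'audio<seg two>', 'audio<seg three>']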
app.py
CHANGED

@@ -10,6 +10,7 @@ import torch
 import librosa
 import soundfile as sf
 import tempfile
+import numpy as np
 import spaces  # ZeroGPU requirement
 
 # import your modules

@@ -98,6 +99,8 @@ def process_audio_text(text, audio):
 
         if audio_data is not None:
             sr, audio_array = audio_data
+            if isinstance(audio_array, np.ndarray) and audio_array.ndim == 2 and audio_array.shape[0] == 1:
+                audio_array = audio_array.squeeze(0)
             yield (sr, audio_array), accumulated_text
         else:
             yield None, accumulated_text
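The added guard normalizes the audio shape for Gradio, whose audio component expects a 1-D array (or samples-first 2-D); the CosyVoice path returns a (1, samples) array (see tts_wrapper.py below), which would otherwise likely be misread as a one-sample multichannel clip. A small sketch of the same normalization, assuming a mono (1, N) float array:

import numpy as np

def to_gradio_audio(sr, audio_array):
    # collapse a channel-first (1, N) array to the (N,) shape Gradio expects
    if isinstance(audio_array, np.ndarray) and audio_array.ndim == 2 and audio_array.shape[0] == 1:
        audio_array = audio_array.squeeze(0)
    return sr, audio_array

sr, audio = to_gradio_audio(24000, np.zeros((1, 24000), dtype=np.float32))
print(audio.shape)  # (24000,)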
requirements.txt
CHANGED

@@ -6,4 +6,20 @@ sentencepiece==0.2.0
 soundfile==0.12.1
 torch==2.2.0
 tqdm==4.66.5
-transformers==4.49.0
+transformers==4.49.0
+modelscope
+funasr
+hyperpyyaml
+onnxruntime-gpu
+inflect
+jieba
+pypinyin
+g2p_en
+librosa
+soundfile
+matcha-tts
+openai-whisper
+wetext
+pyarrow
+pyworld
+torchcodec
tts_wrapper.py
ADDED

@@ -0,0 +1,57 @@
+import sys
+import os
+import torch
+import torchaudio
+
+sys.path.insert(0, './CosyVoice')
+
+from cosyvoice.cli.cosyvoice import CosyVoice2
+from cosyvoice.utils.file_utils import load_wav
+
+class CosyVoice2TTS:
+    def __init__(self, model_dir, device="cuda"):
+        print(f"[TTS] Loading CosyVoice2 model from {model_dir}...")
+
+        # initialize the model
+        self.model = CosyVoice2(
+            model_dir,
+            load_jit=False,
+            load_trt=False,
+            load_vllm=False,
+            fp16=True
+        )
+        print("[TTS] CosyVoice2 Model loaded successfully.")
+
+    def synthesize(self, text, prompt_text, prompt_speech_path, output_path=None, stream=False):
+        if not text:
+            return None, None
+
+        # load the reference audio
+        prompt_speech_16k = load_wav(prompt_speech_path, 16000)
+
+        # run zero-shot inference
+        output = self.model.inference_zero_shot(
+            tts_text=text,
+            prompt_text=prompt_text,
+            prompt_speech_16k=prompt_speech_16k,
+            stream=stream
+        )
+
+        final_audio = []
+        # get the sample rate
+        sample_rate = getattr(self.model, 'sample_rate', 24000)
+
+        for i in output:
+            final_audio.append(i['tts_speech'])
+
+        if not final_audio:
+            return None, None
+
+        full_audio_tensor = torch.cat(final_audio, dim=1)
+
+        if output_path:
+            os.makedirs(os.path.dirname(output_path), exist_ok=True)
+            torchaudio.save(output_path, full_audio_tensor, sample_rate)
+            print(f"[TTS] Audio saved to {output_path}")
+
+        return sample_rate, full_audio_tensor.cpu().numpy()
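For reference, a usage sketch of the wrapper on its own, assuming the CosyVoice repo is checked out at ./CosyVoice, the model weights are available at the configured path, and show_case/ref.wav plus its transcript serve as the zero-shot prompt (these mirror the defaults wired into EchoxAssistant above):

import soundfile as sf
from tts_wrapper import CosyVoice2TTS

# transcript of the reference clip, as configured in EchoxAssistant
REF_TEXT = ("It's always a good idea to research and compare prices from "
            "different sources to get a more accurate idea of the average "
            "price of a used car in the United States for different years.")

tts = CosyVoice2TTS(model_dir="FunAudioLLM/CosyVoice2-0.5B", device="cuda")
sr, audio = tts.synthesize(
    text="Hello from EchoX.",
    prompt_text=REF_TEXT,
    prompt_speech_path="show_case/ref.wav",
    output_path=None,
    stream=False,
)
if audio is not None:
    # synthesize returns (sample_rate, (1, samples) array); squeeze to mono
    sf.write("cloned.wav", audio.squeeze(0), sr)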