SoundImage-VoiceClone

Runtime error

App Files Files Community

Steveeeeeeen HF Staff committed on Feb 12

Commit

71064d4

verified ·

1 Parent(s): b8a3553

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -31

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import gradio as gr
 import spaces
 from zonos.model import Zonos
-from zonos.conditioning import make_cond_dict  # Keep this; remove supported_language_codes
 # We'll keep a global dictionary of loaded models to avoid reloading
 MODELS_CACHE = {}
@@ -13,15 +13,6 @@ device = "cuda"
 banner_url = "https://huggingface.co/datasets/Steveeeeeeen/random_images/resolve/main/ZonosHeader.png"
 BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 150px; max-width: 300px;"> </div>'
-# Define a list of tuples: (Display Label, Language Code)
-LANGUAGES = [
-    ("English",  "en-us"),
-    ("Japanese", "ja"),
-    ("Chinese",  "cmn"),
-    ("French",   "fr-fr"),
-    ("German",   "de"),
-]
 def load_model(model_name: str):
     """
     Loads or retrieves a cached Zonos model, sets it to eval and bfloat16.
@@ -37,20 +28,15 @@ def load_model(model_name: str):
     return MODELS_CACHE[model_name]
 @spaces.GPU(duration=90)
-def tts(text, speaker_audio, selected_language_label, model_choice):
     """
     text: str (Text prompt to synthesize)
     speaker_audio: (sample_rate, numpy_array) from Gradio if type="numpy"
-    selected_language_label: str (the display name from the dropdown, e.g. "Chinese")
     model_choice: str (which Zonos model to use, e.g., "Zyphra/Zonos-v0.1-hybrid")
     Returns (sr_out, wav_out_numpy).
     """
-    # Map from label -> actual language code
-    label_to_code = dict(LANGUAGES)
-    # Convert the human-readable label back to the code
-    selected_language = label_to_code[selected_language_label]
     model = load_model(model_choice)
     if not text:
@@ -66,11 +52,12 @@ def tts(text, speaker_audio, selected_language_label, model_choice):
     # Convert to Torch tensor
     wav_tensor = torch.from_numpy(wav_np).float()
-    # If stereo or multi-channel, downmix to mono
     if wav_tensor.ndim == 2 and wav_tensor.shape[0] > 1:
-        wav_tensor = wav_tensor.mean(dim=0)  # => (samples,)
-    # Add batch dimension => (1, samples)
     wav_tensor = wav_tensor.unsqueeze(0)
     # Get speaker embedding
@@ -79,12 +66,12 @@ def tts(text, speaker_audio, selected_language_label, model_choice):
         spk_embedding = spk_embedding.to(device, dtype=torch.bfloat16)
     # Prepare conditioning dictionary
-    cond_dict = {
-        "text": text,
-        "speaker": spk_embedding,
-        "language": selected_language,  # Use the code here
-        "device": device,
-    }
     conditioning = model.prepare_conditioning(cond_dict)
     # Generate codes
@@ -119,6 +106,8 @@ def build_demo():
             ref_audio_input = gr.Audio(
                 label="Reference Audio (Speaker Cloning)",
                 type="numpy"
             )
         model_dropdown = gr.Dropdown(
@@ -127,12 +116,10 @@ def build_demo():
             value="Zyphra/Zonos-v0.1-hybrid",
             interactive=True,
         )
-        # For the language dropdown, we display only the friendly label
         language_dropdown = gr.Dropdown(
-            label="Language",
-            choices=[label for (label, code) in LANGUAGES],
-            value="English",  # default display
             interactive=True,
         )
@@ -150,3 +137,5 @@ def build_demo():
 if __name__ == "__main__":
     demo_app = build_demo()
     demo_app.launch(server_name="0.0.0.0", server_port=7860, share=True)

 import spaces
 from zonos.model import Zonos
+from zonos.conditioning import make_cond_dict, supported_language_codes
 # We'll keep a global dictionary of loaded models to avoid reloading
 MODELS_CACHE = {}
 banner_url = "https://huggingface.co/datasets/Steveeeeeeen/random_images/resolve/main/ZonosHeader.png"
 BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 150px; max-width: 300px;"> </div>'
 def load_model(model_name: str):
     """
     Loads or retrieves a cached Zonos model, sets it to eval and bfloat16.
     return MODELS_CACHE[model_name]
 @spaces.GPU(duration=90)
+def tts(text, speaker_audio, selected_language, model_choice):
     """
     text: str (Text prompt to synthesize)
     speaker_audio: (sample_rate, numpy_array) from Gradio if type="numpy"
+    selected_language: str (language code)
     model_choice: str (which Zonos model to use, e.g., "Zyphra/Zonos-v0.1-hybrid")
     Returns (sr_out, wav_out_numpy).
     """
     model = load_model(model_choice)
     if not text:
     # Convert to Torch tensor
     wav_tensor = torch.from_numpy(wav_np).float()
+    # If stereo (shape [channels, samples]) or multi-channel, downmix to mono
+    # e.g. shape (2, samples) -> shape (samples,) by averaging
     if wav_tensor.ndim == 2 and wav_tensor.shape[0] > 1:
+        wav_tensor = wav_tensor.mean(dim=0)  # shape => (samples,)
+    # Now add a batch dimension => shape (1, samples)
     wav_tensor = wav_tensor.unsqueeze(0)
     # Get speaker embedding
         spk_embedding = spk_embedding.to(device, dtype=torch.bfloat16)
     # Prepare conditioning dictionary
+    cond_dict = make_cond_dict(
+        text=text,
+        speaker=spk_embedding,
+        language=selected_language,
+        device=device,
+    )
     conditioning = model.prepare_conditioning(cond_dict)
     # Generate codes
             ref_audio_input = gr.Audio(
                 label="Reference Audio (Speaker Cloning)",
                 type="numpy"
+                # Optionally add mono=True if you want Gradio to always downmix automatically:
+                # mono=True
             )
         model_dropdown = gr.Dropdown(
             value="Zyphra/Zonos-v0.1-hybrid",
             interactive=True,
         )
         language_dropdown = gr.Dropdown(
+            label="Language Code",
+            choices=["en-us", "ja", "cmn", "fr-fr", "de"],
+            value="en-us",
             interactive=True,
         )
 if __name__ == "__main__":
     demo_app = build_demo()
     demo_app.launch(server_name="0.0.0.0", server_port=7860, share=True)
+This is my code. Replace supported_language_codes with a list of the languages I asked you for, and make the gr.Dropdown display the name of each language instead of just a code such as "cmn".