#!/usr/bin/env python3
"""
Voice Development Assistant - Hugging Face Spaces
Optimized for ZeroGPU H200 cluster
Uses OpenRouter for LLM, HuggingFace for TTS
"""
import gradio as gr
import numpy as np
import os
import tempfile
import requests
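# Deployment note (assumption, not from the source): on a Hugging Face Space this app also
# needs a requirements.txt covering the imports used here and in the lazy loaders below,
# roughly: gradio, numpy, requests, openai-whisper, torch, transformers, datasets,
# sentencepiece, scipy. Exact package pins are not specified by this file.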
| print(f"π¦ Gradio version: {gr.__version__}") | |
# Check for ZeroGPU availability
try:
    import spaces
    ZERO_GPU_AVAILABLE = True
    print("🚀 ZeroGPU detected - GPU acceleration enabled!")
except ImportError:
    ZERO_GPU_AVAILABLE = False
    print("⚠️ ZeroGPU not available - running on CPU")
# Configuration from environment
CONFIG = {
    'openrouter_key': os.getenv('OPENROUTER_API_KEY', ''),
    'whisper_model': os.getenv('WHISPER_MODEL', 'base'),
    'language': os.getenv('LANGUAGE', 'en'),
    'llm_model': os.getenv('LLM_MODEL', 'cognitivecomputations/dolphin-mistral-24b-venice-edition:free'),
    'max_tokens': int(os.getenv('MAX_TOKENS', '4096')),
    'temperature': float(os.getenv('TEMPERATURE', '1.0'))
}

OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

# Lazy-loaded models
whisper_model = None
tts_pipeline = None
conversation_history = []
def get_whisper_model():
    """Load Whisper model (uses GPU when available via ZeroGPU)"""
    global whisper_model
    if whisper_model is None:
        import whisper
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model_name = CONFIG['whisper_model']
        print(f"Loading Whisper model '{model_name}' on {device}...")
        whisper_model = whisper.load_model(model_name, device=device)
        print(f"✅ Whisper model loaded on {device}")
    return whisper_model
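# Note: whisper.load_model() downloads and caches the model weights (by default under
# ~/.cache/whisper), so the first transcription after a cold start takes noticeably longer
# than later ones.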
def get_tts_pipeline():
    """Get HuggingFace TTS pipeline"""
    global tts_pipeline
    if tts_pipeline is None:
        try:
            import torch
            from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
            from datasets import load_dataset
            device = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"Loading TTS models on {device}...")
            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
            tts_pipeline = {
                "processor": processor,
                "model": model,
                "vocoder": vocoder,
                "speaker_embeddings": speaker_embeddings,
                "device": device
            }
            print("✅ HuggingFace TTS initialized (SpeechT5)")
        except Exception as e:
            print(f"⚠️ SpeechT5 failed, trying MMS-TTS: {e}")
            try:
                from transformers import pipeline
                tts_pipeline = pipeline("text-to-speech", model="facebook/mms-tts-eng")
                print("✅ HuggingFace TTS initialized (MMS-TTS)")
            except Exception as e2:
                print(f"❌ TTS initialization failed: {e2}")
                tts_pipeline = None
    return tts_pipeline
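# tts_pipeline ends up as either a dict of SpeechT5 components (processor, model, vocoder,
# speaker embeddings, device) or a transformers text-to-speech pipeline (the MMS-TTS fallback);
# synthesize_text() below branches on isinstance(tts, dict) to handle both shapes.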
def chat_with_openrouter(messages: list) -> str:
    """Send chat request to OpenRouter API"""
    api_key = CONFIG['openrouter_key']
    if not api_key:
        raise ValueError("OpenRouter API key not configured. Set OPENROUTER_API_KEY secret.")
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://huggingface.co/spaces",
        "X-Title": "Voice Development Assistant"
    }
    payload = {
        "model": CONFIG['llm_model'],
        "messages": messages,
        "max_tokens": CONFIG['max_tokens'],
        "temperature": CONFIG['temperature']
    }
    response = requests.post(
        f"{OPENROUTER_BASE_URL}/chat/completions",
        headers=headers,
        json=payload,
        timeout=120
    )
    if response.status_code != 200:
        raise Exception(f"OpenRouter API error: {response.status_code} - {response.text}")
    return response.json()['choices'][0]['message']['content']
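# OpenRouter follows the OpenAI-compatible chat completions format, so a successful response
# looks roughly like (illustrative, trimmed):
#   {"choices": [{"message": {"role": "assistant", "content": "..."}}], ...}
# which is why the reply text is read from choices[0].message.content above.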
def transcribe_audio_gpu(audio_data: np.ndarray, sample_rate: int = 16000) -> str:
    """Transcribe audio using Whisper (expects mono float32 at 16 kHz; converts if needed)"""
    model = get_whisper_model()
    # Normalize to float32 in [-1, 1]
    if audio_data.dtype != np.float32:
        if audio_data.dtype == np.int16:
            audio_data = audio_data.astype(np.float32) / 32768.0
        else:
            audio_data = audio_data.astype(np.float32)
    # Down-mix multi-channel recordings to mono
    if len(audio_data.shape) > 1:
        audio_data = audio_data[:, 0] if audio_data.shape[1] > 1 else audio_data.flatten()
    # Whisper assumes 16 kHz input when given a raw array; resample if the recording rate differs
    if sample_rate != 16000 and len(audio_data) > 1:
        target_len = int(round(len(audio_data) * 16000 / sample_rate))
        audio_data = np.interp(
            np.linspace(0, len(audio_data) - 1, num=target_len),
            np.arange(len(audio_data)),
            audio_data
        ).astype(np.float32)
    result = model.transcribe(audio_data, language=CONFIG['language'], fp16=False)
    return result["text"].strip()

# Wrap with the ZeroGPU decorator if available so transcription runs on a GPU slice
if ZERO_GPU_AVAILABLE:
    @spaces.GPU
    def transcribe_with_gpu(audio_data: np.ndarray, sample_rate: int = 16000) -> str:
        return transcribe_audio_gpu(audio_data, sample_rate)
else:
    transcribe_with_gpu = transcribe_audio_gpu
def transcribe_audio(audio):
    """Transcribe audio input from Gradio"""
    try:
        if audio is None:
            return "No audio provided. Please record or upload audio."
        sample_rate, audio_data = audio
        text = transcribe_with_gpu(audio_data, sample_rate)
        return text if text else "No speech detected."
    except Exception as e:
        return f"Error: {str(e)}"
def synthesize_text(text):
    """Synthesize text to speech"""
    try:
        if not text:
            return None, "No text provided"
        import torch
        import scipy.io.wavfile as wavfile
        tts = get_tts_pipeline()
        if tts is None:
            return None, "TTS not available"
        if isinstance(tts, dict):
            inputs = tts["processor"](text=text, return_tensors="pt").to(tts["device"])
            with torch.no_grad():
                speech = tts["model"].generate_speech(
                    inputs["input_ids"],
                    tts["speaker_embeddings"],
                    vocoder=tts["vocoder"]
                )
            audio_data = speech.cpu().numpy()
            sample_rate = 16000
        else:
            result = tts(text)
            audio_data = result["audio"][0]
            sample_rate = result["sampling_rate"]
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            wavfile.write(tmp.name, sample_rate, audio_data)
        return tmp.name, f"✅ Synthesized {len(text)} characters"
    except Exception as e:
        return None, f"Error: {str(e)}"
def chat_with_claude(message, history):
    """Chat with LLM via OpenRouter"""
    global conversation_history
    try:
        if not message.strip():
            return history
        conversation_history.append({"role": "user", "content": message})
        assistant_message = chat_with_openrouter(conversation_history)
        conversation_history.append({"role": "assistant", "content": assistant_message})
        history.append([message, assistant_message])
        return history
    except Exception as e:
        history.append([message, f"Error: {str(e)}"])
        return history
def voice_chat(audio):
    """Complete voice conversation"""
    global conversation_history
    try:
        if audio is None:
            return None, "No audio provided", ""
        sample_rate, audio_data = audio
        user_text = transcribe_with_gpu(audio_data, sample_rate)
        if not user_text:
            return None, "No speech detected", ""
        conversation_history.append({"role": "user", "content": user_text})
        response_text = chat_with_openrouter(conversation_history)
        conversation_history.append({"role": "assistant", "content": response_text})
        audio_path, _ = synthesize_text(response_text)
        conversation_log = f"**🎤 You:** {user_text}\n\n**🤖 Assistant:** {response_text}"
        return audio_path, conversation_log, response_text
    except Exception as e:
        return None, f"Error: {str(e)}", ""
def clear_history():
    """Clear conversation history"""
    global conversation_history
    conversation_history = []
    return []

def check_api_status():
    """Check system status"""
    status = []
    if CONFIG['openrouter_key']:
        status.append("✅ OpenRouter API key configured")
    else:
        status.append("❌ OpenRouter API key missing (Set OPENROUTER_API_KEY secret)")
    status.append("✅ HuggingFace TTS (free, no API key)")
    if ZERO_GPU_AVAILABLE:
        status.append("🚀 ZeroGPU enabled (H200 acceleration)")
    else:
        status.append("💻 Running on CPU")
    # Blank line between items so the Markdown status display renders each on its own line
    return "\n\n".join(status)
# Build Gradio Interface
demo = gr.Blocks(title="Voice Development Assistant")

with demo:
    gr.Markdown("""
    # 🎤 Voice Development Assistant
    **Personal Voice Interface for Development Workflows**

    Speech-to-Text • Text-to-Speech • LLM Conversations via OpenRouter
    """)
    with gr.Accordion("📊 System Status", open=False):
        status_display = gr.Markdown(check_api_status())
        refresh_btn = gr.Button("🔄 Refresh Status")
        refresh_btn.click(check_api_status, outputs=[status_display])
    with gr.Tabs():
        # Voice Chat
        with gr.Tab("🎤 Voice Chat"):
            gr.Markdown("### Speak with the assistant using your voice")
            with gr.Row():
                with gr.Column(scale=1):
                    voice_input = gr.Audio(label="🎙️ Click to Record", sources=["microphone"], type="numpy")
                    voice_submit = gr.Button("📤 Send", variant="primary")
                with gr.Column(scale=1):
                    voice_output = gr.Audio(label="🔊 Assistant's Response", type="filepath")
                    voice_log = gr.Markdown(label="Conversation")
                    voice_text = gr.Textbox(label="Response Text", lines=3, interactive=False)
            voice_submit.click(voice_chat, inputs=[voice_input], outputs=[voice_output, voice_log, voice_text])
        # Transcribe
        with gr.Tab("📝 Transcribe"):
            gr.Markdown("### Convert speech to text using Whisper")
            with gr.Row():
                with gr.Column():
                    stt_input = gr.Audio(label="🎙️ Audio Input", sources=["microphone", "upload"], type="numpy")
                    stt_btn = gr.Button("📝 Transcribe", variant="primary")
                with gr.Column():
                    stt_output = gr.Textbox(label="Transcription", lines=8, placeholder="Transcribed text appears here...")
            stt_btn.click(transcribe_audio, inputs=[stt_input], outputs=[stt_output])
        # TTS
        with gr.Tab("🔊 Speak"):
            gr.Markdown("### Convert text to natural speech (HuggingFace TTS)")
            with gr.Row():
                with gr.Column():
                    tts_input = gr.Textbox(label="Text to Speak", lines=5, placeholder="Enter text to synthesize...")
                    tts_btn = gr.Button("🔊 Generate Speech", variant="primary")
                with gr.Column():
                    tts_output = gr.Audio(label="Generated Audio", type="filepath")
                    tts_status = gr.Textbox(label="Status", interactive=False)
            tts_btn.click(synthesize_text, inputs=[tts_input], outputs=[tts_output, tts_status])
        # Text Chat
        with gr.Tab("💬 Text Chat"):
            gr.Markdown("### Chat with the assistant via text")
            chatbot = gr.Chatbot(height=450)
            with gr.Row():
                chat_input = gr.Textbox(label="Message", placeholder="Type your message...", scale=4)
                chat_submit = gr.Button("Send", variant="primary", scale=1)
            clear_btn = gr.Button("🗑️ Clear History")
            chat_submit.click(chat_with_claude, inputs=[chat_input, chatbot], outputs=[chatbot]).then(lambda: "", outputs=[chat_input])
            chat_input.submit(chat_with_claude, inputs=[chat_input, chatbot], outputs=[chatbot]).then(lambda: "", outputs=[chat_input])
            clear_btn.click(clear_history, outputs=[chatbot])
| gr.Markdown(""" | |
| --- | |
| **Voice Development Assistant** β’ Built with Whisper, HuggingFace TTS, and OpenRouter | |
| π Configure OPENROUTER_API_KEY as a Hugging Face Space secret | |
| """) | |
if __name__ == "__main__":
    demo.launch()