Spaces:
Running
Running
import json
import logging
import os
import re
import tempfile
import time
from collections import Counter
from datetime import datetime
from typing import Dict, List, Tuple, Optional

import gradio as gr
import numpy as np
import pandas as pd
import requests
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Anthropic API key (empty string when the environment variable is unset)
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")

# Try to import SpeechBrain and HuggingFace components.  All of them are
# imported inside one ``try`` so a single missing package switches both
# feature flags off together.
try:
    from speechbrain.pretrained import EncoderDecoderASR, VAD, EncoderClassifier
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
    import torch
except ImportError as e:
    logger.warning("SpeechBrain/HuggingFace not available: %s", e)
    SPEECHBRAIN_AVAILABLE = False
    HUGGINGFACE_AVAILABLE = False
else:
    SPEECHBRAIN_AVAILABLE = True
    HUGGINGFACE_AVAILABLE = True
    logger.info("SpeechBrain and HuggingFace models available")
# Initialize models if available.  Every handle starts as ``None`` and is only
# left populated when the whole set loads successfully.
asr_model = None
vad_model = None
sentiment_model = None
emotion_model = None

if SPEECHBRAIN_AVAILABLE and HUGGINGFACE_AVAILABLE:
    try:
        # Speech-to-text model
        asr_model = EncoderDecoderASR.from_hparams(
            source="speechbrain/asr-crdnn-rnnlm-librispeech",
            savedir="pretrained_models/asr-crdnn-rnnlm-librispeech",
        )
        # Voice Activity Detection
        vad_model = VAD.from_hparams(
            source="speechbrain/vad-crdnn-libriparty",
            savedir="pretrained_models/vad-crdnn-libriparty",
        )
        # Sentiment analysis.  ``return_all_scores=True`` is relied on by
        # transcribe_audio_with_metadata(), which takes element [0] of the
        # result and picks the highest-scoring label from it.
        sentiment_model = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-roberta-base-sentiment-latest",
            return_all_scores=True,
        )
        # Emotion analysis (same result handling as the sentiment pipeline)
        emotion_model = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            return_all_scores=True,
        )
        logger.info("All models loaded successfully")
    except Exception as e:
        # Loading can fail in many ways, so this boundary stays broad:
        # report the failure and fall back to "models unavailable".
        logger.error("Error loading models: %s", e)
        SPEECHBRAIN_AVAILABLE = False
        HUGGINGFACE_AVAILABLE = False
        # Drop any handles that did load so the state matches the flags.
        asr_model = None
        vad_model = None
        sentiment_model = None
        emotion_model = None
def call_claude_api(prompt, model="claude-3-5-sonnet-20241022", max_tokens=4096, timeout=60):
    """Send a single-turn prompt to the Anthropic Messages API.

    Args:
        prompt: Text sent as the only user message.
        model: Model identifier placed in the request body.
        max_tokens: ``max_tokens`` value placed in the request body.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The text of the first content item of the response on HTTP 200.
        Otherwise a human-readable error string; this function does not
        raise, failures are logged and reported through the return value.
    """
    # NOTE(review): the leading marker of the error strings was mis-encoded
    # in the source ("β"); presumably a cross-mark emoji - confirm.
    if not ANTHROPIC_API_KEY:
        return "❌ Claude API key not configured. Please set ANTHROPIC_API_KEY environment variable."

    headers = {
        "Content-Type": "application/json",
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
    }
    payload = {
        "model": model,
        "max_tokens": max_tokens,
        "messages": [
            {
                "role": "user",
                "content": prompt,
            }
        ],
    }

    try:
        response = requests.post(
            "https://api.anthropic.com/v1/messages",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        if response.status_code != 200:
            logger.error("Claude API error: %s - %s", response.status_code, response.text)
            return f"❌ Claude API Error: {response.status_code}"
        return response.json()['content'][0]['text']
    except Exception as e:
        # Callers display the returned string, so network errors and
        # unexpected response shapes are turned into a message here.
        logger.error("Error calling Claude API: %s", e)
        return f"❌ Error: {str(e)}"
# Assumed duration of one sentence.  The transcript is handled as plain text
# below, so timestamps and speech rate are estimates derived from this value.
_EST_SECONDS_PER_SENTENCE = 2


def _top_label(classifier, text):
    """Return the highest-scoring ``{'label', 'score'}`` item for *text*, or a neutral default."""
    neutral = {'label': 'neutral', 'score': 0.5}
    if classifier is None:
        return neutral
    scores = classifier(text)[0]
    if not scores:
        return neutral
    return max(scores, key=lambda item: item['score'])


def transcribe_audio_with_metadata(audio_file):
    """Transcribe an audio file and annotate each sentence.

    Args:
        audio_file: Path handed to ``asr_model.transcribe_file``.

    Returns:
        A ``(rich_transcript, status_message)`` tuple.  ``rich_transcript``
        is a list with one dict per sentence (estimated timestamp, sentence
        text, word statistics, estimated speech rate, top sentiment and top
        emotion), or ``None`` when no file was given, the models are
        unavailable, or transcription failed.
    """
    if not audio_file:
        return None, "No audio file provided"
    if not SPEECHBRAIN_AVAILABLE or asr_model is None:
        return None, "SpeechBrain not available - using demo transcription"

    try:
        transcript = asr_model.transcribe_file(audio_file)

        # Split into sentences on terminal punctuation.
        # NOTE(review): if the ASR output carries no punctuation the whole
        # transcript becomes a single "sentence" - confirm model output.
        sentences = [s.strip() for s in re.split(r'[.!?]+', transcript) if s.strip()]

        rich_transcript = []
        for i, sentence in enumerate(sentences):
            # Sentence i is assumed to start i * 2 s into the recording.
            # (Previously the running total was added on top of i * 2,
            # which made the gaps grow with every sentence.)
            timestamp = i * _EST_SECONDS_PER_SENTENCE

            sentiment = _top_label(sentiment_model, sentence)
            emotion = _top_label(emotion_model, sentence)

            # Word count and complexity metrics
            words = sentence.split()
            word_count = len(words)
            avg_word_length = float(np.mean([len(word) for word in words])) if words else 0

            # Words per minute under the same per-sentence duration
            # assumption (words / seconds * 60).
            speech_rate = word_count * 60 / _EST_SECONDS_PER_SENTENCE

            rich_transcript.append({
                'timestamp': timestamp,
                'sentence': sentence,
                'word_count': word_count,
                'avg_word_length': round(avg_word_length, 2),
                'speech_rate_wpm': round(speech_rate, 1),
                'sentiment': sentiment['label'],
                'sentiment_score': round(sentiment['score'], 3),
                'emotion': emotion['label'],
                'emotion_score': round(emotion['score'], 3),
            })

        return rich_transcript, "Transcription completed successfully"
    except Exception as e:
        # Model inference can fail in many ways; report instead of crashing
        # the UI handler that called us.
        logger.error("Error in transcription: %s", e)
        return None, f"Transcription error: {str(e)}"
def format_rich_transcript(rich_transcript):
    """Render rich-transcript entries as one annotated line per sentence.

    Args:
        rich_transcript: List of entry dicts with the keys ``timestamp``,
            ``sentence``, ``word_count``, ``speech_rate_wpm``, ``sentiment``,
            ``sentiment_score``, ``emotion`` and ``emotion_score``.

    Returns:
        Newline-joined lines of the form
        ``[MM:SS] *PAR: <sentence> [Words: ..., Rate: ...wpm] [Sentiment: ...] [Emotion: ...]``,
        or ``"No transcript data available"`` for empty input.
    """
    if not rich_transcript:
        return "No transcript data available"

    def render(entry):
        minutes, seconds = divmod(entry['timestamp'], 60)
        return (
            f"[{int(minutes):02d}:{int(seconds):02d}] *PAR: {entry['sentence']}"
            f" [Words: {entry['word_count']}, Rate: {entry['speech_rate_wpm']}wpm]"
            f" [Sentiment: {entry['sentiment']} ({entry['sentiment_score']})]"
            f" [Emotion: {entry['emotion']} ({entry['emotion_score']})]"
        )

    return '\n'.join(render(entry) for entry in rich_transcript)
def calculate_slp_metrics(rich_transcript):
    """Aggregate per-sentence entries into summary metrics.

    Args:
        rich_transcript: List of entry dicts; the keys read here are
            ``timestamp``, ``sentence``, ``word_count``, ``speech_rate_wpm``,
            ``sentiment`` and ``emotion``.

    Returns:
        A dict of summary metrics, or ``{}`` for empty input.
        ``unique_words``, ``type_token_ratio`` and ``word_frequency`` are all
        computed from the same lower-cased, punctuation-stripped tokens, so
        ``"dog,"`` and ``"dog"`` count as one word type.
    """
    if not rich_transcript:
        return {}

    # Basic metrics
    total_sentences = len(rich_transcript)
    total_words = sum(entry['word_count'] for entry in rich_transcript)
    # Timestamp of the last entry, i.e. where the final sentence starts.
    total_duration = rich_transcript[-1]['timestamp']

    # Word frequency over cleaned tokens (tokens that are pure punctuation
    # clean to an empty string and are skipped).
    word_freq = Counter()
    for entry in rich_transcript:
        for raw_word in entry['sentence'].lower().split():
            word_clean = re.sub(r'[^\w\s]', '', raw_word)
            if word_clean:
                word_freq[word_clean] += 1

    # Vocabulary diversity (Type-Token Ratio) on the cleaned tokens
    unique_words = len(word_freq)
    token_count = sum(word_freq.values())
    ttr = unique_words / token_count if token_count > 0 else 0

    # Speech rate and sentence length (both lists are non-empty here)
    speech_rates = [entry['speech_rate_wpm'] for entry in rich_transcript]
    sentence_lengths = [entry['word_count'] for entry in rich_transcript]
    avg_speech_rate = float(np.mean(speech_rates))
    avg_sentence_length = float(np.mean(sentence_lengths))

    # Sentiment / emotion label counts
    sentiment_counts = dict(Counter(entry['sentiment'] for entry in rich_transcript))
    emotion_counts = dict(Counter(entry['emotion'] for entry in rich_transcript))

    # "Pause" = difference between consecutive entry timestamps
    pauses = [
        later['timestamp'] - earlier['timestamp']
        for earlier, later in zip(rich_transcript, rich_transcript[1:])
    ]
    avg_pause_duration = float(np.mean(pauses)) if pauses else 0

    return {
        'total_sentences': total_sentences,
        'total_words': total_words,
        'total_duration_seconds': total_duration,
        'unique_words': unique_words,
        'type_token_ratio': round(ttr, 3),
        'avg_sentence_length': round(avg_sentence_length, 1),
        'avg_speech_rate_wpm': round(avg_speech_rate, 1),
        'avg_pause_duration': round(avg_pause_duration, 1),
        'sentiment_distribution': sentiment_counts,
        'emotion_distribution': emotion_counts,
        # 20 most frequent words; ties keep first-seen order
        'word_frequency': dict(word_freq.most_common(20)),
        'speech_rate_variability': round(float(np.std(speech_rates)), 1),
    }
def generate_slp_analysis_prompt(rich_transcript, metrics, age, gender, slp_notes=""):
    """Build the analysis prompt sent to the language model.

    Args:
        rich_transcript: Entry list, rendered with ``format_rich_transcript``.
        metrics: Metrics dict as returned by ``calculate_slp_metrics`` for a
            non-empty transcript (all of its keys are read here).
        age: Patient age, interpolated as-is.
        gender: Patient gender, interpolated as-is.
        slp_notes: Optional clinician notes; ``None``, empty and
            whitespace-only values add no notes section.

    Returns:
        The prompt text.
    """
    # Format metrics for the prompt
    metrics_text = f"""
TRANSCRIPT METRICS:
- Total sentences: {metrics['total_sentences']}
- Total words: {metrics['total_words']}
- Duration: {metrics['total_duration_seconds']:.1f} seconds
- Type-Token Ratio: {metrics['type_token_ratio']} (vocabulary diversity)
- Average sentence length: {metrics['avg_sentence_length']} words
- Average speech rate: {metrics['avg_speech_rate_wpm']} words per minute
- Speech rate variability: {metrics['speech_rate_variability']} wpm
- Average pause duration: {metrics['avg_pause_duration']:.1f} seconds
SENTIMENT DISTRIBUTION: {metrics['sentiment_distribution']}
EMOTION DISTRIBUTION: {metrics['emotion_distribution']}
MOST FREQUENT WORDS: {list(metrics['word_frequency'])[:10]}
"""

    # Format rich transcript for analysis
    transcript_text = format_rich_transcript(rich_transcript)

    # Only add the notes heading when there is something under it.
    notes = (slp_notes or "").strip()
    notes_section = f"\nSLP CLINICAL NOTES:\n{notes}" if notes else ""

    prompt = f"""
You are a speech-language pathologist conducting a comprehensive analysis of a speech transcript with rich metadata.
PATIENT: {age}-year-old {gender}
{metrics_text}
TRANSCRIPT WITH METADATA:
{transcript_text}{notes_section}
Please provide a comprehensive analysis including:
1. SPEECH FLUENCY ANALYSIS:
- Speech rate patterns and variability
- Pause patterns and their significance
- Overall fluency assessment
2. LANGUAGE COMPLEXITY:
- Vocabulary diversity and word frequency patterns
- Sentence structure and complexity
- Language development level assessment
3. EMOTIONAL AND AFFECTIVE ANALYSIS:
- Sentiment patterns throughout the transcript
- Emotional expression and regulation
- Impact on communication effectiveness
4. SPEECH FACTORS:
- Word retrieval patterns
- Grammatical accuracy
- Repetitions and revisions
5. CLINICAL IMPLICATIONS:
- Specific intervention targets
- Strengths and areas for improvement
- Recommendations for therapy
6. COMPREHENSIVE SUMMARY:
- Overall communication profile
- Developmental appropriateness
- Prognosis and treatment priorities
Use the quantitative metrics and qualitative observations to support your analysis.
"""
    return prompt
def analyze_rich_transcript(rich_transcript, age, gender, slp_notes=""):
    """Produce an analysis report for a rich transcript.

    Args:
        rich_transcript: List of per-sentence entry dicts.
        age: Patient age, forwarded to the prompt builder.
        gender: Patient gender, forwarded to the prompt builder.
        slp_notes: Optional clinician notes, forwarded to the prompt builder.

    Returns:
        The Claude API response when ``ANTHROPIC_API_KEY`` is set, the
        locally generated demo report otherwise, or a fixed message when the
        transcript is empty.
    """
    if not rich_transcript:
        return "No transcript data available for analysis."

    # Calculate SLP metrics
    metrics = calculate_slp_metrics(rich_transcript)

    # Without a key there is nothing to send, so skip building the prompt.
    if not ANTHROPIC_API_KEY:
        return generate_demo_analysis(rich_transcript, metrics)

    prompt = generate_slp_analysis_prompt(rich_transcript, metrics, age, gender, slp_notes)
    return call_claude_api(prompt)
def generate_demo_analysis(rich_transcript, metrics):
    """Build a rule-based report from the metrics alone (used when no API key is set).

    Args:
        rich_transcript: Unused; kept so the signature stays stable.
        metrics: Metrics dict as returned by ``calculate_slp_metrics`` for a
            non-empty transcript.

    Returns:
        Markdown-formatted report text.  Each qualitative label is derived
        once, so the "Language complexity" and "Clinical implications"
        sections always agree (a Type-Token Ratio from 0.3 to 0.4 is
        "Moderate" in both).
    """
    rate = metrics['avg_speech_rate_wpm']
    variability = metrics['speech_rate_variability']
    pause = metrics['avg_pause_duration']
    ttr = metrics['type_token_ratio']
    sentence_length = metrics['avg_sentence_length']

    rate_typical = 120 <= rate <= 180
    if rate_typical:
        rate_desc = 'within normal limits'
    elif rate < 120:
        rate_desc = 'below typical range'
    else:
        rate_desc = 'above typical range'

    rate_consistent = variability < 20

    if 0.5 <= pause <= 2.0:
        pause_desc = 'Appropriate'
    elif pause < 0.5:
        pause_desc = 'Short'
    else:
        pause_desc = 'Long'

    if ttr > 0.4:
        vocab_desc = 'Good'
    elif ttr < 0.3:
        vocab_desc = 'Limited'
    else:
        vocab_desc = 'Moderate'

    if 5 <= sentence_length <= 12:
        length_desc = 'age-appropriate'
    elif sentence_length < 5:
        length_desc = 'below age expectations'
    else:
        length_desc = 'above age expectations'

    top_words = ', '.join(list(metrics['word_frequency'])[:5])

    return f"""## Comprehensive SLP Analysis
### SPEECH FLUENCY ANALYSIS
**Speech Rate**: {rate} words per minute (variability: {variability} wpm)
- Speech rate appears {rate_desc}
- Variability suggests {'consistent' if rate_consistent else 'variable'} speech patterns
**Pause Analysis**: Average pause duration of {pause:.1f} seconds
- {pause_desc} pauses between utterances
### LANGUAGE COMPLEXITY
**Vocabulary Diversity**: Type-Token Ratio of {ttr}
- {vocab_desc} vocabulary diversity
**Sentence Structure**: Average {sentence_length} words per sentence
- Sentence length appears {length_desc}
**Most Frequent Words**: {top_words}
### EMOTIONAL AND AFFECTIVE ANALYSIS
**Sentiment Distribution**: {metrics['sentiment_distribution']}
**Emotion Distribution**: {metrics['emotion_distribution']}
### CLINICAL IMPLICATIONS
Based on the quantitative analysis, this patient shows:
- {vocab_desc} vocabulary diversity
- {'Appropriate' if rate_typical else 'Atypical'} speech rate
- {'Consistent' if rate_consistent else 'Variable'} speech patterns
### RECOMMENDATIONS
1. Focus on vocabulary expansion if TTR < 0.4
2. Address speech rate if outside normal range
3. Work on sentence complexity if below age expectations
4. Consider emotional regulation strategies based on sentiment patterns"""
# Patterns for the pieces of a line produced by format_rich_transcript():
# "[MM:SS] *PAR: <sentence> [Words: N, Rate: Rwpm] [Sentiment: l (s)] [Emotion: l (s)]"
_PAR_SENTENCE_RE = re.compile(r'\*PAR: (.+?)(?=\s*\[|$)')
_LINE_TIMESTAMP_RE = re.compile(r'^\s*\[(\d+):(\d{2})\]')
_LINE_RATE_RE = re.compile(r'\[Words: \d+, Rate: (\d+(?:\.\d+)?)wpm\]')
_LINE_SENTIMENT_RE = re.compile(r'\[Sentiment: (.+?) \((\d+(?:\.\d+)?)\)\]')
_LINE_EMOTION_RE = re.compile(r'\[Emotion: (.+?) \((\d+(?:\.\d+)?)\)\]')


def _parse_formatted_transcript(text):
    """Rebuild rich-transcript entries from text laid out by format_rich_transcript().

    Lines without a ``*PAR:`` sentence are skipped.  Metadata found on a line
    is recovered; anything missing (for example in hand-typed text) falls
    back to the previous placeholder values: 2 s per line, 120 wpm, neutral.
    """
    entries = []
    for index, line in enumerate(text.split('\n')):
        if not line.strip():
            continue
        sentence_match = _PAR_SENTENCE_RE.search(line)
        if not sentence_match:
            continue

        sentence = sentence_match.group(1).strip()
        words = sentence.split()

        timestamp = index * 2
        stamp = _LINE_TIMESTAMP_RE.match(line)
        if stamp:
            timestamp = int(stamp.group(1)) * 60 + int(stamp.group(2))

        speech_rate = 120.0
        rate = _LINE_RATE_RE.search(line)
        if rate:
            speech_rate = float(rate.group(1))

        sentiment_label, sentiment_score = 'neutral', 0.5
        sentiment = _LINE_SENTIMENT_RE.search(line)
        if sentiment:
            sentiment_label = sentiment.group(1)
            sentiment_score = float(sentiment.group(2))

        emotion_label, emotion_score = 'neutral', 0.5
        emotion = _LINE_EMOTION_RE.search(line)
        if emotion:
            emotion_label = emotion.group(1)
            emotion_score = float(emotion.group(2))

        entries.append({
            'timestamp': timestamp,
            'sentence': sentence,
            'word_count': len(words),
            'avg_word_length': float(np.mean([len(word) for word in words])) if words else 0,
            'speech_rate_wpm': speech_rate,
            'sentiment': sentiment_label,
            'sentiment_score': sentiment_score,
            'emotion': emotion_label,
            'emotion_score': emotion_score,
        })
    return entries


def create_enhanced_interface():
    """Build the Gradio Blocks app: audio transcription, analysis and metrics tabs.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    # NOTE(review): emoji in the labels below were mis-encoded in the source.
    # The speaking-head and microphone are recoverable from the garbled text;
    # the chart and magnifier icons are best guesses - confirm.
    with gr.Blocks(title="Enhanced CASL Analysis Tool", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🗣️ Enhanced CASL Analysis Tool")
        gr.Markdown("Advanced speech analysis with sentiment, timestamps, and comprehensive SLP metrics")

        with gr.Tabs():
            # Audio Upload & Transcription Tab
            with gr.Tab("🎤 Audio Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Audio Upload")
                        audio_input = gr.Audio(
                            type="filepath",
                            label="Upload Audio Recording",
                        )
                        transcribe_btn = gr.Button(
                            "🎤 Transcribe & Analyze",
                            variant="primary",
                            size="lg",
                        )
                        transcription_status = gr.Markdown("")
                    with gr.Column(scale=2):
                        gr.Markdown("### Rich Transcript")
                        rich_transcript_display = gr.Textbox(
                            label="Transcription with Timestamps & Sentiment",
                            lines=15,
                            max_lines=20,
                        )

            # Analysis Tab
            with gr.Tab("📊 Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Patient Information")
                        with gr.Row():
                            age = gr.Number(label="Age", value=8, minimum=1, maximum=120)
                            gender = gr.Radio(["male", "female", "other"], label="Gender", value="male")
                        slp_notes = gr.Textbox(
                            label="SLP Clinical Notes (Optional)",
                            placeholder="Enter additional clinical observations...",
                            lines=3,
                        )
                        analyze_btn = gr.Button(
                            "🔍 Analyze Transcript",
                            variant="primary",
                            size="lg",
                        )
                    with gr.Column(scale=2):
                        gr.Markdown("### Comprehensive Analysis")
                        analysis_output = gr.Textbox(
                            label="SLP Analysis Report",
                            lines=25,
                            max_lines=30,
                        )

            # Metrics Tab
            with gr.Tab("📈 Metrics Dashboard"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Quantitative Metrics")
                        metrics_display = gr.JSON(
                            label="SLP Metrics",
                            interactive=False,
                        )
                    with gr.Column():
                        gr.Markdown("### Word Frequency")
                        word_freq_display = gr.Dataframe(
                            headers=["Word", "Frequency"],
                            label="Most Frequent Words",
                            interactive=False,
                        )

        # Event handlers
        def on_transcribe(audio_file):
            """Transcribe the uploaded file and return (formatted text, status)."""
            if not audio_file:
                return "", "Please upload an audio file first."
            rich_transcript, status = transcribe_audio_with_metadata(audio_file)
            if rich_transcript:
                return format_rich_transcript(rich_transcript), status
            return "", status

        def on_analyze(rich_transcript_text, age_val, gender_val, notes):
            """Analyze the transcript currently shown in the transcript textbox."""
            if not rich_transcript_text or rich_transcript_text == "No transcript data available":
                return "Please transcribe audio first."
            rich_transcript = _parse_formatted_transcript(rich_transcript_text)
            return analyze_rich_transcript(rich_transcript, age_val, gender_val, notes)

        def update_metrics(rich_transcript_text):
            """Recompute the metrics JSON and word-frequency rows from the textbox."""
            if not rich_transcript_text or rich_transcript_text == "No transcript data available":
                return {}, []
            rich_transcript = _parse_formatted_transcript(rich_transcript_text)
            metrics = calculate_slp_metrics(rich_transcript)
            # ``metrics`` is {} when no line could be parsed; avoid a KeyError.
            word_freq = metrics.get('word_frequency', {})
            word_freq_data = [[word, freq] for word, freq in list(word_freq.items())[:20]]
            return metrics, word_freq_data

        # Connect event handlers
        transcribe_btn.click(
            on_transcribe,
            inputs=[audio_input],
            outputs=[rich_transcript_display, transcription_status],
        )
        analyze_btn.click(
            on_analyze,
            inputs=[rich_transcript_display, age, gender, slp_notes],
            outputs=[analysis_output],
        )
        # Update metrics when transcript changes
        rich_transcript_display.change(
            update_metrics,
            inputs=[rich_transcript_display],
            outputs=[metrics_display, word_freq_display],
        )

    return app
if __name__ == "__main__":
    # Startup banner: report which optional capabilities are active, then
    # build and launch the UI.
    # NOTE(review): the emoji below were mis-encoded in the source; the
    # warning sign is recoverable from the garbled text, the others are
    # best guesses - confirm.
    print("🚀 Starting Enhanced CASL Analysis Tool...")

    if ANTHROPIC_API_KEY:
        print("✅ Claude API configured")
    else:
        print("⚠️ ANTHROPIC_API_KEY not configured - analysis will show demo response")
        print(" For HuggingFace Spaces: Add ANTHROPIC_API_KEY as a secret in your space settings")
        print(" For local use: export ANTHROPIC_API_KEY='your-key-here'")

    if SPEECHBRAIN_AVAILABLE:
        print("✅ SpeechBrain and HuggingFace models loaded")
    else:
        print("⚠️ SpeechBrain not available - audio transcription will use demo mode")
        print(" Install with: pip install speechbrain transformers torch")

    app = create_enhanced_interface()
    app.launch(show_api=False)