CodeGovindz committed on
Commit 6deff13 · 0 Parent(s)

Initial commit: Manan ML API for emotion detection

Files changed (4)
  1. Dockerfile +40 -0
  2. README.md +105 -0
  3. app.py +561 -0
  4. requirements.txt +17 -0
Dockerfile ADDED
@@ -0,0 +1,40 @@
+ FROM python:3.10-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     libgl1-mesa-glx \
+     libglib2.0-0 \
+     libsm6 \
+     libxext6 \
+     libxrender-dev \
+     libgomp1 \
+     ffmpeg \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first for caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Create directory for models
+ RUN mkdir -p /app/pretrained_models
+
+ # Expose port (Hugging Face uses 7860)
+ EXPOSE 7860
+
+ # Set environment variables
+ ENV PYTHONUNBUFFERED=1
+ ENV TRANSFORMERS_CACHE=/app/.cache
+ ENV HF_HOME=/app/.cache
+
+ # Run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,105 @@
+ ---
+ title: Manan ML API
+ emoji: 🧠
+ colorFrom: green
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ license: mit
+ ---
+
+ # Manan ML API - Mental Health Emotion Recognition
+
+ This is the ML backend for the **Manan (मनन)** mental health analysis app.
+
+ ## Features
+
+ - **Face Emotion Recognition**: DeepFace detects 7 emotions (Angry, Disgust, Fear, Happy, Sad, Surprise, Neutral)
+ - **Voice Emotion Recognition**: SpeechBrain's Wav2Vec2-IEMOCAP model classifies emotion directly from speech
+ - **Text Emotion Recognition**: Whisper transcribes the audio, then DistilBERT classifies the emotion of the transcript
+
+ ## API Endpoints
+
+ ### Health Check
+ ```
+ GET /
+ GET /health
+ ```
+
+ ### Face Emotion Prediction
+ ```
+ POST /pred_face
+ - files: List of image files
+ - questions: JSON string with question metadata
+ ```
+
+ ### Voice Emotion Prediction
+ ```
+ POST /predict_audio_batch
+ - files: List of audio files (WAV format)
+ ```
+
+ ### Text Emotion Prediction
+ ```
+ POST /predict_text/
+ - files: List of audio files (WAV format)
+ - Returns: transcript + emotion
+ ```
+
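+ ### Example Request
+
+ A minimal request sketch using Python's `requests` (the base URL is a placeholder for your deployment):
+
+ ```python
+ import requests
+
+ # Voice emotion prediction: upload one or more WAV recordings
+ with open("answer.wav", "rb") as f:
+     resp = requests.post(
+         "https://YOUR-SPACE.hf.space/predict_audio_batch",
+         files=[("files", ("answer.wav", f, "audio/wav"))],
+     )
+ resp.raise_for_status()
+ for result in resp.json()["results"]:
+     print(result["emotion"], result["confidence"])
+ ```
+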
+ ## Models Used
+
+ 1. **DeepFace** - Facial emotion recognition
+ 2. **OpenAI Whisper (base)** - Speech-to-text
+ 3. **SpeechBrain Wav2Vec2-IEMOCAP** - Voice emotion recognition
+ 4. **DistilBERT** - Text emotion classification (a fine-tuned local model if present, otherwise `bhadresh-savani/distilbert-base-uncased-emotion` from the Hub)
+
+ ## Usage
+
+ The API is the ML backend for the Manan Flutter mobile app, which combines the three endpoints for multimodal emotion analysis; it can also be called directly, as sketched below.
+
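+ A sketch of a face-emotion request (filenames follow the `q<index>_` convention that `/pred_face` uses to group images by question; the base URL is a placeholder):
+
+ ```python
+ import json
+ import requests
+
+ BASE_URL = "https://YOUR-SPACE.hf.space"  # placeholder deployment URL
+
+ # One image per question; the metadata mirrors the structure app.py expects
+ questions = {"0": {"text": "Question 1", "imageCount": 1}}
+ with open("q0_frame1.jpg", "rb") as img:
+     resp = requests.post(
+         f"{BASE_URL}/pred_face",
+         files=[("files", ("q0_frame1.jpg", img, "image/jpeg"))],
+         data={"questions": json.dumps(questions)},
+     )
+ resp.raise_for_status()
+ for question_result in resp.json():
+     print(question_result["emotion"], question_result["probabilities"])
+ ```
+
+ ## Running Locally
+
+ Build and run the container with Docker (the image name `manan-ml-api` is arbitrary): `docker build -t manan-ml-api .`, then `docker run -p 7860:7860 manan-ml-api`; the API listens on `http://localhost:7860`.
+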
+ ## License
+
+ MIT License
app.py ADDED
@@ -0,0 +1,561 @@
+ import os
+ import tempfile
+ import logging
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import whisper
+ import librosa
+ import asyncio
+ from typing import List, Dict, Any, Optional
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Request, Form
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse
+ from PIL import Image
+ from torchvision import transforms
+ from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
+ from speechbrain.inference.classifiers import EncoderClassifier
+ import torchaudio
+ import json
+
+ # Setup logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(message)s",
+ )
+ logger = logging.getLogger(__name__)
+
+ class ModelManager:
+     """Centralized model management for all ML models (lazy-loading singleton)."""
+
+     _instance = None
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super(ModelManager, cls).__new__(cls)
+             cls._instance._initialized = False
+         return cls._instance
+
+     def __init__(self):
+         if self._initialized:
+             return
+
+         self._initialized = True
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         logger.info(f"Using device: {self.device}")
+
+         self.emotion_model = None
+         self.whisper_model = None
+         self.text_tokenizer = None
+         self.text_model = None
+         self.speechbrain_model = None
+
+         # Model paths
+         self.MODEL_PATHS = {
+             'whisper_model': 'base',
+             'text_model': 'emotion-distilbert-model',
+             'speechbrain_model': 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP'
+         }
+
+         # Constants
+         self.EMOTIONS = ["Angry", "Disgust", "Fear", "Happy", "Sad", "Surprise", "Neutral"]
+         self.SAMPLE_RATE = 16000
+         self.TEXT_EMOTIONS = ["sadness", "joy", "love", "anger", "fear", "surprise"]
+
+         # SpeechBrain emotion mapping
+         self.SPEECHBRAIN_EMOTION_MAP = {
+             'neu': 'Neutral',
+             'hap': 'Happy',
+             'sad': 'Sad',
+             'ang': 'Angry',
+             'fea': 'Fear',
+             'dis': 'Disgust',
+             'sur': 'Surprise'
+         }
+
+     def load_all_models(self):
+         """Load all required models."""
+         try:
+             logger.info("Starting to load all models...")
+             self._load_emotion_model()
+             self._load_whisper_model()
+             self._load_text_models()
+             self._load_speechbrain_model()
+             logger.info("All models loaded successfully!")
+             return True
+         except Exception as e:
+             logger.error(f"Error loading models: {str(e)}")
+             raise
+
+     def _load_emotion_model(self):
+         """Use DeepFace for emotion recognition."""
+         try:
+             logger.info("Loading DeepFace for emotion recognition...")
+             from deepface import DeepFace
+             self.emotion_model = DeepFace
+             logger.info("DeepFace loaded successfully")
+         except Exception as e:
+             logger.error(f"Failed to initialize DeepFace: {str(e)}")
+             raise
+
+     def _load_whisper_model(self):
+         """Load the Whisper speech-to-text model."""
+         try:
+             logger.info("Loading Whisper model...")
+             self.whisper_model = whisper.load_model(self.MODEL_PATHS['whisper_model'])
+             logger.info("Whisper model loaded successfully")
+         except Exception as e:
+             logger.error(f"Failed to load Whisper model: {str(e)}")
+             raise
+
+     def _load_text_models(self):
+         """Load the text emotion classification model and tokenizer."""
+         try:
+             logger.info("Loading text emotion model...")
+             model_path = self.MODEL_PATHS['text_model']
+
+             # Try to load from local path first, then from HuggingFace Hub
+             if os.path.exists(model_path):
+                 self.text_tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
+                 self.text_model = DistilBertForSequenceClassification.from_pretrained(model_path)
+             else:
+                 # Use a public emotion model from HuggingFace
+                 logger.info("Local model not found, using HuggingFace model...")
+                 self.text_tokenizer = DistilBertTokenizerFast.from_pretrained("bhadresh-savani/distilbert-base-uncased-emotion")
+                 self.text_model = DistilBertForSequenceClassification.from_pretrained("bhadresh-savani/distilbert-base-uncased-emotion")
+
+             self.text_model.eval()
+             logger.info("Text models loaded successfully")
+
+         except Exception as e:
+             logger.error(f"Failed to load text models: {str(e)}")
+             raise
+
+     def _load_speechbrain_model(self):
+         """Load SpeechBrain emotion recognition model."""
+         try:
+             logger.info("Loading SpeechBrain emotion recognition model...")
+             self.speechbrain_model = EncoderClassifier.from_hparams(
+                 source=self.MODEL_PATHS['speechbrain_model'],
+                 savedir="pretrained_models/emotion-recognition-wav2vec2-IEMOCAP",
+                 run_opts={"device": "cpu"}
+             )
+             logger.info("SpeechBrain emotion recognition model loaded successfully")
+         except Exception as e:
+             logger.error(f"Failed to load SpeechBrain model: {str(e)}")
+             raise
+
+     def get_emotion_model(self):
+         if self.emotion_model is None:
+             self._load_emotion_model()
+         return self.emotion_model
+
+     def get_whisper_model(self):
+         if self.whisper_model is None:
+             self._load_whisper_model()
+         return self.whisper_model
+
+     def get_text_models(self):
+         if self.text_model is None or self.text_tokenizer is None:
+             self._load_text_models()
+         return self.text_tokenizer, self.text_model
+
+     def get_speechbrain_model(self):
+         if self.speechbrain_model is None:
+             self._load_speechbrain_model()
+         return self.speechbrain_model
+
+
+ # Initialize FastAPI app
+ app = FastAPI(title="Manan ML API - Emotion Recognition")
+
+ # CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+     expose_headers=["*"]
+ )
+
+ # Initialize model manager
+ model_manager = ModelManager()
+
+ # Image transformation pipeline
+ transform = transforms.Compose([
+     transforms.Resize((224, 224)),
+     transforms.ToTensor(),
+     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ])
+
+
+ @app.on_event("startup")
+ async def startup_event():
+     """Initialize all models when the application starts."""
+     try:
+         logger.info("Starting model initialization...")
+         model_manager.load_all_models()
+         logger.info("All models initialized successfully!")
+     except Exception as e:
+         logger.error(f"Failed to initialize models: {str(e)}")
+         # Don't raise - let the app start and load models on demand
+
+
+ @app.get("/")
+ async def root():
+     """Root endpoint: reports status and lists the available routes."""
+     return {
+         "status": "running",
+         "message": "Manan ML API is running!",
+         "endpoints": [
+             "/pred_face - Face emotion prediction",
+             "/predict_audio_batch - Voice emotion prediction",
+             "/predict_text/ - Text emotion prediction"
+         ]
+     }
+
+
+ @app.get("/health")
+ async def health_check():
+     """Health check endpoint."""
+     return {"status": "healthy", "device": str(model_manager.device)}
+
+
+ # Helper function for SpeechBrain prediction
+ def predict_emotion_speechbrain(audio_path: str) -> Dict[str, Any]:
+     """Predict emotion from audio using SpeechBrain."""
+     try:
+         speechbrain_model = model_manager.get_speechbrain_model()
+
+         signal, sr = torchaudio.load(audio_path)
+
+         if sr != 16000:
+             resampler = torchaudio.transforms.Resample(sr, 16000)
+             signal = resampler(signal)
+
+         if signal.dim() == 1:
+             signal = signal.unsqueeze(0)
+         elif signal.dim() == 3:
+             signal = signal.squeeze(1)
+
+         device = next(speechbrain_model.mods.wav2vec2.parameters()).device
+         signal = signal.to(device)
+
+         with torch.no_grad():
+             feats = speechbrain_model.mods.wav2vec2(signal)
+             pooled = speechbrain_model.mods.avg_pool(feats)
+             out = speechbrain_model.mods.output_mlp(pooled)
+             out_prob = speechbrain_model.hparams.softmax(out)
+
+         score, index = torch.max(out_prob, dim=-1)
+         predicted_emotion = speechbrain_model.hparams.label_encoder.decode_ndim(index.cpu())
+
+         if isinstance(predicted_emotion, list):
+             if isinstance(predicted_emotion[0], list):
+                 emotion_key = str(predicted_emotion[0][0]).lower()[:3]
+             else:
+                 emotion_key = str(predicted_emotion[0]).lower()[:3]
+         else:
+             emotion_key = str(predicted_emotion).lower()[:3]
+
+         emotion = model_manager.SPEECHBRAIN_EMOTION_MAP.get(emotion_key, 'Neutral')
+         probs = out_prob[0].detach().cpu().numpy()
+
+         if probs.ndim > 1:
+             probs = probs.flatten()
+
+         all_emotions = speechbrain_model.hparams.label_encoder.decode_ndim(
+             torch.arange(len(probs))
+         )
+         prob_dict = {}
+         for i in range(len(probs)):
+             if i < len(all_emotions):
+                 if isinstance(all_emotions[i], list):
+                     key = str(all_emotions[i][0]).lower()[:3]
+                 else:
+                     key = str(all_emotions[i]).lower()[:3]
+                 emotion_name = model_manager.SPEECHBRAIN_EMOTION_MAP.get(key, f'emotion_{i}')
+                 prob_dict[emotion_name] = float(probs[i])
+
+         confidence = float(score[0])
+
+         return {
+             'emotion': emotion,
+             'confidence': confidence,
+             'probabilities': prob_dict
+         }
+
+     except Exception as e:
+         logger.error(f"Error predicting emotion with SpeechBrain: {str(e)}")
+         raise
+
+
+ def transcribe_audio(audio_path: str) -> str:
+     """Transcribe audio to text using Whisper."""
+     try:
+         result = model_manager.get_whisper_model().transcribe(audio_path)
+         return result["text"].strip()
+     except Exception as e:
+         logger.error(f"Error in audio transcription: {str(e)}")
+         return ""
+
+
+ # ============== API ENDPOINTS ==============
+
+ @app.post("/pred_face")
+ async def predict_face_emotion(
+     files: List[UploadFile] = File(...),
+     questions: str = Form(None)
+ ):
+     """Predict emotions from face images using DeepFace."""
+     from deepface import DeepFace
+
+     logger.info(f"Received {len(files)} files for face prediction")
+     if not files:
+         raise HTTPException(status_code=400, detail="No files provided")
+
+     temp_files = []
+
+     try:
+         questions_data = {}
+         question_count = 0
+
+         if questions:
+             try:
+                 questions_data = json.loads(questions)
+                 question_count = len(questions_data)
+             except json.JSONDecodeError:
+                 raise HTTPException(status_code=400, detail="Invalid questions JSON format.")
+         else:
+             question_count = 3
+             questions_data = {str(i): {"text": f"Question {i+1}", "imageCount": 1} for i in range(question_count)}
+
+         question_files = {str(i): [] for i in range(question_count)}
+         for file in files:
+             if '_' in file.filename and file.filename.startswith('q'):
+                 try:
+                     q_idx = file.filename.split('_')[0][1:]
+                     if q_idx in question_files:
+                         question_files[q_idx].append(file)
+                 except Exception as e:
+                     logger.warning(f"Skipping file {file.filename}: {e}")
+
+         results = []
+
+         for q_idx, q_files in question_files.items():
+             if not q_files:
+                 results.append({
+                     "emotion": "Unknown",
+                     "probabilities": {e: 0.0 for e in model_manager.EMOTIONS}
+                 })
+                 continue
+
+             probs_list = []
+
+             for file in q_files:
+                 try:
+                     with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
+                         content = await file.read()
+                         tmp.write(content)
+                         temp_path = tmp.name
+                         temp_files.append(temp_path)
+
+                     analysis = DeepFace.analyze(
+                         img_path=temp_path,
+                         actions=['emotion'],
+                         enforce_detection=False,
+                         silent=True
+                     )
+
+                     if isinstance(analysis, list):
+                         analysis = analysis[0]
+
+                     emotion_scores = analysis.get('emotion', {})
+                     dominant_emotion = analysis.get('dominant_emotion', 'neutral')
+
+                     normalized_probs = {}
+                     for emo in model_manager.EMOTIONS:
+                         key = emo.lower()
+                         normalized_probs[emo] = emotion_scores.get(key, 0.0) / 100.0
+
+                     probs_list.append(normalized_probs)
+
+                 except Exception as e:
+                     logger.error(f"Error processing {file.filename}: {e}")
+
+             if probs_list:
+                 avg_probs = {}
+                 for emo in model_manager.EMOTIONS:
+                     avg_probs[emo] = sum(p.get(emo, 0) for p in probs_list) / len(probs_list)
+
+                 dominant_emotion = max(avg_probs, key=avg_probs.get)
+                 results.append({
+                     "emotion": dominant_emotion,
+                     "probabilities": avg_probs
+                 })
+             else:
+                 results.append({
+                     "emotion": "Unknown",
+                     "probabilities": {e: 0.0 for e in model_manager.EMOTIONS}
+                 })
+
+         return results
+
+     except HTTPException:
+         raise  # propagate 4xx errors (e.g., invalid questions JSON) instead of masking them as 500s
+     except Exception as e:
+         logger.error(f"Error in face emotion prediction: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+     finally:
+         for file_path in temp_files:
+             try:
+                 if os.path.exists(file_path):
+                     os.remove(file_path)
+             except Exception as e:
+                 logger.warning(f"Failed to delete temp file {file_path}: {e}")
+
+
+ @app.post("/predict_audio_batch")
+ async def predict_audio_batch(files: List[UploadFile] = File(...)):
+     """Predict emotions from multiple audio files using SpeechBrain."""
+     logger.info(f"Received {len(files)} audio files for prediction")
+
+     if not files:
+         raise HTTPException(status_code=400, detail="No audio files provided")
+
+     temp_files = []
+     results = []
+
+     try:
+         for file in files:
+             try:
+                 with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+                     content = await file.read()
+                     tmp.write(content)
+                     temp_path = tmp.name
+                     temp_files.append(temp_path)
+
+                 prediction = predict_emotion_speechbrain(temp_path)
+                 results.append(prediction)
+                 logger.info(f"Predicted emotion for {file.filename}: {prediction['emotion']}")
+
+             except Exception as e:
+                 logger.error(f"Error processing {file.filename}: {e}")
+                 results.append({
+                     'emotion': 'Unknown',
+                     'confidence': 0.0,
+                     'probabilities': {},
+                     'error': str(e)
+                 })
+
+         return {'status': 'success', 'results': results}
+
+     except Exception as e:
+         logger.error(f"Error in audio batch prediction: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+     finally:
+         for file_path in temp_files:
+             try:
+                 if os.path.exists(file_path):
+                     os.remove(file_path)
+             except Exception as e:
+                 logger.warning(f"Failed to delete temp file {file_path}: {e}")
+
+
+ @app.post("/predict_text/")
+ async def predict_text_emotion(files: List[UploadFile] = File(...)):
+     """Transcribe audio and predict text emotion."""
+     logger.info(f"Received {len(files)} audio files for text prediction")
+
+     if not files:
+         raise HTTPException(status_code=400, detail="No audio files provided")
+
+     temp_files = []
+     results = []
+
+     try:
+         tokenizer, text_model = model_manager.get_text_models()
+         whisper_model = model_manager.get_whisper_model()
+
+         for file in files:
+             try:
+                 with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+                     content = await file.read()
+                     tmp.write(content)
+                     temp_path = tmp.name
+                     temp_files.append(temp_path)
+
+                 # Transcribe
+                 transcription = whisper_model.transcribe(temp_path)
+                 transcript = transcription["text"].strip()
+                 logger.info(f"Transcribed: {transcript}")
+
+                 if not transcript:
+                     results.append({
+                         'transcript': '',
+                         'emotion': 'neutral',
+                         'confidence': 0.0,
+                         'probabilities': {}
+                     })
+                     continue
+
+                 # Predict emotion from text
+                 inputs = tokenizer(
+                     transcript,
+                     return_tensors="pt",
+                     truncation=True,
+                     max_length=128,
+                     padding=True
+                 )
+
+                 with torch.no_grad():
+                     outputs = text_model(**inputs)
+                     probs = torch.softmax(outputs.logits, dim=1)[0]
+
+                 # Get emotion labels
+                 emotion_labels = model_manager.TEXT_EMOTIONS
+                 if hasattr(text_model.config, 'id2label'):
+                     emotion_labels = [text_model.config.id2label[i] for i in range(len(probs))]
+
+                 prob_dict = {emotion_labels[i]: float(probs[i]) for i in range(len(probs))}
+                 predicted_idx = torch.argmax(probs).item()
+                 predicted_emotion = emotion_labels[predicted_idx]
+                 confidence = float(probs[predicted_idx])
+
+                 results.append({
+                     'transcript': transcript,
+                     'emotion': predicted_emotion,
+                     'confidence': confidence,
+                     'probabilities': prob_dict
+                 })
+
+             except Exception as e:
+                 logger.error(f"Error processing {file.filename}: {e}")
+                 results.append({
+                     'transcript': '',
+                     'emotion': 'unknown',
+                     'confidence': 0.0,
+                     'error': str(e)
+                 })
+
+         return results
+
+     except Exception as e:
+         logger.error(f"Error in text prediction: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+     finally:
+         for file_path in temp_files:
+             try:
+                 if os.path.exists(file_path):
+                     os.remove(file_path)
+             except Exception as e:
+                 logger.warning(f"Failed to delete temp file {file_path}: {e}")
+
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ fastapi==0.115.0
+ uvicorn==0.30.0
+ python-multipart==0.0.9
+ numpy==1.26.4
+ torch==2.2.0
+ torchvision==0.17.0
+ torchaudio==2.2.0
+ openai-whisper==20240930
+ transformers==4.44.0
+ librosa==0.10.2
+ pillow==10.4.0
+ deepface==0.0.92
+ soundfile==0.12.1
+ audioread==3.0.1
+ speechbrain==1.0.0
+ pydantic==2.8.0
+ python-dotenv==1.0.1