import torch
import gradio as gr
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import os

# 1. CONFIGURATION
# Hugging Face checkpoint used for both the model architecture and the
# feature extractor.
MODEL_ID = "facebook/wav2vec2-xls-r-300m"
# Local file expected to contain the state dict of the quantized model.
QUANTIZED_MODEL_PATH = "quantized_model.pth"

# 2. LOAD MODEL
print("Loading model architecture...")

# Build the two-label classifier and immediately wrap its Linear layers with
# dynamic int8 quantization. The resulting module layout has to match the one
# used when the state dict was saved, otherwise load_state_dict below fails.
model = torch.quantization.quantize_dynamic(
    Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_ID, num_labels=2),
    {torch.nn.Linear},
    dtype=torch.qint8,
)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)

# Restore the saved weights when the file is present; otherwise only warn so
# the app still starts, with the model keeping whatever weights it has so far.
if not os.path.exists(QUANTIZED_MODEL_PATH):
    print(f"Warning: {QUANTIZED_MODEL_PATH} not found. Using random weights (Model will not work correctly).")
else:
    print("Loading quantized weights...")
    # NOTE(review): torch.load unpickles the file, so it should only ever be
    # pointed at a trusted checkpoint.
    model.load_state_dict(torch.load(QUANTIZED_MODEL_PATH, map_location="cpu"))

# Inference only: switch to evaluation mode.
model.eval()

# 3. PREDICTION FUNCTION
def predict_audio(audio_path):
    """Score an audio clip as real or deepfake speech.

    Args:
        audio_path: Path to the audio file to analyse, or ``None`` when no
            audio was supplied (e.g. the user cleared the input).

    Returns:
        A dict with the keys ``"Deepfake"`` and ``"Real"`` mapped to their
        softmax probabilities on success. If ``audio_path`` is ``None`` the
        string ``"No Audio Provided"`` is returned, and if any processing
        step raises, a string starting with ``"Error processing audio: "``
        followed by the exception text.
    """
    if audio_path is None:
        return "No Audio Provided"

    # Rate used both for resampling and for the feature extractor call.
    target_rate = 16000

    try:
        # librosa decodes the file and resamples it to the target rate.
        waveform, _ = librosa.load(audio_path, sr=target_rate)

        encoded = feature_extractor(
            waveform,
            sampling_rate=target_rate,
            return_tensors="pt",
            padding=True,
        )

        # No gradients are needed for inference.
        with torch.no_grad():
            scores = torch.nn.functional.softmax(model(**encoded).logits, dim=-1)

        # This file treats index 0 as "Real" and index 1 as "Deepfake".
        first_item = scores[0]
        deepfake_score = first_item[1].item()
        genuine_score = first_item[0].item()
    except Exception as err:
        # Report the failure as text instead of raising into the UI.
        return f"Error processing audio: {err!s}"
    else:
        return {
            "Deepfake": deepfake_score,
            "Real": genuine_score,
        }

# 4. CREATE INTERFACE
# Input widget: accepts an uploaded file or a microphone recording. It hands
# the handler a file path (type="filepath") so librosa can load it directly.
audio_input = gr.Audio(
    sources=["upload", "microphone"],
    type="filepath",
    label="Upload or Record Audio",
)

# Output widget: a label component limited to the top two classes.
label_output = gr.Label(num_top_classes=2)

iface = gr.Interface(
    fn=predict_audio,
    inputs=audio_input,
    outputs=label_output,
    title="Deepfake Audio Detection API",
    description="Upload an audio file or record your voice to check if it's real or fake.",
)

# Start serving the app.
iface.launch()