Spaces:
Sleeping
Create app.py file
Browse files
app.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import evaluate

# --- 1. Load Image Captioning Models ---
# Model 1: BLIP (Detailed)
captioner_model1 = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Model 2: ViT-GPT2 (Concise)
captioner_model2 = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

# --- 2. Load NLP Analysis Models ---

# A. Zero-Shot Classifier (For Nuanced Sentiment/Vibe)
# This allows us to define specific labels like "action" or "sadness" instead of just +/-
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33")

# B. Semantic Similarity (For Model Agreement)
similarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# C. ROUGE Metric (For Object Identification Accuracy)
# This compares the AI caption to your "Ground Truth" text
rouge = evaluate.load("rouge")

# Define our Nuanced Labels.
# NOTE: the original list contained both "fear/scary" and "scared/fearful",
# which are the same category; duplicate labels split the zero-shot
# probability mass and understate the reported confidence, so only one
# fear label is kept.
VIBE_LABELS = ["peaceful/calm", "action/violence", "happy/joy", "sad/depressing", "fear/scary", "angry/mad", "neutral"]
|
| 28 |
+
|
| 29 |
+
# --- 3. Analysis Function ---
def analyze_image(image, ground_truth):
    """Caption an image with two models and compare/score the results.

    Args:
        image: Image supplied by the Gradio ``Image`` component
            (PIL image, per the interface's ``type="pil"``).
        ground_truth: Optional reference description typed by the user.
            When non-empty, ROUGE-L accuracy is computed per model.

    Returns:
        A formatted multi-line string containing both captions, the top
        zero-shot "vibe" label (with confidence) for each, the cosine
        similarity between the two captions' sentence embeddings, and —
        if a ground truth was provided — ROUGE-L scores for each model.
    """
    # 1. Generate a caption from each model; the pipeline returns a list
    # of dicts with a 'generated_text' key.
    cap1 = captioner_model1(image)[0]['generated_text']
    cap2 = captioner_model2(image)[0]['generated_text']

    # 2. Analyze nuance (zero-shot): report the highest-scoring label,
    # formatted with its confidence as a percentage.
    vibe1 = classifier(cap1, VIBE_LABELS)
    vibe1_label = f"{vibe1['labels'][0]} ({vibe1['scores'][0]:.1%})"
    vibe2 = classifier(cap2, VIBE_LABELS)
    vibe2_label = f"{vibe2['labels'][0]} ({vibe2['scores'][0]:.1%})"

    # 3. Model agreement: cosine similarity of the caption embeddings
    # (1.0 = identical meaning, 0.0 = unrelated).
    emb1 = similarity_model.encode(cap1, convert_to_tensor=True)
    emb2 = similarity_model.encode(cap2, convert_to_tensor=True)
    sim_score = util.pytorch_cos_sim(emb1, emb2).item()

    # 4. Accuracy vs ground truth (ROUGE-L) — only when a non-blank
    # reference text was provided. (Truthiness of the stripped string
    # replaces the redundant `!= ""` comparison.)
    rouge_stats = "N/A (No Ground Truth Provided)"
    if ground_truth and ground_truth.strip():
        r1 = rouge.compute(predictions=[cap1], references=[ground_truth])
        r2 = rouge.compute(predictions=[cap2], references=[ground_truth])
        rouge_stats = (
            f"Model 1 Accuracy (ROUGE-L): {r1['rougeL']:.3f}\n"
            f"Model 2 Accuracy (ROUGE-L): {r2['rougeL']:.3f}"
        )

    # 5. Format the combined report shown in the output textbox.
    output_str = (
        f"--- Model 1 (BLIP) ---\n"
        f"Caption: {cap1}\n"
        f"Nuance: {vibe1_label}\n\n"
        f"--- Model 2 (ViT-GPT2) ---\n"
        f"Caption: {cap2}\n"
        f"Nuance: {vibe2_label}\n\n"
        f"--- Statistics ---\n"
        f"Model Agreement (Similarity): {sim_score:.3f}\n"
        f"(1.0 = Perfect Match, 0.0 = Totally Different)\n\n"
        f"--- Accuracy (vs Ground Truth) ---\n"
        f"{rouge_stats}"
    )

    return output_str
|
| 82 |
+
|
| 83 |
+
# --- 4. Interface ---
# Example rows: (image path, ground-truth description) pairs used to
# pre-populate the Gradio examples gallery.
_EXAMPLE_ROWS = [
    ["images/1.png", "A peaceful dog on a sunny beach"],
    ["images/2.png", "Sad men carrying a casket at a funeral"],
    ["images/3.png", "Happy kids at a birthday party"],
    ["images/4.png", "An angry man in a car"],
    ["images/5.png", "Two people happy mountain biking"],
    ["images/6.png", "A man upset about his food at a restaurant"],
    ["images/7.png", "A couple happy at a restaurant"],
    ["images/8.png", "A sad woman reading a book"],
    ["images/9.png", "People scared at a movie"],
    ["images/10.png", "Two tigers fighting"],
]

# Input widgets: the image to caption plus an optional reference caption.
_INPUT_WIDGETS = [
    gr.Image(type="pil", label="Upload Image"),
    gr.Textbox(label="Ground Truth (Optional)", placeholder="Type what is actually in the image to test accuracy..."),
]

interface = gr.Interface(
    fn=analyze_image,
    inputs=_INPUT_WIDGETS,
    outputs="text",
    title="Multimodal Analysis: Nuance & Accuracy",
    description="Generates captions using two models, detects emotional nuance (Zero-Shot), and calculates accuracy scores against a ground truth description.",
    examples=_EXAMPLE_ROWS,
)
|
| 106 |
+
|
| 107 |
+
# Launch the Gradio server only when this file is run directly, not when
# it is imported (e.g. by the Spaces runtime or a test harness).
if __name__ == "__main__":
    interface.launch()
|