jguimond committed
Commit ddda834 · verified · 1 Parent(s): 6888e8e

Create app.py file

Files changed (1): app.py +108 -0
app.py ADDED
@@ -0,0 +1,108 @@
import gradio as gr
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import evaluate

# --- 1. Load Image Captioning Models ---
# Model 1: BLIP (Detailed)
captioner_model1 = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Model 2: ViT-GPT2 (Concise)
captioner_model2 = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
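
# Note: each image-to-text pipeline returns a list of dicts, e.g.
# [{'generated_text': 'a dog running on the beach'}], which is why the
# captions are unpacked below via res[0]['generated_text'].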

# --- 2. Load NLP Analysis Models ---

# A. Zero-Shot Classifier (For Nuanced Sentiment/Vibe)
# This allows us to define specific labels like "action" or "sadness" instead of just +/-
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33")
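
# Note: the zero-shot pipeline returns {'sequence', 'labels', 'scores'} with
# labels already sorted by descending score, so index [0] is the top pick.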

# B. Semantic Similarity (For Model Agreement)
similarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
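
# Note: all-MiniLM-L6-v2 maps each caption to a 384-dimensional vector;
# cosine similarity between the two vectors serves as the agreement score.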

# C. ROUGE Metric (For Object Identification Accuracy)
# This compares the AI caption to your "Ground Truth" text
rouge = evaluate.load("rouge")
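
# Note: ROUGE-L measures longest-common-subsequence overlap between a
# prediction and a reference, yielding an F-measure between 0 and 1.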

# Define our Nuanced Labels
VIBE_LABELS = ["peaceful/calm", "action/violence", "happy/joy", "sad/depressing", "fear/scary", "angry/mad", "neutral"]

# --- 3. Analysis Function ---
def analyze_image(image, ground_truth):

    # 1. Generate Captions
    res1 = captioner_model1(image)
    cap1 = res1[0]['generated_text']

    res2 = captioner_model2(image)
    cap2 = res2[0]['generated_text']

    # 2. Analyze Nuance (Zero-Shot)
    # We take the top label for each
    vibe1 = classifier(cap1, VIBE_LABELS)
    vibe1_label = f"{vibe1['labels'][0]} ({vibe1['scores'][0]:.1%})"

    vibe2 = classifier(cap2, VIBE_LABELS)
    vibe2_label = f"{vibe2['labels'][0]} ({vibe2['scores'][0]:.1%})"

    # 3. Calculate Similarity (Model Agreement)
    emb1 = similarity_model.encode(cap1, convert_to_tensor=True)
    emb2 = similarity_model.encode(cap2, convert_to_tensor=True)
    sim_score = util.cos_sim(emb1, emb2).item()
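
    # Note: util.cos_sim returns a 1x1 tensor here; .item() unwraps it to a
    # plain float, where values near 1.0 indicate near-paraphrase captions.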

    # 4. Calculate Accuracy vs Ground Truth (ROUGE)
    # Only runs if you provided a reference text
    rouge_stats = "N/A (No Ground Truth Provided)"
    if ground_truth and ground_truth.strip() != "":
        # ROUGE for Model 1
        r1 = rouge.compute(predictions=[cap1], references=[ground_truth])
        # ROUGE for Model 2
        r2 = rouge.compute(predictions=[cap2], references=[ground_truth])

        rouge_stats = (
            f"Model 1 Accuracy (ROUGE-L): {r1['rougeL']:.3f}\n"
            f"Model 2 Accuracy (ROUGE-L): {r2['rougeL']:.3f}"
        )

    # 5. Format Output
    output_str = (
        f"--- Model 1 (BLIP) ---\n"
        f"Caption: {cap1}\n"
        f"Nuance: {vibe1_label}\n\n"
        f"--- Model 2 (ViT-GPT2) ---\n"
        f"Caption: {cap2}\n"
        f"Nuance: {vibe2_label}\n\n"
        f"--- Statistics ---\n"
        f"Model Agreement (Similarity): {sim_score:.3f}\n"
        f"(1.0 = Perfect Match, 0.0 = Totally Different)\n\n"
        f"--- Accuracy (vs Ground Truth) ---\n"
        f"{rouge_stats}"
    )

    return output_str

# --- 4. Interface ---
interface = gr.Interface(
    fn=analyze_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Ground Truth (Optional)", placeholder="Type what is actually in the image to test accuracy...")
    ],
    outputs="text",
    title="Multimodal Analysis: Nuance & Accuracy",
    description="Generates captions using two models, detects emotional nuance (Zero-Shot), and calculates accuracy scores against a ground truth description.",
    examples=[
        ["images/1.png", "A peaceful dog on a sunny beach"],
        ["images/2.png", "Sad men carrying a casket at a funeral"],
        ["images/3.png", "Happy kids at a birthday party"],
        ["images/4.png", "An angry man in a car"],
        ["images/5.png", "Two people happy mountain biking"],
        ["images/6.png", "A man upset about his food at a restaurant"],
        ["images/7.png", "A couple happy at a restaurant"],
        ["images/8.png", "A sad woman reading a book"],
        ["images/9.png", "People scared at a movie"],
        ["images/10.png", "Two tigers fighting"]
    ]
)
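
# Note: the example rows assume the listed files exist under images/ in this
# repo; Gradio resolves these paths relative to the app's working directory.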

if __name__ == "__main__":
    interface.launch()
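
For local testing outside the Space, a minimal smoke test could look like the sketch below. The script name and image path are hypothetical placeholders; it assumes the imports above are installed (including rouge_score, which evaluate's ROUGE loader needs) so the model weights can download on first run.

# smoke_test.py (hypothetical, not part of this commit)
from PIL import Image

from app import analyze_image  # loads the pipelines; the __main__ guard skips launch()

img = Image.open("images/1.png")  # any RGB image works here
print(analyze_image(img, "A peaceful dog on a sunny beach"))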