# ==============================================================================
# Josh Guimond
# Unit 8 Assignment: End-to-End AI Solution Implementation
# ARIN 460
# 12/03/2025
# Description: This script implements a multimodal AI web app using Gradio to
# run two image captioning models, a text "vibe" classifier, and NLP metrics on
# uploaded images, allowing direct comparison of model captions to ground-truth
# descriptions.
# ==============================================================================

# ==============================================================================
# SECTION 1: SETUP & INSTALLATIONS
# ==============================================================================
# Install libraries first, e.g. (exact package names may vary by environment):
#   pip install gradio transformers sentence-transformers evaluate rouge_score torch
import gradio as gr
from transformers import pipeline, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import evaluate
import warnings
import logging

# Filter out FutureWarning and UserWarning to keep the console clean
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)

# ==============================================================================
# SECTION 2: LOAD MODELS
# ==============================================================================

# --- 1. Load Image Captioning Models ---

# Model 1: BLIP (Base)
print("Loading Model 1 (BLIP)...")
captioner_model1 = pipeline("image-to-text",
                            model="Salesforce/blip-image-captioning-base")

# Model 2: ViT-GPT2 (with tokenizer fix)
print("Loading Model 2 (ViT-GPT2)...")
# Load the tokenizer manually to set the pad_token and silence the padding
# warning (GPT-2 has no pad token by default)
vit_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
vit_tokenizer.pad_token = vit_tokenizer.eos_token  # <--- THE FIX
captioner_model2 = pipeline("image-to-text",
                            model="nlpconnect/vit-gpt2-image-captioning",
                            tokenizer=vit_tokenizer)

# --- 2. Load NLP Analysis Models (Unit 4 Techniques) ---

# A. Zero-Shot Classifier (for nuanced vibe/sentiment)
print("Loading Zero-Shot Classifier...")
classifier = pipeline("zero-shot-classification",
                      model="MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33")

# B. Semantic Similarity (for model agreement)
print("Loading Sentence Transformer...")
similarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# C. ROUGE Metric (for accuracy vs. ground truth)
print("Loading ROUGE Metric...")
rouge = evaluate.load("rouge")

# Define nuanced labels based on the example image list. These cover: peaceful
# dog, sad funeral, happy kids, angry man, scared people, fighting tigers.
VIBE_LABELS = ["Peaceful/Calm", "Happy/Joy", "Sad/Sorrow",
               "Angry/Upset", "Fear/Scared", "Action/Violence"]
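
# Optional sanity check (a minimal sketch; the sample caption below is made up,
# not from the assignment images): run the zero-shot classifier once on a
# hand-written caption to confirm the VIBE_LABELS rank sensibly before
# launching the full app. Uncomment to use.
# _check = classifier("a dog lying on a sunny beach", VIBE_LABELS)
# print(f"Top vibe: {_check['labels'][0]} ({_check['scores'][0]:.1%})")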
# ==============================================================================
# SECTION 3: ANALYSIS FUNCTIONS
# ==============================================================================

# --- Analysis Function ---
def analyze_image(image, ground_truth):
    # -- A. Generate Captions --
    res1 = captioner_model1(image)
    cap1 = res1[0]['generated_text']

    res2 = captioner_model2(image)
    cap2 = res2[0]['generated_text']

    # -- B. Analyze Vibe (Zero-Shot) --
    # Model 1 vibe
    vibe1_result = classifier(cap1, VIBE_LABELS)
    vibe1_label = vibe1_result['labels'][0]
    vibe1_score = vibe1_result['scores'][0]

    # Model 2 vibe
    vibe2_result = classifier(cap2, VIBE_LABELS)
    vibe2_label = vibe2_result['labels'][0]
    vibe2_score = vibe2_result['scores'][0]

    # -- C. Calculate Statistics --
    # 1. Semantic similarity (do the models agree?)
    emb1 = similarity_model.encode(cap1, convert_to_tensor=True)
    emb2 = similarity_model.encode(cap2, convert_to_tensor=True)
    sim_score = util.pytorch_cos_sim(emb1, emb2).item()

    # 2. ROUGE scores (how accurate is each caption vs. the ground truth?)
    rouge_output = "N/A (No Ground Truth provided)"
    if ground_truth and ground_truth.strip() != "":
        # Calculate scores
        r1 = rouge.compute(predictions=[cap1], references=[ground_truth])
        r2 = rouge.compute(predictions=[cap2], references=[ground_truth])

        # Format the ROUGE output nicely
        rouge_output = (
            f"Model 1 ROUGE-L: {r1['rougeL']:.3f}\n"
            f"Model 2 ROUGE-L: {r2['rougeL']:.3f}\n"
            f"(Higher is better)"
        )

    # -- D. Format Output Strings --
    # Create clean, formatted strings for the large textboxes
    out1 = (
        f"CAPTION: {cap1}\n"
        f"-----------------------------\n"
        f"DETECTED VIBE: {vibe1_label}\n"
        f"CONFIDENCE: {vibe1_score:.1%}"
    )
    out2 = (
        f"CAPTION: {cap2}\n"
        f"-----------------------------\n"
        f"DETECTED VIBE: {vibe2_label}\n"
        f"CONFIDENCE: {vibe2_score:.1%}"
    )
    stats = (
        f"--- 1. MODEL AGREEMENT (Semantic Similarity) ---\n"
        f"Score: {sim_score:.3f}\n"
        f"(Scale: 0.0 = Different, 1.0 = Identical)\n\n"
        f"--- 2. OBJECT IDENTIFICATION ACCURACY (ROUGE) ---\n"
        f"Ground Truth: '{ground_truth}'\n"
        f"{rouge_output}"
    )

    return out1, out2, stats

# ==============================================================================
# SECTION 4: GRADIO INTERFACE
# ==============================================================================

# Define inputs
image_input = gr.Image(type="pil", label="Upload Image")
text_input = gr.Textbox(label="Ground Truth Description",
                        placeholder="e.g. 'A peaceful dog on a beach'")

# Define outputs with larger viewing areas (lines=4 and 10)
output_m1 = gr.Textbox(label="Model 1 (BLIP) Analysis", lines=4)
output_m2 = gr.Textbox(label="Model 2 (ViT-GPT2) Analysis", lines=4)
output_stats = gr.Textbox(label="Comparison Metrics & Statistics", lines=10)

# Create interface
interface = gr.Interface(
    fn=analyze_image,
    inputs=[image_input, text_input],
    outputs=[output_m1, output_m2, output_stats],
    title="Multimodal AI: Nuanced Image Analysis",
    description=(
        "This application uses two Image Captioning models (BLIP & ViT-GPT2) "
        "to identify objects, Zero-Shot Classification to detect emotional "
        "vibes (Happy, Sad, Angry, etc.), and calculates ROUGE/Similarity "
        "metrics."
    ),
    examples=[
        ["images/1.png", "A peaceful dog on a sunny beach"],
        ["images/2.png", "Sad men carrying a casket at a funeral"],
        ["images/3.png", "Happy kids at a birthday party"],
        ["images/4.png", "An angry man in a car"],
        ["images/5.png", "Two happy people mountain biking"],
        ["images/6.png", "A man upset about his food at a restaurant"],
        ["images/7.png", "A happy couple at a restaurant"],
        ["images/8.png", "A sad woman reading a book"],
        ["images/9.png", "People scared at a movie"],
        ["images/10.png", "Two tigers fighting"]
    ]
)

if __name__ == "__main__":
    interface.launch()
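
# Note: launch() also accepts standard Gradio options if needed, e.g. a
# temporary public share link or a fixed local port (use whichever fits the
# environment):
#     interface.launch(share=True)
#     interface.launch(server_port=7860)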