from transformers import pipeline, AutoTokenizer
import io
import base64
from PIL import Image
import gradio as gr

# Load the BLIP Large captioning model and its tokenizer once at startup.
model = "Salesforce/blip-image-captioning-large"
tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
pipe = pipeline(task="image-to-text", model=model, tokenizer=tokenizer)


def image_to_base64(image: Image.Image) -> str:
    """Convert a PIL image to a base64-encoded PNG string."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def caption_image(image):
    """Generate a caption for the given image and return it in upper case."""
    result = pipe(
        image_to_base64(image),
        # Optional generation parameters:
        # temperature=0.7,
        # max_length=130,
        # min_length=30,
        # do_sample=True,
    )
    return result[0]["generated_text"].upper()


if __name__ == "__main__":
    gr.close_all()

    with gr.Blocks() as interface:
        gr.Markdown("### Image Captioning using BLIP Large")
        with gr.Row():
            image_input = gr.Image(type="pil", label="Image")
        with gr.Row():
            caption_output = gr.Textbox(lines=2, label="Caption")
        with gr.Row():
            clear_button = gr.ClearButton()
            caption_button = gr.Button("Caption", variant="primary")
        with gr.Row():
            example_images = gr.Examples(
                examples=[
                    "data/image1.jpg",
                    "data/image2.png",
                    "data/image3.jpg",
                    "data/image4.jpg",
                    "data/image5.jpg",
                    "data/image6.png",
                    "data/image7.png",
                    "data/image8.jpeg",
                    "data/image9.jpeg",
                    "data/image10.jpg",
                ],
                inputs=[image_input],
                label="Example Images",
            )

        # Wire up the buttons: caption the current image, or clear both fields.
        caption_button.click(
            fn=caption_image,
            inputs=[image_input],
            outputs=[caption_output],
        )
        clear_button.click(
            fn=lambda: [None, ""],
            inputs=[],
            outputs=[image_input, caption_output],
        )

    interface.launch(share=True, server_port=7860)
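
# Note: the transformers image-to-text pipeline also accepts PIL.Image objects
# (as well as file paths and URLs) directly, so the base64 round-trip above is
# not strictly required. A minimal sketch of a direct call, assuming the same
# `pipe` object defined above:
#
#   def caption_image_direct(image):
#       return pipe(image)[0]["generated_text"]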