# Gradio app: extract and analyze text from an uploaded image with Qwen2-VL (8-bit) and search it for keywords
import gradio as gr
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer, BitsAndBytesConfig
from byaldi import RAGMultiModalModel
from PIL import Image
# NOTE: 8-bit quantization is done with bitsandbytes via BitsAndBytesConfig;
# auto_gptq does not provide a GPTQForModel class for wrapping an already-loaded model.
# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load the Qwen2-VL model, quantized to 8-bit with bitsandbytes when a GPU is available
# (GPTQ post-quantization of an already-loaded model is not supported, so bitsandbytes is used instead)
quant_config = BitsAndBytesConfig(load_in_8bit=True) if torch.cuda.is_available() else None
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",  # accelerate handles placement; no manual .to(device) needed
    low_cpu_mem_usage=True,
    quantization_config=quant_config,
)
model.eval()  # Set the model to evaluation mode
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
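# The processor bundles the image preprocessor, tokenizer, and chat template for Qwen2-VL;
# the separate tokenizer is kept only for decoding generated token ids.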
# Load the ColPali retrieval model via byaldi for document indexing/search.
# byaldi manages device placement and precision itself and does not expose
# GPTQ quantization, .to(), or .eval() on its wrapper.
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali-v1.2")
def process_image(image, keywords):
    # Qwen2-VL reads the text in the image directly (byaldi's RAGMultiModalModel is a
    # retriever and has no ocr() method), so the extraction steps go into the prompt.
    prompt = ("Read the text in this image.\n"
              "1. Identify the Hindi and English parts.\n"
              "2. Translate the Hindi parts to English.\n"
              "3. Summarize the content.")
    # Build the chat-formatted input the Qwen2-VL processor expects
    messages = [{"role": "user",
                 "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[chat_text], images=[image], return_tensors="pt").to(model.device)
    # Generate with the quantized model and decode only the newly generated tokens
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
        )
    generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
    analysis = tokenizer.decode(generated_ids, skip_special_tokens=True)
    # Search for keywords: case-insensitive substring match against the analysis
    keyword_list = [kw.strip() for kw in keywords.split(',')]
    found_keywords = [kw for kw in keyword_list if kw.lower() in analysis.lower()]
    return analysis, ', '.join(found_keywords) if found_keywords else "No keywords found"
# Create Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Enter keywords (comma-separated)")
    ],
    outputs=[
        gr.Textbox(label="Analysis Result"),
        gr.Textbox(label="Found Keywords")
    ],
    title="Image OCR and Keyword Search (Quantized Model)",
    description="Upload an image to extract and analyze text, then search for specific keywords. This version uses a quantized model for improved efficiency."
)
# Launch the interface
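# (Hugging Face Spaces serves the app automatically; iface.launch(share=True) would create a temporary public link when running locally.)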
iface.launch()