import gradio as gr
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from byaldi import RAGMultiModalModel

# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the 8-bit GPTQ checkpoint of Qwen2-VL published by the Qwen team.
# Loading pre-quantized weights avoids quantizing in-process (which would
# require a calibration dataset); the optimum and auto-gptq packages must
# be installed for transformers to load GPTQ weights.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8",
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
)
model.eval()  # Set the model to evaluation mode

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8")

# Load the ColPali retriever via byaldi. Note that byaldi exposes retrieval
# (index/search), not OCR; the text extraction below is done by Qwen2-VL itself.
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali-v1.2")


def process_image(image, keywords):
    # Qwen2-VL reads the text straight off the image, so OCR, language
    # identification, translation, and summarization happen in one prompt.
    prompt = (
        "Extract all text from this image.\n"
        "1. Identify the Hindi and English parts.\n"
        "2. Translate the Hindi parts to English.\n"
        "3. Summarize the content."
    )
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[text], images=[image], return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate output with the quantized model
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )

    # Decode only the newly generated tokens, skipping the echoed prompt
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    analysis = processor.decode(generated, skip_special_tokens=True)

    # Case-insensitive keyword search over the model's output
    keyword_list = [kw.strip() for kw in keywords.split(",") if kw.strip()]
    found_keywords = [kw for kw in keyword_list if kw.lower() in analysis.lower()]

    return analysis, ", ".join(found_keywords) if found_keywords else "No keywords found"


# Create Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Enter keywords (comma-separated)"),
    ],
    outputs=[
        gr.Textbox(label="Analysis Result"),
        gr.Textbox(label="Found Keywords"),
    ],
    title="Image OCR and Keyword Search (Quantized Model)",
    description=(
        "Upload an image to extract and analyze text, then search for "
        "specific keywords. This version uses a quantized model for "
        "improved efficiency."
    ),
)

# Launch the interface
iface.launch()
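
# --- Optional byaldi usage sketch -------------------------------------------
# The ColPali model loaded above is a retriever, not an OCR engine. This is a
# minimal sketch of byaldi's actual index()/search() API, in case document
# retrieval is wanted alongside the OCR app. "docs/sample.pdf" and
# "demo_index" are hypothetical placeholders, not files this app ships with.
def byaldi_retrieval_demo(query: str = "invoice total"):
    RAG.index(
        input_path="docs/sample.pdf",  # hypothetical path to a PDF to index
        index_name="demo_index",       # hypothetical on-disk index name
        store_collection_with_index=False,
        overwrite=True,
    )
    return RAG.search(query, k=3)  # top-3 most relevant pages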