# Gradio app: extract and analyze text from an uploaded image with a quantized Qwen2-VL
# model, then search the analysis for user-supplied keywords.
import gradio as gr
import torch
from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from byaldi import RAGMultiModalModel
from PIL import Image

# Note: auto_gptq does not export GPTQForModel, and GPTQ quantization requires
# calibration data; 8-bit loading below uses transformers' bitsandbytes integration instead.

# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load Qwen2-VL with 8-bit weights. The original GPTQForModel.from_pretrained(...).quantize(bits=8)
# calls are not part of auto_gptq's API, so bitsandbytes 8-bit loading is used to get the
# intended memory savings.
if torch.cuda.is_available():
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)  # 8-bit weights on GPU
    dtype = torch.float16
else:
    quantization_config = None  # bitsandbytes 8-bit needs CUDA; fall back to full precision
    dtype = torch.float32

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype=dtype,
    device_map="auto",
    low_cpu_mem_usage=True,
    quantization_config=quantization_config,
)
model.eval()  # Set the model to evaluation mode

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Load the ColPali retriever via byaldi. RAGMultiModalModel is a high-level wrapper
# (index/search), not a plain torch module, so the original GPTQ / .to() / .eval()
# calls would fail; the retriever is loaded as published.
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali-v1.2")
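# A minimal sketch of how the retriever would normally be used (assumption: the folder
# "docs/" and index name "demo_index" are placeholders, not part of this app):
#
#   RAG.index(input_path="docs/", index_name="demo_index", overwrite=True)
#   results = RAG.search("query text", k=3)
#
# This app does not build an index; Qwen2-VL reads the uploaded image directly below.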

def process_image(image, keywords):
    # byaldi's RAGMultiModalModel has no OCR method (it exposes index/search for
    # retrieval), so Qwen2-VL is asked to read the text from the image directly.
    prompt = (
        "Read the text in this image.\n"
        "1. Identify the Hindi and English parts.\n"
        "2. Translate the Hindi parts to English.\n"
        "3. Summarize the content."
    )

    # Qwen2-VL expects the image placeholder tokens that its chat template inserts.
    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}
    ]
    text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate output with the quantized model
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
        )

    # Decode only the newly generated tokens; otherwise the prompt is echoed back
    # into the analysis text.
    generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
    analysis = tokenizer.decode(generated_ids, skip_special_tokens=True)
    
    # Search for keywords
    keyword_list = [kw.strip() for kw in keywords.split(',')]
    found_keywords = [kw for kw in keyword_list if kw.lower() in analysis.lower()]
    
    return analysis, ', '.join(found_keywords) if found_keywords else "No keywords found"

# Create Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Enter keywords (comma-separated)")
    ],
    outputs=[
        gr.Textbox(label="Analysis Result"),
        gr.Textbox(label="Found Keywords")
    ],
    title="Image OCR and Keyword Search (Quantized Model)",
    description="Upload an image to extract and analyze text, then search for specific keywords. This version uses a quantized model for improved efficiency."
)

# Launch the interface
iface.launch()
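
# Usage note: running this script starts a local Gradio server and prints its URL;
# iface.launch(share=True) would additionally create a temporary public link.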