Spaces:

apoorvgoyalxx
/

OCR

Runtime error

App Files Files Community

OCR / app.py

apoorvgoyalxx

gptq quant

2a4dcb6 about 1 year ago

raw

history blame contribute delete

2.83 kB

	#
	import gradio as gr
	import torch
	from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer
	from byaldi import RAGMultiModalModel
	from PIL import Image
	from auto_gptq import GPTQForModel # Import GPTQ for quantization

	# Check for CUDA availability
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	print(f"Using device: {device}")

	# Load and quantize the Qwen2 model with GPTQ
	model = Qwen2VLForConditionalGeneration.from_pretrained(
	"Qwen/Qwen2-VL-7B-Instruct",
	torch_dtype=torch.float16, # Use float16 for initial load
	device_map="auto",
	low_cpu_mem_usage=True,
	)
	model = GPTQForModel.from_pretrained(model) # Load the model into GPTQ
	model.quantize(bits=8) # Quantize to 8-bit
	model.to(device)
	model.eval() # Set the model to evaluation mode

	processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
	tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

	# Load and quantize the RAG model with GPTQ
	RAG = RAGMultiModalModel.from_pretrained("vidore/colpali-v1.2")
	RAG = GPTQForModel.from_pretrained(RAG) # Load the RAG model into GPTQ
	RAG.quantize(bits=8) # Quantize to 8-bit
	RAG.to(device)
	RAG.eval() # Set RAG model to evaluation mode

	def process_image(image, keywords):
	# Perform OCR
	ocr_result = RAG.ocr(image)

	# Process image
	prompt = f"Analyze this text: {ocr_result}\n1. Identify Hindi and English parts.\n2. Translate Hindi to English.\n3. Summarize content."
	inputs = processor(images=image, text=prompt, return_tensors="pt")
	inputs = {k: v.to(device) for k, v in inputs.items()}

	# Generate output with quantized model
	with torch.no_grad():
	outputs = model.generate(
	**inputs,
	max_new_tokens=500,
	do_sample=True,
	top_k=50,
	top_p=0.95,
	num_return_sequences=1,
	)

	analysis = tokenizer.decode(outputs[0], skip_special_tokens=True)

	# Search for keywords
	keyword_list = [kw.strip() for kw in keywords.split(',')]
	found_keywords = [kw for kw in keyword_list if kw.lower() in analysis.lower()]

	return analysis, ', '.join(found_keywords) if found_keywords else "No keywords found"

	# Create Gradio interface
	iface = gr.Interface(
	fn=process_image,
	inputs=[
	gr.Image(type="pil", label="Upload Image"),
	gr.Textbox(label="Enter keywords (comma-separated)")
	],
	outputs=[
	gr.Textbox(label="Analysis Result"),
	gr.Textbox(label="Found Keywords")
	],
	title="Image OCR and Keyword Search (Quantized Model)",
	description="Upload an image to extract and analyze text, then search for specific keywords. This version uses a quantized model for improved efficiency."
	)

	# Launch the interface
	iface.launch()