import gradio as gr
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import numpy as np
import os
import torch
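
# Log the runtime environment at startup so the Space logs show which
# PyTorch/CUDA stack the app is actually running on.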
| print(f"PyTorch version: {torch.__version__}") | |
| print(f"CUDA available: {torch.cuda.is_available()}") | |
| if torch.cuda.is_available(): | |
| print(f"CUDA version: {torch.version.cuda}") | |
| print(f"Device count: {torch.cuda.device_count()}") | |
| print(f"Current device: {torch.cuda.current_device()}") | |
| print(f"Device name: {torch.cuda.get_device_name()}") | |
| torch.backends.cuda.matmul.allow_tf32 = True | |
| torch.backends.cudnn.allow_tf32 = True | |
| # Set the default tensor type to cuda | |
| if torch.cuda.is_available(): | |
| torch.set_default_tensor_type('torch.cuda.FloatTensor') | |
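
# Load Qwen2-VL-2B-Instruct once at startup. device_map="auto" lets
# Accelerate place the weights (on GPU when available), and fp16 halves
# the memory footprint on CUDA; on CPU the model falls back to fp32.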
def load_model():
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {device}")
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct",
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",
            low_cpu_mem_usage=True
        )
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
        return model, processor, device
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None, None

model, processor, device = load_model()
| SYSTEM_PROMPT = """You are an expert technical analyst specializing in identifying bugs, fixing errors, and explaining code functions from visual inputs. When presented with an image or video: | |
| 1. If you see code, analyze it for potential bugs or errors, and suggest fixes. | |
| 2. If you see a function or algorithm, explain its purpose and how it works. | |
| 3. If you see a technical diagram or flowchart, interpret its meaning and purpose. | |
| 4. For any technical content, provide detailed explanations and insights. | |
| Always maintain a professional and technical tone in your responses.""" | |
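
# Route an upload to the right analyzer based on its file extension.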
def process_content(file, user_prompt):
    if file is None:
        return "No content provided. Please upload an image or video of technical content."
    file_path = file.name
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
        image = Image.open(file_path)
        return analyze_image(image, user_prompt)
    elif file_extension in ['.mp4', '.avi', '.mov']:
        return analyze_video(file_path, user_prompt)
    else:
        return "Unsupported file type. Please provide an image (jpg, jpeg, png, bmp) or video (mp4, avi, mov) of technical content."
def analyze_image(image, prompt):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": f"Based on the system instructions, {prompt}"},
            ],
        }
    ]
    return generate_response(messages)
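
# Sample up to max_frames frames, taking one frame every frame_interval
# frames and downscaling so the longer side is max_resolution pixels
# (preserving aspect ratio). OpenCV decodes frames as BGR, so each frame
# is converted to RGB before becoming a PIL image.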
def analyze_video(video_path, prompt, max_frames=16, frame_interval=30, max_resolution=224):
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return "Error: Could not open video file."
        frames = []
        frame_count = 0
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                h, w = frame.shape[:2]
                if h > w:
                    new_h, new_w = max_resolution, int(w * max_resolution / h)
                else:
                    new_h, new_w = int(h * max_resolution / w), max_resolution
                frame = cv2.resize(frame, (new_w, new_h))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame)
                frames.append(frame)
            frame_count += 1
        if not frames:
            return "Error: Could not read any frames from the video."
        return generate_response([
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames},
                    {"type": "text", "text": f"Based on the system instructions, {prompt}"},
                ],
            }
        ])
    except Exception as e:
        return f"Error processing video: {e}"
    finally:
        if 'cap' in locals():
            cap.release()
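
# Generation pipeline: render the chat template to a prompt string, let
# qwen_vl_utils.process_vision_info collect the image/video inputs, run
# sampled decoding, then strip the prompt tokens from the output before
# decoding to text.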
def generate_response(messages):
    if model is None or processor is None:
        return "Error: the model failed to load; check the startup logs."
    try:
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        )
        # Move inputs to the model's device. Keep the BatchFeature (rather
        # than converting to a plain dict) so the attribute access
        # inputs.input_ids below still works.
        inputs = inputs.to(device)
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=True,
                top_k=20,
                top_p=0.9,
                temperature=0.7
            )
        # Drop the prompt tokens so only the newly generated text is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
        # Clear the CUDA cache between requests
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return output_text[0]
    except Exception as e:
        return f"Error generating response: {e}"

# Gradio interface
iface = gr.Interface(
    fn=process_content,
    inputs=[
        gr.File(label="Upload Image or Video of Technical Content"),
        gr.Textbox(label="Enter your technical question", placeholder="e.g., Identify any bugs in this code and suggest fixes", value="Analyze this technical content and provide insights.")
    ],
    outputs="text",
    title="Technical Content Analysis",
    description="Upload an image or video of code, diagrams, or technical content. Ask questions about bugs, errors, or explanations of functions.",
)

iface.launch(share=True)