Spaces:

YongdongWang
/

DART-LLM-Multi-Model-GGUF

Build error

Yongdong Wang

Add JSON processing support for Python dict format in model responses

a1f2008 5 months ago

10.5 kB

	import gradio as gr
	import spaces # Import spaces module for ZeroGPU
	from huggingface_hub import login
	import os
	from json_processor import JsonProcessor
	import json

	# 1) Fetch the Hugging Face access token from the environment
	#    (the error message points at the Space "Secrets" settings).
	hf_token = os.environ.get("HUGGINGFACE_TOKEN")
	if hf_token is None or hf_token == "":
	    # Fail fast: nothing below can authenticate without the token.
	    raise RuntimeError("❌ HUGGINGFACE_TOKEN not detected, please check Space Settings → Secrets")

	# 2) Authenticate once so the later from_pretrained calls run with this token
	login(hf_token)

	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
	from peft import PeftModel
	import warnings
	import os
	# Suppress every warning category for the rest of the process.
	warnings.filterwarnings(action="ignore")

	# Hugging Face repo ids: the base checkpoint and the LoRA adapter applied on top of it
	MODEL_NAME = "meta-llama/Llama-3.1-8B"
	LORA_MODEL = "YongdongWang/llama3.1-8b-lora-qlora-dart-llm"

	# Lazily populated module state shared by the loader and generation functions
	model = tokenizer = None
	# Set to True by load_model_and_tokenizer() once the tokenizer is ready
	model_loaded = False

	def load_model_and_tokenizer():
	    """Initialise the global tokenizer; repeated calls do nothing.

	    Despite the name, only the tokenizer is loaded here (this function is not
	    wrapped in ``@spaces.GPU``). The ``model_loaded`` flag records that the
	    tokenizer is ready and short-circuits later calls.
	    """
	    global tokenizer, model_loaded

	    if not model_loaded:
	        print("🔄 Loading tokenizer...")

	        tokenizer_options = {"use_fast": False, "trust_remote_code": True}
	        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **tokenizer_options)

	        # Fall back to the EOS token when the checkpoint defines no pad token,
	        # because generation later passes tokenizer.pad_token_id explicitly.
	        if tokenizer.pad_token is None:
	            tokenizer.pad_token = tokenizer.eos_token

	        model_loaded = True
	        print("✅ Tokenizer loaded successfully!")

	@spaces.GPU(duration=60)  # ZeroGPU: request a GPU slot for the (lazy) model load
	def load_model_on_gpu():
	    """Return the global LoRA-adapted model, building it on first use.

	    The base checkpoint ``MODEL_NAME`` is loaded with 4-bit NF4 quantization
	    (double quantization, fp16 compute), then the ``LORA_MODEL`` adapter is
	    attached and the result is switched to eval mode.

	    Returns:
	        The cached model when one already exists, otherwise the newly loaded one.

	    Raises:
	        Exception: any loading failure is printed and then re-raised unchanged.
	    """
	    global model

	    # Reuse the already-loaded model instead of loading twice.
	    if model is not None:
	        return model

	    print("🔄 Loading model on GPU...")

	    try:
	        # Quantization settings for the base weights
	        quant_config = BitsAndBytesConfig(
	            load_in_4bit=True,
	            bnb_4bit_quant_type="nf4",
	            bnb_4bit_compute_dtype=torch.float16,
	            bnb_4bit_use_double_quant=True,
	        )

	        # Base checkpoint, placed on the available device(s) via device_map="auto"
	        backbone = AutoModelForCausalLM.from_pretrained(
	            MODEL_NAME,
	            quantization_config=quant_config,
	            device_map="auto",
	            torch_dtype=torch.float16,
	            trust_remote_code=True,
	            low_cpu_mem_usage=True,
	            use_safetensors=True,
	        )

	        # Wrap the base model with the fine-tuned LoRA adapter
	        model = PeftModel.from_pretrained(
	            backbone,
	            LORA_MODEL,
	            torch_dtype=torch.float16,
	            use_safetensors=True,
	        )
	        model.eval()
	    except Exception as load_error:
	        print(f"❌ Model loading failed: {load_error}")
	        raise load_error
	    else:
	        print("✅ Model loaded on GPU successfully!")
	        return model

	def process_json_in_response(response):
	    """Pretty-print the JSON-looking part of ``response``, best effort.

	    When the text contains both ``{`` and ``}``, it is handed to
	    ``JsonProcessor.process_response``. A truthy result is serialised with
	    ``json.dumps(indent=2, ensure_ascii=False)`` and substituted for the span
	    running from the first ``{`` to the last ``}``.

	    Returns:
	        The rewritten text, or ``response`` unchanged when nothing was
	        processed or any step raised.
	    """
	    try:
	        # No brace pair: nothing that could be JSON, hand the text back as-is.
	        if '{' not in response or '}' not in response:
	            return response

	        parsed = JsonProcessor().process_response(response)
	        if not parsed:
	            return response

	        pretty = json.dumps(parsed, indent=2, ensure_ascii=False)

	        import re
	        # Greedy + DOTALL: spans from the first "{" to the last "}" across newlines.
	        brace_span = re.search(r'\{.*\}', response, re.DOTALL)
	        if brace_span:
	            response = response.replace(brace_span.group(), pretty)

	        return response
	    except Exception:
	        # Formatting is optional; on any failure return the text untouched.
	        return response

	@spaces.GPU(duration=60)  # ZeroGPU: run this call with a GPU attached
	def generate_response_gpu(prompt, max_tokens=512):
	    """Generate a task plan for ``prompt`` with the LoRA-adapted model.

	    The tokenizer and model are loaded lazily on first use. The prompt is
	    wrapped in the ``### Instruction: / ### Response:`` template, decoded
	    greedily, and any JSON in the reply is pretty-printed.

	    Args:
	        prompt: The operator command typed by the user.
	        max_tokens: Upper bound on newly generated tokens; coerced to ``int``
	            because UI sliders may deliver a float.

	    Returns:
	        The generated text, or a "❌ ..." message when generation fails or
	        produces nothing.

	    Raises:
	        Exception: failures while loading the tokenizer or model happen before
	            the ``try`` block and propagate to the caller.
	    """
	    global model

	    # Ensure tokenizer is loaded
	    if tokenizer is None:
	        load_model_and_tokenizer()

	    # Ensure model is loaded on GPU
	    if model is None:
	        model = load_model_on_gpu()

	    if model is None:
	        return "❌ Model failed to load. Please check the Space logs."

	    try:
	        formatted_prompt = (
	            "### Instruction:\n"
	            f"{prompt.strip()}\n\n"
	            "### Response:\n"
	        )

	        # Encode input
	        inputs = tokenizer(
	            formatted_prompt,
	            return_tensors="pt",
	            truncation=True,
	            max_length=2048,
	        ).to(model.device)
	        # Number of prompt tokens; generate() returns prompt + continuation.
	        prompt_token_count = inputs["input_ids"].shape[-1]

	        # Greedy decoding. `early_stopping` is intentionally not passed: it is
	        # a beam-search-only flag and merely triggers a config warning here.
	        with torch.no_grad():
	            outputs = model.generate(
	                **inputs,
	                max_new_tokens=int(max_tokens),
	                do_sample=False,
	                temperature=None,
	                top_p=None,
	                pad_token_id=tokenizer.pad_token_id,
	                eos_token_id=tokenizer.eos_token_id,
	                repetition_penalty=1.1,
	                no_repeat_ngram_size=3,
	            )

	        # Decode only the newly generated ids. Splitting the decoded text on
	        # "### Response:" loses output whenever the model emits that marker
	        # itself, and slicing by len(formatted_prompt) assumes decoding
	        # reproduces the prompt exactly.
	        generated_ids = outputs[0][prompt_token_count:]
	        response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

	        # Process JSON if present in response
	        response = process_json_in_response(response)

	        return response if response else "❌ No response generated. Please try again with a different prompt."

	    except Exception as generation_error:
	        return f"❌ Generation Error: {str(generation_error)}"

	def chat_interface(message, history, max_tokens):
	    """Gradio callback: answer ``message`` and append the exchange to ``history``.

	    Args:
	        message: Text from the command box; ``None`` or whitespace-only input
	            is ignored.
	        history: List of ``(user, bot)`` tuples shown in the chatbot; ``None``
	            is treated as an empty conversation.
	        max_tokens: Generation budget forwarded to ``generate_response_gpu``.

	    Returns:
	        ``(history, "")`` — the updated conversation and an empty string that
	        clears the input box.
	    """
	    # A missing history would otherwise crash on .append(), including inside
	    # the error handler itself.
	    if history is None:
	        history = []

	    if message is None or not message.strip():
	        return history, ""

	    try:
	        # Initialize tokenizer (if needed). Kept inside the try so a loading
	        # failure is reported in the chat like any other error.
	        if tokenizer is None:
	            load_model_and_tokenizer()

	        # Call GPU function to generate response
	        response = generate_response_gpu(message, max_tokens)
	    except Exception as chat_error:
	        response = f"❌ Chat Error: {str(chat_error)}"

	    history.append((message, response))
	    return history, ""

	# Load the tokenizer eagerly when the module is imported. Only the tokenizer is
	# loaded here; the model is loaded lazily by generate_response_gpu() through
	# load_model_on_gpu() on the first generation request.
	load_model_and_tokenizer()

	# Build the Gradio UI: header, chat column, settings column, examples, event wiring
	with gr.Blocks(
	    css="""
	.gradio-container {
	max-width: 1200px;
	margin: auto;
	}
	""",
	    theme=gr.themes.Soft(),
	    title="Robot Task Planning - Llama 3.1 8B",
	) as app:
	    # Page header describing the model and the ZeroGPU hardware
	    gr.Markdown("""
	# 🤖 Llama 3.1 8B - Robot Task Planning

	This is a fine-tuned version of Meta's Llama 3.1 8B model specialized for robot task planning using QLoRA technique.

	Capabilities: Convert natural language robot commands into structured task sequences for excavators, dump trucks, and other construction robots.

	Model: [YongdongWang/llama3.1-8b-lora-qlora-dart-llm](https://huggingface.co/YongdongWang/llama3.1-8b-lora-qlora-dart-llm)

	⚡ Using ZeroGPU: This Space uses dynamic GPU allocation (Nvidia H200). First generation might take a bit longer.
	""")

	    with gr.Row():
	        # Left (wide) column: conversation, command box, action buttons
	        with gr.Column(scale=3):
	            chatbot = gr.Chatbot(
	                label="Task Planning Results",
	                show_label=True,
	                height=500,
	                container=True,
	                show_copy_button=True,
	                bubble_full_width=False,
	            )

	            msg = gr.Textbox(
	                label="Robot Command",
	                show_label=True,
	                placeholder="Enter robot task command (e.g., 'Deploy Excavator 1 to Soil Area 1')...",
	                lines=2,
	                max_lines=5,
	                container=True,
	            )

	            with gr.Row():
	                send_btn = gr.Button("🚀 Generate Tasks", variant="primary", size="sm")
	                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")

	        # Right (narrow) column: generation settings and static status text
	        with gr.Column(scale=1):
	            gr.Markdown("### ⚙️ Generation Settings")

	            max_tokens = gr.Slider(
	                label="Max Tokens",
	                info="Maximum number of tokens to generate",
	                minimum=50,
	                maximum=5000,
	                step=10,
	                value=512,
	            )

	            gr.Markdown("""
	### 📊 Model Status
	- Hardware: ZeroGPU (Dynamic Nvidia H200)
	- Status: Ready
	- Note: First generation allocates GPU resources
	""")

	    # Clickable sample commands that fill the command box
	    _example_commands = [
	        "Dump truck 1 goes to the puddle for inspection, after which all robots avoid the puddle.",
	        "Drive the Excavator 1 to the obstacle, and perform excavation to clear the obstacle.",
	        "Send Excavator 1 and Dump Truck 1 to the soil area; Excavator 1 will excavate and unload, followed by Dump Truck 1 proceeding to the puddle for unloading.",
	        "Move Excavator 1 and Dump Truck 1 to soil area 2; Excavator 1 will excavate and unload, then Dump Truck 1 returns to the starting position to unload.",
	        "Excavator 1 is guided to the obstacle to excavate and unload to clear the obstacle, then excavator 1 and dump truck 1 are moved to the soil area, and the excavator excavates and unloads. Finally, dump truck 1 unloads the soil into the puddle.",
	        "Excavator 1 goes to the obstacle to excavate and unload to clear the obstacle. Once the obstacle is cleared, mobilize all available robots to proceed to the puddle area for inspection.",
	    ]
	    gr.Examples(
	        examples=_example_commands,
	        inputs=msg,
	        label="💡 Example Operator Commands",
	    )

	    # Pressing Enter in the textbox and clicking the button trigger the same
	    # handler with identical inputs/outputs.
	    _chat_io = dict(
	        inputs=[msg, chatbot, max_tokens],
	        outputs=[chatbot, msg],
	    )
	    msg.submit(chat_interface, **_chat_io)
	    send_btn.click(chat_interface, **_chat_io)

	    # Reset both the conversation and the command box
	    clear_btn.click(
	        lambda: ([], ""),
	        outputs=[chatbot, msg],
	    )

	# Start the web server only when this file is executed as a script
	if __name__ == "__main__":
	    launch_options = dict(
	        server_name="0.0.0.0",  # listen on all network interfaces
	        server_port=7860,
	        share=True,
	        show_error=True,
	    )
	    app.launch(**launch_options)