Spaces:

YongdongWang
/

DART-LLM-Multi-Model-GGUF

Build error

Yongdong

Implement GGUF model support with DAG visualization

a4f228c 5 months ago

13.5 kB

	import gradio as gr
	import spaces # Import spaces module for ZeroGPU
	from huggingface_hub import login
	import os
	from json_processor import JsonProcessor
	from dag_visualizer import DAGVisualizer
	import json

	# 1) Read Secrets: the Hugging Face access token comes from the
	#    HUGGINGFACE_TOKEN environment variable.
	hf_token = os.environ.get("HUGGINGFACE_TOKEN")
	if hf_token:
	    # 2) Login to ensure all subsequent from_pretrained calls have proper permissions
	    login(hf_token)
	else:
	    # A missing or empty token aborts start-up.
	    raise RuntimeError("❌ HUGGINGFACE_TOKEN not detected, please check Space Settings → Secrets")

	from transformers import AutoTokenizer
	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama
	import warnings
	import os
	# Suppress every Python warning for the whole process: no category,
	# module or message filter is given, so the "ignore" action applies to all.
	warnings.filterwarnings("ignore")

	# Model configurations for GGUF models
	def _gguf_entry(name, base_model, gguf_model, gguf_file):
	    """Bundle the settings of one model size into a config dict."""
	    return {
	        "name": name,              # Display name shown in logs and the UI
	        "base_model": base_model,  # For tokenizer
	        "gguf_model": gguf_model,  # Hub repository holding the GGUF file
	        "gguf_file": gguf_file,    # File name inside that repository
	    }


	# Keyed by model size; the keys are the values the UI dropdown passes around.
	MODEL_CONFIGS = {
	    "1B": _gguf_entry(
	        "Dart-llm-model-1B",
	        "meta-llama/Llama-3.2-1B",
	        "YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf",
	        "llama_3.2_1b-lora-qlora-dart-llm_q5_k_m.gguf",
	    ),
	    "3B": _gguf_entry(
	        "Dart-llm-model-3B",
	        "meta-llama/Llama-3.2-3B",
	        "YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf",
	        "llama_3.2_3b-lora-qlora-dart-llm_q4_k_m.gguf",
	    ),
	    "8B": _gguf_entry(
	        "Dart-llm-model-8B",
	        "meta-llama/Llama-3.1-8B",
	        "YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf",
	        "llama3.1-8b-lora-qlora-dart-llm_q4_k_m_fp16.gguf",
	    ),
	}

	DEFAULT_MODEL = "1B"  # Set 1B as default

	# Global variables to store model and tokenizer
	#   llm_model            - GGUF model instance, assigned by load_gguf_model_on_gpu
	#   tokenizer            - tokenizer instance, assigned by load_model_and_tokenizer
	#   current_model_config - MODEL_CONFIGS key of the selection last recorded as loaded
	llm_model = tokenizer = current_model_config = None
	# Flipped to True by load_model_and_tokenizer once a tokenizer is in place.
	model_loaded = False

	# Initialize DAG visualizer (one shared instance used by process_json_in_response)
	dag_visualizer = DAGVisualizer()

	def load_model_and_tokenizer(selected_model=DEFAULT_MODEL):
	    """Load the tokenizer of the selected model's base checkpoint.

	    The tokenizer is stored in the module-level ``tokenizer``; on success
	    ``current_model_config`` is set to ``selected_model`` and
	    ``model_loaded`` to ``True``. The call is a no-op when that state
	    already describes ``selected_model``.

	    Args:
	        selected_model: Key into ``MODEL_CONFIGS``.

	    Raises:
	        KeyError: If ``selected_model`` is not a key of ``MODEL_CONFIGS``.
	    """
	    global tokenizer, model_loaded, current_model_config

	    already_loaded = model_loaded and current_model_config == selected_model
	    if already_loaded:
	        return

	    config = MODEL_CONFIGS[selected_model]
	    print(f"🔄 Loading tokenizer for {config['name']}...")

	    # The tokenizer comes from the base model repository, not the GGUF one.
	    tokenizer = AutoTokenizer.from_pretrained(
	        config["base_model"],
	        use_fast=False,
	        trust_remote_code=True,
	    )
	    # Fall back to the EOS token when the checkpoint defines no pad token.
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token

	    current_model_config, model_loaded = selected_model, True
	    print("✅ Tokenizer loaded successfully!")

	@spaces.GPU(duration=60)  # Request GPU for loading model at startup
	def load_gguf_model_on_gpu(selected_model=DEFAULT_MODEL):
	    """Load the GGUF model for ``selected_model`` using llama-cpp-python.

	    The GGUF file is fetched with ``hf_hub_download`` (cached under
	    ``./gguf_cache``) and wrapped in a ``Llama`` instance that is kept in
	    the module-level ``llm_model``. After a successful load the selection
	    is recorded in ``current_model_config`` so that later calls with the
	    same key reuse the loaded instance instead of loading it again.

	    Args:
	        selected_model: Key into ``MODEL_CONFIGS``.

	    Returns:
	        The loaded ``Llama`` instance.

	    Raises:
	        KeyError: If ``selected_model`` is not a key of ``MODEL_CONFIGS``.
	        Exception: Any download or load failure is logged and re-raised.
	    """
	    global llm_model, current_model_config

	    # If model is already loaded and it's the same model, return it
	    if llm_model is not None and current_model_config == selected_model:
	        return llm_model

	    # Resolve the configuration first: an unknown key must not evict a
	    # model that is currently loaded and working.
	    model_config = MODEL_CONFIGS[selected_model]

	    # Clear existing model if switching
	    if llm_model is not None:
	        print("🗑️ Clearing existing model from GPU...")
	        llm_model = None  # drop this module's reference to the old instance

	    print(f"🔄 Loading {model_config['name']} GGUF model...")

	    try:
	        # Download GGUF model file from HuggingFace Hub
	        model_file = hf_hub_download(
	            repo_id=model_config["gguf_model"],
	            filename=model_config["gguf_file"],
	            cache_dir="./gguf_cache",
	        )
	        print(f"📦 Downloaded GGUF file: {model_file}")

	        # Load GGUF model with llama-cpp-python
	        llm_model = Llama(
	            model_path=model_file,
	            n_ctx=2048,       # Context length
	            n_gpu_layers=-1,  # Use all GPU layers if available
	            verbose=False,
	        )
	    except Exception as load_error:
	        print(f"❌ GGUF Model loading failed: {load_error}")
	        raise

	    # Record what is loaded; without this the "already loaded" check above
	    # can never succeed and every request reloads the model.
	    current_model_config = selected_model
	    print(f"✅ {model_config['name']} GGUF model loaded successfully!")
	    return llm_model

	def process_json_in_response(response):
	    """Pretty-print JSON found in a model response and render its task DAG.

	    Processing is best-effort: if anything goes wrong the failure is
	    logged and the original response is returned unchanged.

	    Args:
	        response: Text generated by the model.

	    Returns:
	        A ``(text, dag_image_path)`` tuple. ``text`` is the response with
	        its brace-delimited span replaced by indented JSON when
	        ``JsonProcessor`` produced a result, otherwise the input unchanged.
	        ``dag_image_path`` is whatever ``create_dag_visualization``
	        returned for a result with a non-empty ``"tasks"`` entry, or
	        ``None`` when no graph was produced.
	    """
	    import re

	    dag_image_path = None

	    try:
	        # Check if response contains JSON-like content
	        if '{' not in response or '}' not in response:
	            return response, dag_image_path

	        # Try to process the response for JSON content
	        processed_json = JsonProcessor().process_response(response)
	        if not processed_json:
	            return response, dag_image_path

	        # Format the JSON nicely
	        formatted_json = json.dumps(processed_json, indent=2, ensure_ascii=False)

	        # Generate DAG visualization if the JSON contains tasks
	        if "tasks" in processed_json and processed_json["tasks"]:
	            try:
	                dag_image_path = dag_visualizer.create_dag_visualization(
	                    processed_json,
	                    title="Robot Task Dependency Graph",
	                )
	            except Exception as e:
	                # A failed drawing must not cost the user the text answer.
	                print(f"DAG visualization failed: {e}")

	        # Replace the JSON part in the response: the greedy DOTALL pattern
	        # spans from the first "{" to the last "}".
	        match = re.search(r'\{.*\}', response, re.DOTALL)
	        if match:
	            response = response[:match.start()] + formatted_json + response[match.end():]

	        return response, dag_image_path
	    except Exception as processing_error:
	        # If processing fails, say why and return the original response.
	        print(f"JSON post-processing failed: {processing_error}")
	        return response, None

	@spaces.GPU(duration=60)  # GPU inference
	def generate_response_gpu(prompt, max_tokens=512, selected_model=DEFAULT_MODEL):
	    """Generate a response with the GGUF model - executed on GPU.

	    Args:
	        prompt: User instruction; surrounding whitespace is stripped.
	        max_tokens: Upper bound on generated tokens, passed to the model.
	        selected_model: Key into ``MODEL_CONFIGS``.

	    Returns:
	        A ``(text, dag_image_path)`` tuple. ``text`` is the post-processed
	        model output or an error message; ``dag_image_path`` is ``None``
	        unless ``process_json_in_response`` produced a graph.
	    """
	    global llm_model

	    # Ensure model is loaded on GPU
	    needs_load = llm_model is None or current_model_config != selected_model
	    if needs_load:
	        llm_model = load_gguf_model_on_gpu(selected_model)

	    if llm_model is None:
	        return ("❌ GGUF Model failed to load. Please check the Space logs.", None)

	    # Sampling settings handed to llama-cpp-python.
	    sampling_options = {
	        "max_tokens": max_tokens,
	        "stop": ["### Instruction:", "###"],
	        "echo": False,
	        "temperature": 0.1,
	        "top_p": 0.9,
	        "repeat_penalty": 1.1,
	    }

	    try:
	        instruction = prompt.strip()
	        formatted_prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"

	        # Generate response using llama-cpp-python
	        completion = llm_model(formatted_prompt, **sampling_options)

	        # Extract the generated text
	        generated_text = completion['choices'][0]['text'].strip()

	        # Process JSON if present in response and generate DAG
	        response, dag_image_path = process_json_in_response(generated_text)

	        if not response:
	            response = "❌ No response generated. Please try again with a different prompt."
	        return (response, dag_image_path)

	    except Exception as generation_error:
	        return (f"❌ Generation Error: {str(generation_error)}", None)

	def chat_interface(message, history, max_tokens, selected_model):
	    """Chat interface - runs on CPU, calls GPU functions.

	    Args:
	        message: Text typed by the user; ``None`` or blank is ignored.
	        history: List of ``(user_message, reply)`` tuples, extended in
	            place. ``None`` is treated as an empty history.
	        max_tokens: Token limit forwarded to ``generate_response_gpu``.
	        selected_model: Model key forwarded to ``generate_response_gpu``.

	    Returns:
	        ``(history, "", dag_image_path)``: the updated history, an empty
	        string that clears the input box, and the graph image path
	        (``None`` when there is none or generation failed).
	    """
	    # Without this, the append calls below fail on a None history, and in
	    # the error path that failure would escape the except handler.
	    if history is None:
	        history = []

	    if not message or not message.strip():
	        return history, "", None

	    try:
	        # Call GPU function to generate response
	        response, dag_image_path = generate_response_gpu(message, max_tokens, selected_model)
	    except Exception as chat_error:
	        # Surface the failure in the chat instead of breaking the UI callback.
	        history.append((message, f"❌ Chat Error: {str(chat_error)}"))
	        return history, "", None

	    history.append((message, response))
	    return history, "", dag_image_path

	# GGUF models include tokenizer, no separate loading needed

	# Create Gradio application
	# Page-level CSS handed to gr.Blocks.
	_APP_CSS = """
	.gradio-container {
	max-width: 1200px;
	margin: auto;
	}
	"""

	# Introductory Markdown shown at the top of the page.
	_INTRO_MARKDOWN = """
	# 🤖 DART-LLM Multi-Model - Robot Task Planning

	Choose from three GGUF quantized models specialized for robot task planning using QLoRA fine-tuning:

	- 🚀 Dart-llm-model-1B (Default): Fastest inference, Q5_K_M quantization
	- ⚖️ Dart-llm-model-3B: Balanced performance, Q4_K_M quantization
	- 🎯 Dart-llm-model-8B: Best quality output, Q4_K_M quantization

	GGUF Implementation: Uses native GGUF format with llama-cpp-python for optimal memory efficiency and GPU acceleration.

	Capabilities:
	- Convert natural language robot commands into structured task sequences
	- NEW: Automatic DAG Visualization - Generates visual dependency graphs for robot task sequences
	- Support for excavators, dump trucks, and other construction robots

	GGUF Models:
	- [YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf) (Default - Q5_K_M)
	- [YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf) (Q4_K_M)
	- [YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf) (Q4_K_M)

	⚡ Using ZeroGPU: This Space uses dynamic GPU allocation (Nvidia H200). First generation might take a bit longer.
	"""

	# Static status panel shown under the generation settings.
	_STATUS_MARKDOWN = """
	### 📊 Model Status
	- Hardware: ZeroGPU (Dynamic Nvidia H200)
	- Status: Ready
	- Note: First generation allocates GPU resources
	- Dart-llm-model-1B: Fastest inference (Default)
	- Dart-llm-model-3B: Balanced speed/quality
	- Dart-llm-model-8B: Best quality, slower
	"""

	# Example conversations
	_EXAMPLE_COMMANDS = [
	    "Dump truck 1 goes to the puddle for inspection, after which all robots avoid the puddle.",
	    "Drive the Excavator 1 to the obstacle, and perform excavation to clear the obstacle.",
	    "Send Excavator 1 and Dump Truck 1 to the soil area; Excavator 1 will excavate and unload, followed by Dump Truck 1 proceeding to the puddle for unloading.",
	    "Move Excavator 1 and Dump Truck 1 to soil area 2; Excavator 1 will excavate and unload, then Dump Truck 1 returns to the starting position to unload.",
	    "Excavator 1 is guided to the obstacle to excavate and unload to clear the obstacle, then excavator 1 and dump truck 1 are moved to the soil area, and the excavator excavates and unloads. Finally, dump truck 1 unloads the soil into the puddle.",
	    "Excavator 1 goes to the obstacle to excavate and unload to clear the obstacle. Once the obstacle is cleared, mobilize all available robots to proceed to the puddle area for inspection.",
	]


	def _reset_ui():
	    """Return the values that empty the chat, the input box and the graph."""
	    return ([], "", None)


	# Create Gradio application
	# NOTE(review): the container nesting below (three columns inside one row;
	# examples and event wiring directly under Blocks) is the presumed layout,
	# reconstructed from statement order - confirm against the running app.
	with gr.Blocks(
	    title="Robot Task Planning - DART-LLM Multi-Model",
	    theme=gr.themes.Soft(),
	    css=_APP_CSS
	) as app:
	    gr.Markdown(_INTRO_MARKDOWN)

	    with gr.Row():
	        # Conversation column: chat history, command box and buttons.
	        with gr.Column(scale=2):
	            chatbot = gr.Chatbot(
	                label="Task Planning Results", height=400, show_label=True,
	                container=True, bubble_full_width=False, show_copy_button=True
	            )

	            msg = gr.Textbox(
	                label="Robot Command",
	                placeholder="Enter robot task command (e.g., 'Deploy Excavator 1 to Soil Area 1')...",
	                lines=2, max_lines=5, show_label=True, container=True
	            )

	            with gr.Row():
	                send_btn = gr.Button("🚀 Generate Tasks", variant="primary", size="sm")
	                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")

	        # Graph column: read-only image of the generated task DAG.
	        with gr.Column(scale=2):
	            dag_image = gr.Image(
	                label="Task Dependency Graph (DAG)", show_label=True,
	                container=True, height=400, interactive=False
	            )

	        # Settings column: model choice, token limit and status panel.
	        with gr.Column(scale=1):
	            gr.Markdown("### ⚙️ Generation Settings")

	            model_selector = gr.Dropdown(
	                # (label, value) pairs: shows the model name, passes the config key.
	                choices=[(cfg["name"], size) for size, cfg in MODEL_CONFIGS.items()],
	                value=DEFAULT_MODEL,
	                label="Model Size",
	                info="Select model size (1B = fastest, 8B = best quality)",
	                interactive=True
	            )

	            max_tokens = gr.Slider(
	                minimum=50, maximum=5000, value=512, step=10,
	                label="Max Tokens",
	                info="Maximum number of tokens to generate"
	            )

	            gr.Markdown(_STATUS_MARKDOWN)

	    gr.Examples(
	        examples=_EXAMPLE_COMMANDS,
	        inputs=msg,
	        label="💡 Example Operator Commands"
	    )

	    # Event handling: Enter in the textbox and the send button both run the
	    # same callback with the same inputs and outputs.
	    chat_inputs = [msg, chatbot, max_tokens, model_selector]
	    chat_outputs = [chatbot, msg, dag_image]

	    msg.submit(chat_interface, inputs=chat_inputs, outputs=chat_outputs)
	    send_btn.click(chat_interface, inputs=chat_inputs, outputs=chat_outputs)
	    clear_btn.click(_reset_ui, outputs=[chatbot, msg, dag_image])

	if __name__ == "__main__":
	    # Start the Gradio server only when this file is run as a script.
	    launch_options = {
	        "server_name": "0.0.0.0",  # listen on all network interfaces
	        "server_port": 7860,
	        "share": True,
	        "show_error": True,
	    }
	    app.launch(**launch_options)