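"""Kant AI: a Gradio chat demo serving Qwen2.5-1.5B with a LoRA adapter, CPU only."""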
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr
# --- Model loading (optimized for CPU) ---
BASE_MODEL = "Qwen/Qwen2.5-1.5B"
LORA_ADAPTER = "modular-ai/qwen"
print("Loading base model on CPU... (takes 1-2 minutes on the first run)")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,   # float16 is not well supported on CPU
    device_map="cpu",            # CPU only
    trust_remote_code=True,
    low_cpu_mem_usage=True       # reduce peak memory usage while loading
)
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, LORA_ADAPTER)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
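# Some base-model tokenizers ship without a pad token; fall back to EOS so padding works.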
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# --- Chat function (fast & safe) ---
def ask_kant(message, history):
    # Build an instruction-style prompt and generate a reply in the persona of Immanuel Kant.
    prompt = f"### Instruction: You are Immanuel Kant.\n\n### Input: {message}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id
        )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded text includes the prompt; keep only the part after "### Response:".
    bot_reply = response.split("### Response:")[-1].strip()
    return bot_reply
# --- Gradio UI (simple & fast) ---
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 **Kant AI** – Qwen2.5-1.5B LoRA")
    gr.Markdown("**Zero GPU | Free | Live Demo** \nAsk any question and *Immanuel Kant* will answer!")
    chatbot = gr.ChatInterface(
        fn=ask_kant,
        title="",
        examples=[
            "What is freedom?",
            "Kya hai swatantrata?",
            "Explain categorical imperative",
            "Moral law kya hai?"
        ],
        cache_examples=False,
        submit_btn="Ask Kant",
        retry_btn=None,
        clear_btn="Clear"
    )
    gr.Markdown("---\n*Model: Qwen2.5-1.5B + LoRA | CPU Only | ~8-12 sec per reply*")
demo.launch()