import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr
# --- Load models (optimized for CPU) ---
BASE_MODEL = "Qwen/Qwen2.5-1.5B"
LORA_ADAPTER = "modular-ai/qwen"

print("Loading base model on CPU... (takes 1-2 minutes on the first run)")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,  # float16 is not well supported on CPU
    device_map="cpu",           # CPU only
    trust_remote_code=True,
    low_cpu_mem_usage=True      # cut peak RAM while loading (fp32 weights alone are ~6 GB for 1.5B params)
)
| print("Loading LoRA adapter...") | |
| model = PeftModel.from_pretrained(base_model, LORA_ADAPTER) | |
| tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True) | |
| if tokenizer.pad_token is None: | |
| tokenizer.pad_token = tokenizer.eos_token | |
# --- Chat function (fast & safe) ---
def ask_kant(message, history):
    # history is required by gr.ChatInterface's signature but unused here
    prompt = f"### Instruction: You are Immanuel Kant.\n\n### Input: {message}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id
        )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded text includes the prompt; keep only what follows the final "### Response:" marker
    bot_reply = response.split("### Response:")[-1].strip()
    return bot_reply
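
# A quick smoke test of the generation pipeline (a hypothetical example, not
# part of the original app; it calls the chat function directly, bypassing the UI):
#
#   print(ask_kant("What is enlightenment?", history=[]))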
# --- Gradio UI (simple & fast) ---
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 **Kant AI** – Qwen2.5-1.5B LoRA")
    gr.Markdown("**Zero GPU | Free | Live Demo** \nAsk any question and *Immanuel Kant* will answer!")
    chatbot = gr.ChatInterface(
        fn=ask_kant,
        title="",
        examples=[
            "What is freedom?",
            "Explain the categorical imperative",
            "What is the moral law?"
        ],
        cache_examples=False,
        submit_btn="Ask Kant",
        retry_btn=None,     # retry_btn/clear_btn are Gradio 4.x arguments (removed in 5.x)
        clear_btn="Clear"
    )
| gr.Markdown("---\n*Model: Qwen2.5-1.5B + LoRA | CPU Only | ~8-12 sec per reply*") | |
| demo.launch() |
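
# To run this app locally (a minimal sketch; the dependency list is an
# assumption, since no requirements file is shown here):
#
#   pip install torch transformers peft gradio
#   python app.py
#
# Note: the first run downloads the Qwen/Qwen2.5-1.5B weights (several GB)
# plus the modular-ai/qwen adapter from the Hugging Face Hub.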