# test.py
from unsloth import FastLanguageModel
import torch

# Load the base 4-bit model (downloads once, ~7.5 GB, then cached forever)
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Recreate the LoRA architecture used during training; the fine-tuned
# weights are loaded afterwards from adapter_model.safetensors
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=128,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=False,
)

# Point directly to your folder (where adapter_model.safetensors is).
# PEFT's load_adapter requires an adapter name; activate it afterwards so the
# freshly initialized "default" adapter from get_peft_model is not used.
adapter_path = "/home/saad/Downloads/phi3-mini-lora-only/content/phi3-mini-lora-only"
model.load_adapter(adapter_path, adapter_name="finetuned")
model.set_adapter("finetuned")

FastLanguageModel.for_inference(model)

# Test it
messages = [{"role": "user", "content": "delete duplicate entries in a table using two columns"}]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")
outputs = model.generate(inputs, max_new_tokens=512, temperature=0.7, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))
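
# --- Alternative loading path (a minimal sketch, not the only way) ---
# Assumption: the adapter folder also contains adapter_config.json (written by
# save_pretrained during training). If so, Unsloth can resolve the base model
# from the adapter config and attach the trained LoRA in a single call, which
# replaces the get_peft_model / load_adapter steps above:
#
# model, tokenizer = FastLanguageModel.from_pretrained(
#     "/home/saad/Downloads/phi3-mini-lora-only/content/phi3-mini-lora-only",
#     max_seq_length=2048,
#     dtype=None,
#     load_in_4bit=True,
# )
# FastLanguageModel.for_inference(model)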