# test.py
from unsloth import FastLanguageModel
import torch

# Load the base 4-bit model (downloads once, ~7.5 GB, then cached forever)
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Recreate the LoRA architecture used during training; the fine-tuned
# weights are loaded afterwards from adapter_model.safetensors
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=128,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=False,
)

# Point directly to your folder (where adapter_model.safetensors is).
# PEFT's load_adapter requires an adapter name; activate it afterwards so the
# freshly initialized "default" adapter from get_peft_model is not used.
adapter_path = "/home/saad/Downloads/phi3-mini-lora-only/content/phi3-mini-lora-only"
model.load_adapter(adapter_path, adapter_name="finetuned")
model.set_adapter("finetuned")

FastLanguageModel.for_inference(model)

# Test it
messages = [{"role": "user", "content": "delete duplicate entries in a table using two columns"}]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")
outputs = model.generate(inputs, max_new_tokens=512, temperature=0.7, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))
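
# --- Alternative loading path (a minimal sketch, not the only way) ---
# Assumption: the adapter folder also contains adapter_config.json (written by
# save_pretrained during training). If so, Unsloth can resolve the base model
# from the adapter config and attach the trained LoRA in a single call, which
# replaces the get_peft_model / load_adapter steps above:
#
# model, tokenizer = FastLanguageModel.from_pretrained(
#     "/home/saad/Downloads/phi3-mini-lora-only/content/phi3-mini-lora-only",
#     max_seq_length=2048,
#     dtype=None,
#     load_in_4bit=True,
# )
# FastLanguageModel.for_inference(model)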