# test.py
from unsloth import FastLanguageModel
import torch

# Load base 4-bit model (downloads once ~7.5GB, then cached forever)
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
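
# Optional sanity check -- not part of the original script, and the folder layout
# is assumed from the adapter path used below: verify the PEFT files exist before
# trying to load them, so a wrong path fails early with a clear message.
import os
adapter_dir = "/home/saad/Downloads/phi3-mini-lora-only/content/phi3-mini-lora-only"
for fname in ("adapter_config.json", "adapter_model.safetensors"):
    if not os.path.isfile(os.path.join(adapter_dir, fname)):
        raise FileNotFoundError(f"{fname} not found in {adapter_dir}")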

# Load YOUR fine-tuned LoRA (super fast, uses adapter_model.safetensors)
# Point directly to your folder (where adapter_model.safetensors is).
# Note: no need to call get_peft_model() first -- that would attach a fresh,
# untrained "default" adapter. PeftModel.from_pretrained reads r / lora_alpha /
# target_modules from the saved adapter_config.json and loads the trained weights.
from peft import PeftModel
model = PeftModel.from_pretrained(
    model,
    "/home/saad/Downloads/phi3-mini-lora-only/content/phi3-mini-lora-only",
)
FastLanguageModel.for_inference(model)

# Test it
messages = [{"role": "user", "content": "delete duplicate entries in a table using two columns"}]
inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
outputs = model.generate(inputs, max_new_tokens=512, temperature=0.7, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))
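
# Optional extras -- a sketch, not part of the original script: decode only the
# newly generated tokens (slice off the prompt), and stream tokens as they are
# produced using transformers' TextStreamer helper.
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))

from transformers import TextStreamer
streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(inputs, streamer=streamer, max_new_tokens=512, temperature=0.7, do_sample=True)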