import unsloth  # Import Unsloth first so its patches/optimizations are applied
from transformers import pipeline
from unsloth import FastLanguageModel  # Use FastVisionModel instead for vision-language models

MODEL_NAME = "unsloth/Phi-4-unsloth-bnb-4bit"  # Base model the adapter was trained on
model_id = "Machlovi/Safe_Phi4"  # LoRA fine-tuned adapter
max_seq_length = 2048  # Any length works; Unsloth handles RoPE scaling internally
load_in_4bit = True


def load_model():
    """Load the base model and LoRA adapter in 4-bit with Unsloth and wrap them in a pipeline."""
    print("Loading base model with Unsloth...")
    # Unsloth resolves the base model from the adapter config and loads it in 4-bit
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_id,
        max_seq_length=max_seq_length,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)  # Switch Unsloth into its faster inference mode
    print("Creating text generation pipeline...")
    text_gen_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return text_gen_pipeline


# Load the model once at import time so it is not reloaded on every request
pipe = load_model()


def infer(prompt: str, max_new_tokens: int = 128):
    """Generate text using the Unsloth LoRA-adapted model and return the generated string."""
    return pipe(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
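
# A minimal usage sketch: running the module as a script calls infer() once.
# The prompt and token budget below are illustrative assumptions, not part of the original script.
if __name__ == "__main__":
    demo_prompt = "Is it safe to share my account password with a coworker?"  # hypothetical example prompt
    print(infer(demo_prompt, max_new_tokens=64))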