# train_ppo.py
import json
import torch
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from transformers import AutoTokenizer
from datasets import Dataset
from reward_model_loader import load_reward_pipeline
from huggingface_hub import login, HfApi, metadata_update
import os
import shutil
from datetime import datetime
from dotenv import load_dotenv
load_dotenv()
FEEDBACK_FILE = "feedback.json"
MODEL_PATH = "./current_model"
PPO_OUTPUT = "./ppo_model_temp"
REWARD_PATH = "./reward_model"
HF_TOKEN = os.getenv("HF_TOKEN")
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "modular-ai/kantian-critic-qwen")
# BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct" # Smaller 0.5B model
BASE_MODEL = "modular-ai/qwen"
# Kantian System Prompt for PPO training
KANTIAN_SYSTEM_PROMPT = """You are Kantian - an ADVERSARIAL CRITIC whose job is to challenge and test arguments.
ADVERSARIAL MODE:
1. Challenge the document's arguments systematically.
2. Be critically rigorous - identify flaws and weaknesses.
3. Quote exact text when making critiques.
4. Attack logical fallacies and poor reasoning directly.
5. Your goal: Test arguments through adversarial analysis, not validate them.
Apply Kantian framework: universalizability, human dignity, moral duty over consequences.
"""
# Load data
if not os.path.exists(FEEDBACK_FILE):
    print("No data.")
    exit()

with open(FEEDBACK_FILE, "r") as f:
    data = json.load(f)

if len(data) < 100:
    print(f"Need 100+ samples. Current: {len(data)}")
    exit()
# Use recent prompts (Kantian critique contexts)
# Include text feedback for better training signal
prompts_data = data[-64:] # Batch-friendly
prompts = []
for d in prompts_data:
    # Extract just the user question part if it exists
    prompt_text = d["prompt"]
    text_feedback = d.get("text_feedback", "")
    if "Question:" in prompt_text:
        # Extract the question part for Kantian critique generation
        question = prompt_text.split("Question:")[-1].strip()
        # Prepend Kantian context and feedback if available
        if text_feedback:
            prompts.append(f"{KANTIAN_SYSTEM_PROMPT}\n\nFeedback Context: {text_feedback}\n\n{question}")
        else:
            prompts.append(f"{KANTIAN_SYSTEM_PROMPT}\n{question}")
    else:
        if text_feedback:
            prompts.append(f"{KANTIAN_SYSTEM_PROMPT}\n\nFeedback Context: {text_feedback}\n\n{prompt_text}")
        else:
            prompts.append(f"{KANTIAN_SYSTEM_PROMPT}\n{prompt_text}")
dataset = Dataset.from_dict({"prompt": prompts})
# Load reward model
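# The reward pipeline is a text classifier; in the PPO loop below, its class scores
# on "Prompt: ... Response: ..." pairs are converted into scalar rewards.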
reward_pipe = load_reward_pipeline(REWARD_PATH)
# Load base model
base_model_path = MODEL_PATH if os.path.exists(MODEL_PATH) else BASE_MODEL
print(f"Loading base model: {base_model_path}")
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
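
# NOTE (assumption): TRL's classic PPOTrainer pulls pre-tokenized queries from its
# dataloader, so tokenize the prompts up front and keep tensors in torch format.
def tokenize(sample):
    sample["input_ids"] = tokenizer.encode(sample["prompt"])
    return sample

dataset = dataset.map(tokenize)
dataset.set_format(type="torch")

# Simple collator (as in the TRL PPO examples): keeps string columns such as
# "prompt" as plain Python lists instead of trying to pad them.
def collator(data):
    return {key: [d[key] for d in data] for key in data[0]}
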
model = AutoModelForCausalLMWithValueHead.from_pretrained(base_model_path, trust_remote_code=True)
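# Frozen reference copy of the policy, used for PPO's KL penalty so the fine-tuned
# model does not drift too far from its starting point.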
ref_model = create_reference_model(model)
config = PPOConfig(
    model_name=base_model_path,
    learning_rate=1.41e-5,
    batch_size=8,
    mini_batch_size=4,
    gradient_accumulation_steps=1,
    ppo_epochs=3,
)
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
)
generation_kwargs = {
    "max_new_tokens": 100,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "pad_token_id": tokenizer.eos_token_id,
}
print("Starting PPO training...")
for batch in ppo_trainer.dataloader:
    query_tensors = batch["input_ids"]
    # Generate only the continuation; return_prompt=False strips the query tokens
    # so ppo_trainer.step() receives queries and responses separately.
    response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs)
    responses = [tokenizer.decode(r, skip_special_tokens=True) for r in response_tensors]
    # Compute rewards: positive-class score minus negative-class score from the reward classifier
    texts = [f"Prompt: {p} Response: {r}" for p, r in zip(batch["prompt"], responses)]
    pipe_outputs = reward_pipe(texts)
    rewards = []
    for out in pipe_outputs:
        pos_score = next((s["score"] for s in out if s["label"] == "LABEL_1"), 0.0)
        neg_score = next((s["score"] for s in out if s["label"] == "LABEL_0"), 0.0)
        reward = pos_score - neg_score
        rewards.append(torch.tensor(reward))
    ppo_trainer.step(query_tensors, response_tensors, rewards)
ppo_trainer.save_pretrained(PPO_OUTPUT)  # saves both the model and the tokenizer
if os.path.exists(MODEL_PATH):
    shutil.rmtree(MODEL_PATH)
os.rename(PPO_OUTPUT, MODEL_PATH)
print(f"PPO model updated at {MODEL_PATH}")
# Push to Hugging Face with version tag
if HF_TOKEN:
    try:
        login(token=HF_TOKEN)
        api = HfApi()
        # Create version tag based on timestamp and sample count
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        version_tag = f"v-{len(data)}-samples-{timestamp}"
        print(f"\nPushing fine-tuned model to Hugging Face as version: {version_tag}")
        print(f"Repository: {HF_MODEL_REPO}")
        # Push model to HF Hub (creates a new commit while preserving old versions)
        api.upload_folder(
            folder_path=MODEL_PATH,
            repo_id=HF_MODEL_REPO,
            commit_message=f"PPO fine-tuned on {len(data)} samples - {timestamp}",
            repo_type="model",
        )
        # Tag the new commit so this version stays addressable, and record topical
        # tags in the model card metadata
        try:
            api.create_tag(repo_id=HF_MODEL_REPO, tag=version_tag, repo_type="model")
            metadata_update(
                HF_MODEL_REPO,
                {"tags": [f"samples-{len(data)}", "ppo", "kantian-critic", "qwen"]},
                overwrite=True,
            )
        except Exception:
            pass  # Tagging may fail on some repos; non-critical
        print(f"✓ Model pushed to {HF_MODEL_REPO}")
        print(f" Version tag: {version_tag}")
        print(" All previous versions remain accessible via commit history")
        print(f" Access at: https://huggingface.co/{HF_MODEL_REPO}")
    except Exception as e:
        print(f"Warning: Could not push to Hugging Face: {e}")
else:
    print("Warning: HF_TOKEN not set, skipping model upload")