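"""Run a PPO fine-tuning pass over the Kantian critic model using accumulated human
feedback, swap the updated checkpoint into place locally, and push it to the Hugging
Face Hub with a per-run version tag."""
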
import json
import os
import shutil
from datetime import datetime

import torch
from datasets import Dataset
from dotenv import load_dotenv
from huggingface_hub import HfApi, login, metadata_update
from transformers import AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, create_reference_model

from reward_model_loader import load_reward_pipeline

load_dotenv()

FEEDBACK_FILE = "feedback.json"
MODEL_PATH = "./current_model"
PPO_OUTPUT = "./ppo_model_temp"
REWARD_PATH = "./reward_model"
HF_TOKEN = os.getenv("HF_TOKEN")
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "modular-ai/kantian-critic-qwen")

BASE_MODEL = "modular-ai/qwen"

KANTIAN_SYSTEM_PROMPT = """You are Kantian - an ADVERSARIAL CRITIC whose job is to challenge and test arguments.

ADVERSARIAL MODE:
1. Challenge the document's arguments systematically.
2. Be critically rigorous - identify flaws and weaknesses.
3. Quote exact text when making critiques.
4. Attack logical fallacies and poor reasoning directly.
5. Your goal: Test arguments through adversarial analysis, not validate them.

Apply Kantian framework: universalizability, human dignity, moral duty over consequences.
"""

# Feedback gate: only run a PPO pass once enough human feedback has accumulated.
if not os.path.exists(FEEDBACK_FILE):
    print(f"No feedback data found at {FEEDBACK_FILE}.")
    raise SystemExit()

with open(FEEDBACK_FILE, "r") as f:
    data = json.load(f)

if len(data) < 100:
    print(f"Need 100+ samples. Current: {len(data)}")
    raise SystemExit()

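# Each feedback record is assumed to look roughly like
#   {"prompt": "...", "text_feedback": "optional free-text comment from the reviewer", ...}
# Only "prompt" and "text_feedback" are used below; any other fields are ignored.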
# Build PPO prompts from the most recent feedback entries.
prompts_data = data[-64:]
prompts = []
for d in prompts_data:
    prompt_text = d["prompt"]
    text_feedback = d.get("text_feedback", "")

    # If the stored prompt embeds a question, train on the question portion only.
    if "Question:" in prompt_text:
        prompt_text = prompt_text.split("Question:")[-1].strip()

    if text_feedback:
        prompts.append(f"{KANTIAN_SYSTEM_PROMPT}\n\nFeedback Context: {text_feedback}\n\n{prompt_text}")
    else:
        prompts.append(f"{KANTIAN_SYSTEM_PROMPT}\n{prompt_text}")

dataset = Dataset.from_dict({"prompt": prompts})

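# load_reward_pipeline comes from the local reward_model_loader module (not shown here).
# The reward computation below assumes it returns a text-classification pipeline that
# yields, per input, a list of {"label", "score"} dicts covering both classes, with
# LABEL_1 meaning "good critique" and LABEL_0 meaning "poor critique".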
reward_pipe = load_reward_pipeline(REWARD_PATH)

# Resume from the latest PPO checkpoint if one exists; otherwise start from the base model.
base_model_path = MODEL_PATH if os.path.exists(MODEL_PATH) else BASE_MODEL
print(f"Loading base model: {base_model_path}")
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLMWithValueHead.from_pretrained(base_model_path, trust_remote_code=True)
ref_model = create_reference_model(model)

config = PPOConfig(
    model_name=base_model_path,
    learning_rate=1.41e-5,
    batch_size=8,
    mini_batch_size=4,
    gradient_accumulation_steps=1,
    ppo_epochs=3,
)

# PPOTrainer consumes pre-tokenized queries from its dataloader, so add input_ids while
# keeping the raw prompt text (the loop below reads batch["input_ids"] and batch["prompt"]).
def tokenize(sample):
    sample["input_ids"] = tokenizer.encode(sample["prompt"])
    return sample

dataset = dataset.map(tokenize, batched=False)
dataset.set_format(type="torch")

# Collate batches as plain lists; variable-length prompts cannot be stacked into one tensor.
def collator(data):
    return {key: [d[key] for d in data] for key in data[0]}

ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
)

generation_kwargs = {
    "max_new_tokens": 100,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "pad_token_id": tokenizer.eos_token_id,
}

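# PPO loop: generate a critique for each query, score the prompt/response pair with the
# reward model, and take one PPO optimization step per batch of 8 prompts.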
print("Starting PPO training...")
for batch in ppo_trainer.dataloader:
    query_tensors = batch["input_ids"]

    # Generate responses only; return_prompt=False keeps the query out of the response tensors.
    response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs)
    responses = [tokenizer.decode(r, skip_special_tokens=True) for r in response_tensors]

    # Reward = P(good critique) - P(poor critique) for each prompt/response pair.
    texts = [f"Prompt: {p} Response: {r}" for p, r in zip(batch["prompt"], responses)]
    pipe_outputs = reward_pipe(texts)
    rewards = []
    for out in pipe_outputs:
        pos_score = next((s["score"] for s in out if s["label"] == "LABEL_1"), 0.0)
        neg_score = next((s["score"] for s in out if s["label"] == "LABEL_0"), 0.0)
        rewards.append(torch.tensor(pos_score - neg_score))

    ppo_trainer.step(query_tensors, response_tensors, rewards)

# Save the model and tokenizer to a temp dir, then swap it in place of the previous checkpoint.
ppo_trainer.save_pretrained(PPO_OUTPUT)
if os.path.exists(MODEL_PATH):
    shutil.rmtree(MODEL_PATH)
os.rename(PPO_OUTPUT, MODEL_PATH)
print(f"PPO model updated at {MODEL_PATH}")

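# Publish the new checkpoint. Every upload is a separate commit on the Hub repo, so older
# versions stay reachable through the commit history; the version tag records the sample
# count and timestamp of this run.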
if HF_TOKEN:
    try:
        login(token=HF_TOKEN)
        api = HfApi()

        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        version_tag = f"v-{len(data)}-samples-{timestamp}"

        print(f"\nPushing fine-tuned model to Hugging Face as version: {version_tag}")
        print(f"Repository: {HF_MODEL_REPO}")

        api.upload_folder(
            folder_path=MODEL_PATH,
            repo_id=HF_MODEL_REPO,
            commit_message=f"PPO fine-tuned on {len(data)} samples - {timestamp}",
            repo_type="model",
        )

        # Best effort: record the version tags in the repo's model-card metadata.
        try:
            metadata_update(
                HF_MODEL_REPO,
                {"tags": [version_tag, f"samples-{len(data)}", "ppo", "kantian-critic", "qwen"]},
                overwrite=True,
                token=HF_TOKEN,
            )
        except Exception:
            pass

        print(f"✓ Model pushed to {HF_MODEL_REPO}")
        print(f"  Version tag: {version_tag}")
        print("  All previous versions remain accessible via commit history")
        print(f"  Access at: https://huggingface.co/{HF_MODEL_REPO}")
    except Exception as e:
        print(f"Warning: Could not push to Hugging Face: {e}")
else:
    print("Warning: HF_TOKEN not set, skipping model upload")