# train_reward.py
import json
import os
from datetime import datetime

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from huggingface_hub import login, HfApi
from dotenv import load_dotenv

load_dotenv()

FEEDBACK_FILE = "feedback.json"
REWARD_MODEL_PATH = "./reward_model"
HF_TOKEN = os.getenv("HF_TOKEN")
HF_REWARD_REPO = os.getenv("HF_REWARD_REPO", "modular-ai/kantian-reward-model")

# Kantian Persona Context
KANTIAN_CONTEXT = """Kantian Adversarial Critic - Personality: duty-focused, universality-tester, moral-consistency-seeker, rights-defender.

ADVERSARIAL CRITIQUE MODE:
1. Challenge arguments systematically
2. Identify flaws and weaknesses rigorously
3. Quote exact text when critiquing
4. Attack logical fallacies directly
5. Test through adversarial analysis

Evaluates critiques based on:
- Strength of adversarial challenge
- Rigor in identifying weaknesses
- Application of Kantian principles
- Systematic argument testing
"""

if not os.path.exists(FEEDBACK_FILE):
    print("No feedback data.")
    exit()

with open(FEEDBACK_FILE, "r") as f:
    data = json.load(f)

if len(data) < 50:
    print(f"Need 50+ samples. Current: {len(data)}")
    exit()

# Prepare - format samples for Kantian critique training.
# Include text feedback when available for richer training signal.
print(f"Processing {len(data)} feedback samples...")

texts = []
for d in data:
    prompt = d['prompt']
    response = d['response']
    text_feedback = d.get('text_feedback', '')

    # Create training text that captures Kantian critique quality
    if text_feedback:
        # Include detailed feedback for more nuanced training
        text = f"{KANTIAN_CONTEXT}\n\n{prompt}\n\nKantian Critique: {response}\n\nDetailed Feedback: {text_feedback}"
    else:
        text = f"{KANTIAN_CONTEXT}\n\n{prompt}\n\nKantian Critique: {response}"
    texts.append(text)

labels = [int(d['reward']) for d in data]  # 0 (not helpful) or 1 (helpful)

dataset = Dataset.from_dict({"text": texts, "label": labels})

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_fn(examples):
    # Pad to max_length so every example has the same length regardless of map batching
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized = dataset.map(tokenize_fn, batched=True)
splits = tokenized.train_test_split(test_size=0.2)

# Model + LoRA
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_lin", "v_lin"],  # DistilBERT attention projections (it has no "query"/"value" modules)
)
model = get_peft_model(model, peft_config)

args = TrainingArguments(
    output_dir=REWARD_MODEL_PATH,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs_reward",
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
)

print("Training reward model...")
trainer.train()
trainer.save_model(REWARD_MODEL_PATH)
print(f"Reward model saved to {REWARD_MODEL_PATH}")

# Push to Hugging Face with version tag
if HF_TOKEN:
    try:
        login(token=HF_TOKEN)
        api = HfApi()

        # Create version tag based on timestamp and sample count
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        version_tag = f"v-{len(data)}-samples-{timestamp}"

        print(f"Pushing reward model to Hugging Face as version: {version_tag}")

        # Push the adapter and tokenizer, then tag the new commit so this
        # version stays addressable while older ones remain intact
        model.push_to_hub(
            HF_REWARD_REPO,
            commit_message=f"Reward model trained on {len(data)} samples - {timestamp}",
        )
        tokenizer.push_to_hub(HF_REWARD_REPO)
        api.create_tag(HF_REWARD_REPO, tag=version_tag)

        print(f"✓ Reward model pushed to {HF_REWARD_REPO} with tag: {version_tag}")
        print("  Old versions remain accessible on Hugging Face")
    except Exception as e:
        print(f"Warning: Could not push to Hugging Face: {e}")
else:
    print("Warning: HF_TOKEN not set, skipping model upload")