# Page header (not code): nathanael-fijalkow's picture
# Commit message: First version ready with webhook and deterministic eval
# Commit: cb44915
"""
Training script for the Chess Challenge.
This script provides a complete training pipeline using the Hugging Face Trainer.
Students can modify this script to experiment with different training strategies.
"""
from __future__ import annotations
import argparse
import os
import warnings
from pathlib import Path
# Suppress warnings from third-party libraries (multiprocess has Python 3.14 compat issues)
warnings.filterwarnings("ignore", message="'return' in a 'finally' block")
import torch
from transformers import (
Trainer,
TrainingArguments,
set_seed,
)
from data import ChessDataCollator, create_train_val_datasets
from model import ChessConfig, ChessForCausalLM
from tokenizer import ChessTokenizer
def count_parameters(model, trainable_only=True):
    """Return the summed element count of a model's parameters.

    Args:
        model: Object exposing ``parameters()``, an iterable of items that
            provide ``numel()`` and a ``requires_grad`` attribute.
        trainable_only: When true (the default), parameters whose
            ``requires_grad`` is falsy are left out of the total.

    Returns:
        The sum of ``numel()`` over the selected parameters (``0`` if none).
    """
    total = 0
    for param in model.parameters():
        # Skip frozen parameters only when the caller asked for trainable ones.
        if trainable_only and not param.requires_grad:
            continue
        total += param.numel()
    return total
def parse_args():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description="Train a chess-playing language model"
)
# Model arguments
parser.add_argument(
"--n_embd", type=int, default=128,
help="Embedding dimension"
)
parser.add_argument(
"--n_layer", type=int, default=4,
help="Number of transformer layers"
)
parser.add_argument(
"--n_head", type=int, default=4,
help="Number of attention heads"
)
parser.add_argument(
"--n_ctx", type=int, default=256,
help="Maximum context length"
)
parser.add_argument(
"--n_inner", type=int, default=None,
help="Feed-forward inner dimension (default: 4 * n_embd)"
)
parser.add_argument(
"--dropout", type=float, default=0.1,
help="Dropout probability"
)
parser.add_argument(
"--no_tie_weights", action="store_true",
help="Disable weight tying between embedding and output layers"
)
# Data arguments
parser.add_argument(
"--dataset_name", type=str, default="dlouapre/lichess_2025-01_1M",
help="Name of the dataset on Hugging Face Hub"
)
parser.add_argument(
"--max_train_samples", type=int, default=None,
help="Maximum number of training samples"
)
parser.add_argument(
"--val_samples", type=int, default=5000,
help="Number of validation samples"
)
# Training arguments
parser.add_argument(
"--output_dir", type=str, default="./output",
help="Output directory for model and logs"
)
parser.add_argument(
"--num_train_epochs", type=int, default=3,
help="Number of training epochs"
)
parser.add_argument(
"--per_device_train_batch_size", type=int, default=32,
help="Training batch size per device"
)
parser.add_argument(
"--per_device_eval_batch_size", type=int, default=64,
help="Evaluation batch size per device"
)
parser.add_argument(
"--learning_rate", type=float, default=5e-4,
help="Learning rate"
)
parser.add_argument(
"--weight_decay", type=float, default=0.01,
help="Weight decay"
)
parser.add_argument(
"--warmup_ratio", type=float, default=0.1,
help="Warmup ratio"
)
parser.add_argument(
"--seed", type=int, default=42,
help="Random seed"
)
# Logging arguments
parser.add_argument(
"--logging_steps", type=int, default=100,
help="Logging frequency"
)
parser.add_argument(
"--eval_steps", type=int, default=500,
help="Evaluation frequency"
)
parser.add_argument(
"--save_steps", type=int, default=1000,
help="Checkpoint saving frequency"
)
return parser.parse_args()
def _add_auto_map(json_path, auto_map):
    """Set the top-level ``"auto_map"`` key of the JSON file at *json_path*.

    The file is loaded, the key is assigned (replacing any existing value)
    and the document is written back with two-space indentation.
    """
    import json  # function-scope import, as in the rest of this script

    # Explicit UTF-8 so reading does not depend on the platform locale.
    with open(json_path, encoding="utf-8") as f:
        payload = json.load(f)
    payload["auto_map"] = auto_map
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def _export_remote_code(final_model_dir):
    """Copy model/tokenizer sources next to the weights and register them.

    Copies ``model.py`` and ``tokenizer.py`` from this script's directory
    into *final_model_dir*, then adds ``auto_map`` entries to ``config.json``
    and ``tokenizer_config.json`` in that directory.
    """
    import shutil

    # Copy model.py and tokenizer.py for trust_remote_code loading
    script_dir = Path(__file__).parent
    shutil.copy(script_dir / "model.py", final_model_dir)
    shutil.copy(script_dir / "tokenizer.py", final_model_dir)
    print(" Copied model.py and tokenizer.py")

    # Add auto_map to config.json for AutoModelForCausalLM
    _add_auto_map(
        os.path.join(final_model_dir, "config.json"),
        {
            "AutoConfig": "model.ChessConfig",
            "AutoModelForCausalLM": "model.ChessForCausalLM",
        },
    )
    print(" Added auto_map to config.json")

    # Add auto_map to tokenizer_config.json for AutoTokenizer
    _add_auto_map(
        os.path.join(final_model_dir, "tokenizer_config.json"),
        {
            "AutoTokenizer": ["tokenizer.ChessTokenizer", None],
        },
    )
    print(" Added auto_map to tokenizer_config.json")


def main():
    """Main training function.

    Parses the command line, seeds the RNGs, builds the tokenizer vocabulary
    from the dataset, creates the model and datasets, trains with the
    Hugging Face ``Trainer``, and saves the result (model, tokenizer, source
    files and ``auto_map`` entries) under ``<output_dir>/final_model``.

    NOTE(review): ``args.eval_steps`` and ``args.save_steps`` are parsed but
    never read here; evaluation and checkpointing both run once per epoch.
    """
    import inspect

    args = parse_args()

    # Set seed for reproducibility
    set_seed(args.seed)

    print("=" * 60)
    print("CHESS CHALLENGE - TRAINING")
    print("=" * 60)

    # Build tokenizer from dataset
    print("\nBuilding tokenizer from dataset...")
    tokenizer = ChessTokenizer.build_vocab_from_dataset(
        dataset_name=args.dataset_name,
        min_frequency=500,  # Only keep moves that appear at least 500 times
        max_samples=100000,  # Use 100k games to build vocabulary
    )
    print(f" Vocabulary size: {tokenizer.vocab_size}")

    # Create model configuration; the vocabulary size always comes from the
    # tokenizer that was just built (there is no command line override).
    print("\nCreating model configuration...")
    config = ChessConfig(
        vocab_size=tokenizer.vocab_size,
        n_embd=args.n_embd,
        n_layer=args.n_layer,
        n_head=args.n_head,
        n_ctx=args.n_ctx,
        n_inner=args.n_inner,
        dropout=args.dropout,
        tie_weights=not args.no_tie_weights,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Print configuration
    print("\nModel configuration:")
    print(f" vocab_size: {config.vocab_size}")
    print(f" n_embd: {config.n_embd}")
    print(f" n_layer: {config.n_layer}")
    print(f" n_head: {config.n_head}")
    print(f" tie_weights: {config.tie_weights}")

    # Create model
    print("\nCreating model...")
    model = ChessForCausalLM(config)
    # count_parameters defaults to trainable parameters only.
    n_params = count_parameters(model)
    print(f" Total parameters: {n_params:,}")
    if n_params > 1_000_000:
        print("WARNING: Model exceeds 1M parameter limit!")
    else:
        print("OK: Model is within 1M parameter limit")

    # Load datasets
    print("\nLoading datasets...")
    train_dataset, val_dataset = create_train_val_datasets(
        tokenizer=tokenizer,
        dataset_name=args.dataset_name,
        max_length=args.n_ctx,
        train_samples=args.max_train_samples,
        val_samples=args.val_samples,
    )
    print(f" Training samples: {len(train_dataset):,}")
    print(f" Validation samples: {len(val_dataset):,}")

    # Create data collator
    data_collator = ChessDataCollator(tokenizer, max_length=args.n_ctx)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        warmup_ratio=args.warmup_ratio,
        logging_dir=os.path.join(args.output_dir, "logs"),
        logging_steps=args.logging_steps,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        seed=args.seed,
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        report_to=["none"],
    )

    # Newer transformers releases take the tokenizer as `processing_class`
    # and deprecate the `tokenizer` keyword; use whichever name the
    # installed Trainer accepts so the script works across versions.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    if "processing_class" in trainer_params:
        tokenizer_kwarg = "processing_class"
    else:
        tokenizer_kwarg = "tokenizer"

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        **{tokenizer_kwarg: tokenizer},
    )

    # Train
    print("\nStarting training...")
    trainer.train()

    # Save final model
    print("\nSaving final model...")
    final_model_dir = os.path.join(args.output_dir, "final_model")
    trainer.save_model(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)

    # Make the saved directory loadable through the Auto* classes.
    _export_remote_code(final_model_dir)

    print("\nTraining complete!")
    print(f" Model saved to: {final_model_dir}")
    print(" Ready for submission with: python submit.py --model_path " + final_model_dir)
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()