# NOTE: removed non-Python scrape residue ("Spaces: Running / Running") left
# over from a Hugging Face Spaces page capture; it is not part of the script.
| """ | |
| Training script for the Chess Challenge. | |
| This script provides a complete training pipeline using the Hugging Face Trainer. | |
| Students can modify this script to experiment with different training strategies. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import os | |
| import warnings | |
| from pathlib import Path | |
| # Suppress warnings from third-party libraries (multiprocess has Python 3.14 compat issues) | |
| warnings.filterwarnings("ignore", message="'return' in a 'finally' block") | |
| import torch | |
| from transformers import ( | |
| Trainer, | |
| TrainingArguments, | |
| set_seed, | |
| ) | |
| from data import ChessDataCollator, create_train_val_datasets | |
| from model import ChessConfig, ChessForCausalLM | |
| from tokenizer import ChessTokenizer | |
def count_parameters(model, trainable_only=True):
    """Return the number of parameters in *model*.

    Args:
        model: Any ``torch.nn.Module``.
        trainable_only: When True (the default), count only parameters with
            ``requires_grad`` set; otherwise count every parameter.

    Returns:
        Total element count across the selected parameters.
    """
    params = model.parameters()
    if trainable_only:
        # Restrict the sum to parameters the optimizer would actually update.
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)
def parse_args():
    """Build the command-line parser and return the parsed arguments.

    Returns:
        argparse.Namespace holding the model, data, training, and logging
        options for the training run.
    """
    parser = argparse.ArgumentParser(
        description="Train a chess-playing language model"
    )

    # Model hyper-parameters, declared as (flag, type, default, help) rows.
    model_specs = [
        ("--n_embd", int, 128, "Embedding dimension"),
        ("--n_layer", int, 4, "Number of transformer layers"),
        ("--n_head", int, 4, "Number of attention heads"),
        ("--n_ctx", int, 256, "Maximum context length"),
        ("--n_inner", int, None,
         "Feed-forward inner dimension (default: 4 * n_embd)"),
        ("--dropout", float, 0.1, "Dropout probability"),
    ]
    for flag, flag_type, default, help_text in model_specs:
        parser.add_argument(flag, type=flag_type, default=default,
                            help=help_text)

    # Boolean switch declared separately: it takes no value.
    parser.add_argument(
        "--no_tie_weights", action="store_true",
        help="Disable weight tying between embedding and output layers",
    )

    # Data, training, and logging options as (flag, type, default, help) rows.
    other_specs = [
        ("--dataset_name", str, "dlouapre/lichess_2025-01_1M",
         "Name of the dataset on Hugging Face Hub"),
        ("--max_train_samples", int, None,
         "Maximum number of training samples"),
        ("--val_samples", int, 5000, "Number of validation samples"),
        ("--output_dir", str, "./output",
         "Output directory for model and logs"),
        ("--num_train_epochs", int, 3, "Number of training epochs"),
        ("--per_device_train_batch_size", int, 32,
         "Training batch size per device"),
        ("--per_device_eval_batch_size", int, 64,
         "Evaluation batch size per device"),
        ("--learning_rate", float, 5e-4, "Learning rate"),
        ("--weight_decay", float, 0.01, "Weight decay"),
        ("--warmup_ratio", float, 0.1, "Warmup ratio"),
        ("--seed", int, 42, "Random seed"),
        ("--logging_steps", int, 100, "Logging frequency"),
        ("--eval_steps", int, 500, "Evaluation frequency"),
        ("--save_steps", int, 1000, "Checkpoint saving frequency"),
    ]
    for flag, flag_type, default, help_text in other_specs:
        parser.add_argument(flag, type=flag_type, default=default,
                            help=help_text)

    return parser.parse_args()
def main():
    """Run the full training pipeline.

    Steps: seed RNGs, build the tokenizer vocabulary from the dataset,
    construct the model, load train/validation splits, train with the
    Hugging Face Trainer, then save a self-contained model directory
    (weights, tokenizer, source files, and ``auto_map`` entries so it can
    be loaded with ``trust_remote_code=True``).
    """
    args = parse_args()
    # Set seed for reproducibility across torch / numpy / python RNGs.
    set_seed(args.seed)
    print("=" * 60)
    print("CHESS CHALLENGE - TRAINING")
    print("=" * 60)
    # Build tokenizer vocabulary from a sample of the dataset.
    print("\nBuilding tokenizer from dataset...")
    tokenizer = ChessTokenizer.build_vocab_from_dataset(
        dataset_name=args.dataset_name,
        min_frequency=500,  # Only keep moves that appear at least 500 times
        max_samples=100000,  # Use 100k games to build vocabulary
    )
    print(f" Vocabulary size: {tokenizer.vocab_size}")
    # The model's vocab size must match the tokenizer we just built.
    actual_vocab_size = tokenizer.vocab_size
    # Create model configuration from CLI arguments.
    print("\nCreating model configuration...")
    config = ChessConfig(
        vocab_size=actual_vocab_size,
        n_embd=args.n_embd,
        n_layer=args.n_layer,
        n_head=args.n_head,
        n_ctx=args.n_ctx,
        n_inner=args.n_inner,
        dropout=args.dropout,
        tie_weights=not args.no_tie_weights,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Print configuration summary for the run log.
    print(f"\nModel configuration:")
    print(f" vocab_size: {config.vocab_size}")
    print(f" n_embd: {config.n_embd}")
    print(f" n_layer: {config.n_layer}")
    print(f" n_head: {config.n_head}")
    print(f" tie_weights: {config.tie_weights}")
    # Instantiate the model and check it against the challenge's size budget.
    print("\nCreating model...")
    model = ChessForCausalLM(config)
    n_params = count_parameters(model)
    print(f" Total parameters: {n_params:,}")
    if n_params > 1_000_000:
        # Informational only: training proceeds even over the limit.
        print("WARNING: Model exceeds 1M parameter limit!")
    else:
        print("OK: Model is within 1M parameter limit")
    # Load and tokenize train/validation datasets.
    print("\nLoading datasets...")
    train_dataset, val_dataset = create_train_val_datasets(
        tokenizer=tokenizer,
        dataset_name=args.dataset_name,
        max_length=args.n_ctx,
        train_samples=args.max_train_samples,
        val_samples=args.val_samples,
    )
    print(f" Training samples: {len(train_dataset):,}")
    print(f" Validation samples: {len(val_dataset):,}")
    # Collator pads/truncates batches to the model's context length.
    data_collator = ChessDataCollator(tokenizer, max_length=args.n_ctx)
    # Training arguments for the Hugging Face Trainer.
    # NOTE(review): --eval_steps and --save_steps are parsed but unused here
    # because eval_strategy/save_strategy are "epoch" — confirm whether
    # step-based scheduling was intended.
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        warmup_ratio=args.warmup_ratio,
        logging_dir=os.path.join(args.output_dir, "logs"),
        logging_steps=args.logging_steps,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,  # Keep only the 3 most recent checkpoints.
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,  # Lower eval_loss is better.
        seed=args.seed,
        # Enable bfloat16 mixed precision only on CUDA hardware that supports it.
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        report_to=["none"],  # Disable wandb/tensorboard reporting backends.
    )
    # Create trainer.
    # NOTE(review): the ``tokenizer=`` keyword is deprecated in recent
    # transformers releases in favor of ``processing_class`` — confirm the
    # installed version before upgrading.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    # Train (resumes from scratch; checkpoints land under output_dir).
    print("\nStarting training...")
    trainer.train()
    # Save the final (best, per load_best_model_at_end) model + tokenizer.
    print("\nSaving final model...")
    final_model_dir = os.path.join(args.output_dir, "final_model")
    trainer.save_model(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)
    # Copy model.py and tokenizer.py so the saved directory is loadable
    # with trust_remote_code=True without the original repo.
    import shutil
    import json
    script_dir = Path(__file__).parent
    shutil.copy(script_dir / "model.py", final_model_dir)
    shutil.copy(script_dir / "tokenizer.py", final_model_dir)
    print(" Copied model.py and tokenizer.py")
    # Add auto_map to config.json so AutoModelForCausalLM resolves the
    # custom classes from the copied model.py.
    config_path = os.path.join(final_model_dir, "config.json")
    with open(config_path) as f:
        config_dict = json.load(f)
    config_dict["auto_map"] = {
        "AutoConfig": "model.ChessConfig",
        "AutoModelForCausalLM": "model.ChessForCausalLM",
    }
    with open(config_path, "w") as f:
        json.dump(config_dict, f, indent=2)
    print(" Added auto_map to config.json")
    # Add auto_map to tokenizer_config.json so AutoTokenizer resolves the
    # custom tokenizer (slow-tokenizer slot only, hence the trailing None).
    tokenizer_config_path = os.path.join(final_model_dir, "tokenizer_config.json")
    with open(tokenizer_config_path) as f:
        tokenizer_dict = json.load(f)
    tokenizer_dict["auto_map"] = {
        "AutoTokenizer": ["tokenizer.ChessTokenizer", None],
    }
    with open(tokenizer_config_path, "w") as f:
        json.dump(tokenizer_dict, f, indent=2)
    print(" Added auto_map to tokenizer_config.json")
    print("\nTraining complete!")
    print(f" Model saved to: {final_model_dir}")
    print(" Ready for submission with: python submit.py --model_path " + final_model_dir)
# Standard script entry point: run training only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()