from SmolLm3 import LlamaModel
import torch
import yaml
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import numpy as np
from datasets import load_dataset
import logging
import math
from utils import upload_file_to_s3

# At the start of training loop
# print(f"GPU Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
# print(f"GPU Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

logger = logging.getLogger(__name__)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler = logging.FileHandler('training.log')
file_handler.setFormatter(formatter)  # Set formatter on the handler, not the logger
logger.addHandler(file_handler)
logger.setLevel(logging.INFO)
def encode_text(examples, tokenizer, seq_length):
    """Tokenize and prepare text examples for training."""
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=seq_length + 1,
        return_tensors="pt",
    )
    # Use clone().detach() as recommended
    input_ids = tokens["input_ids"].squeeze(0).clone().detach()
    input_ids = torch.clamp(input_ids, min=0, max=tokenizer.vocab_size - 1)
    labels = input_ids.clone().detach()
    labels = labels[1:].to(torch.int64)
    input_ids = input_ids[:-1].to(torch.int64)
    return {"input_ids": input_ids, "labels": labels}
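
# The tokenizer produces seq_length + 1 ids so that inputs and labels can be
# offset by one position for next-token prediction. For example, with ids
# [t0, t1, t2, t3]:
#   input_ids = [t0, t1, t2]
#   labels    = [t1, t2, t3]
# i.e. the model learns to predict token i+1 from tokens 0..i.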
def load_cosmopedia_dataset(batch_size=8, seq_length=1024, tokenizer=None):
    """
    Returns a torch dataloader for the cosmopedia dataset
    """
    # Set tokenizer parallelism explicitly
    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    logger.info("tokenizer parallelism set to false")
    try:
        # Increase timeout and retries for dataset loading
        from datasets import config
        config.HF_DATASETS_TIMEOUT = 300  # 5 minutes timeout
        config.MAX_RETRIES = 10  # Increase retry attempts
        logger.info("dataset loading config set")
        train_dataset = load_dataset(
            "HuggingFaceTB/smollm-corpus",
            name="cosmopedia-v2",
            split="train",
            streaming=True,
        )
        logger.info("dataset loaded")
        # Use partial to bind tokenizer and seq_length to the encode function
        from functools import partial
        encode_fn = partial(encode_text, tokenizer=tokenizer, seq_length=seq_length)
        train_dataset = train_dataset.map(
            encode_fn,
            remove_columns=["text"],
            batched=False
        )
        train_dataset = train_dataset.with_format("torch")
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            num_workers=2,
            pin_memory=True,
            prefetch_factor=4,
            persistent_workers=True
        )
        return train_dataloader
    except Exception as e:
        logger.error(f"Error loading dataset: {str(e)}")
        return None
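
# Note: with streaming=True the corpus is read lazily as an IterableDataset, so nothing is
# downloaded up front and the resulting DataLoader has no length; tokenization happens on the
# fly inside the map() call above, one example at a time.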
def generate(model, idx, max_new_tokens, context_length, temperature=1.0, top_k=None, eos_token=None, device=None):
    logger.info(f"Generating on device {device}")
    model = model.to(device)
    idx = idx.to(device)
    model.eval()
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_length:]
        with torch.no_grad():
            logits, _ = model(idx_cond)  # Unpack both logits and loss (ignore loss)
        logits = logits.view(idx_cond.shape[0], -1, model.config['vocab_size'])  # Reshape to [batch, seq, vocab]
        # Get the logits for the last token only
        logits = logits[:, -1, :]  # Shape: [batch_size, vocab_size]
        if top_k is not None:
            # Top-k sampling: mask out everything below the k-th largest logit
            top_logits, top_pos = torch.topk(logits, top_k)
            min_logit = top_logits[:, -1].unsqueeze(-1)
            logits = torch.where(logits < min_logit,
                                 torch.tensor(float('-inf')).to(logits.device),
                                 logits)
        if temperature > 0.0:
            # Temperature scaling followed by multinomial sampling
            logits /= temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            # Greedy decoding
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)
        if idx_next.item() == eos_token:
            break
        idx = torch.cat((idx, idx_next), dim=1)
    model.train()
    return idx
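
# Illustrative usage of generate() (assumes `model` and `tokenizer` are already built, and a
# batch size of 1, since the eos check above calls .item() on a single token):
#   prompt_ids = tokenizer.encode("Once upon a time", return_tensors="pt")
#   out = generate(model, idx=prompt_ids, max_new_tokens=64, context_length=256,
#                  temperature=0.8, top_k=5, eos_token=tokenizer.eos_token_id, device="cuda")
#   print(tokenizer.decode(out.squeeze(0)))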
def sync_device(device):
    if device.startswith('cuda'):
        torch.cuda.synchronize()
    elif device == 'cpu':
        # torch.cpu.synchronize() only exists in newer PyTorch builds
        if hasattr(torch.cpu, 'synchronize'):
            torch.cpu.synchronize()
    elif device.startswith('mps'):  # For Apple Silicon
        torch.mps.synchronize()
def print_gpu_memory(step_name=""):
    """
    Print GPU memory statistics with a specified step name
    """
    if torch.cuda.is_available():
        logger.info(f"\nGPU Memory Stats {step_name}:")
        logger.info(f"GPU Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        logger.info(f"GPU Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        logger.info(f"Max GPU Memory allocated: {torch.cuda.max_memory_allocated() / 1024**2:.2f} MB")
# Learning rate scheduler
def get_lr_lambda(current_step, warmup_steps, max_steps, max_lr):
    """
    Learning rate schedule for LambdaLR:
    1. Linear warmup from 0 to the base LR over the first `warmup_steps` steps
    2. Cosine decay from the base LR down to 5% of it between `warmup_steps` and `max_steps`
    Note: LambdaLR multiplies the optimizer's base LR (set to `max_lr`) by the value returned
    here, so this function returns a ratio rather than an absolute learning rate. With the
    settings used below (warmup 3000, max 60000, base LR 3e-4) the floor is 1.5e-5.
    """
    min_ratio = 0.05  # floor, as a fraction of the base (max) learning rate
    if current_step < warmup_steps:
        # Linear warmup: multiplier goes from 0 to 1
        return float(current_step) / float(max(1, warmup_steps))
    # Cosine decay: multiplier goes from 1 down to min_ratio
    progress = float(current_step - warmup_steps) / float(max(1, max_steps - warmup_steps))
    progress = min(progress, 1.0)  # hold at the floor once max_steps is reached
    return min_ratio + 0.5 * (1.0 - min_ratio) * (1.0 + math.cos(math.pi * progress))
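
# With warmup_steps=3000, max_steps=60000 and a base LR of 3e-4, the multiplier works out to:
#   step 0      -> 0.000  (LR 0)
#   step 1500   -> 0.500  (LR 1.5e-4)
#   step 3000   -> 1.000  (LR 3.0e-4, peak)
#   step 31500  -> 0.525  (LR ~1.6e-4, halfway through the cosine decay)
#   step 60000+ -> 0.050  (LR 1.5e-5, floor)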
def train_model(config, model, train_loader, test_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context="Jack Gisburn rather a cheap genius- ", tokenizer=None):
    total_loss = 0
    tokens_seen, global_step = 0, -1
    # Gradient accumulation setup
    actual_batch_size = config['tokens']['micro_batch_size']
    effective_batch_size_multiplier = 2  # Kept small to maintain reasonable memory usage
    target_batch_size = effective_batch_size_multiplier * config['tokens']['micro_batch_size']
    gradient_accumulation_steps = target_batch_size // actual_batch_size
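    # With a micro_batch_size of 16 (as used by the loader in __main__) and a multiplier of 2,
    # each optimizer step accumulates 16 * 2 = 32 sequences, i.e. 32 * 1024 = 32,768 tokens
    # at seq_length 1024.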
    # Learning rate parameters
    max_lr = 3e-4
    warmup_steps = 3000  # Longer warmup for a longer training run
    max_steps = 60000  # Set to match roughly 10 hours of training
    min_lr = max_lr * 0.05  # Minimum LR is 5% of max
    # Create LambdaLR scheduler with the lambda function defined above
    lr_lambda = lambda step: get_lr_lambda(step, warmup_steps, max_steps, max_lr)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    logger.info("Training with learning rate schedule:")
    logger.info(f"Max LR: {max_lr}")
    logger.info(f"Warmup Steps: {warmup_steps}")
    logger.info(f"Max Steps: {max_steps}")
    logger.info(f"Min LR: {min_lr}")
    logger.info(f"Gradient Accumulation Steps: {gradient_accumulation_steps}")
    logger.info(f"Effective Batch Size: {actual_batch_size * gradient_accumulation_steps}")
    print_gpu_memory("at start of training")
    # Free cached memory and enable cuDNN autotuning before the training loop
    torch.cuda.empty_cache()
    torch.backends.cudnn.benchmark = True
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()  # Zero gradients at start of epoch
        for batch_idx, batch in enumerate(train_loader):
            input_batch = batch['input_ids'].to(device)
            target_batch = batch['labels'].to(device)
            # Forward pass
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits, original_loss = model(input_batch, target_batch)
            # Scale loss for gradient accumulation
            scaled_loss = original_loss / gradient_accumulation_steps
            scaled_loss.backward()
            # Add the original (unscaled) loss to total_loss for logging
            total_loss += original_loss.item()
            tokens_seen += input_batch.numel()
            # Calculate running average loss
            total_batches = batch_idx + 1
            avg_loss = total_loss / total_batches
            if batch_idx % 25 == 0:
                logger.info(f"Batch {batch_idx + 1}, Running Avg Loss: {avg_loss:.5f}")
            # Only update weights after accumulating gradients
            if (batch_idx + 1) % gradient_accumulation_steps == 0:
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()  # Update learning rate
                optimizer.zero_grad()
                global_step += 1
                # Evaluation block
                if global_step % eval_freq == 0 and global_step > 0:
                    current_lr = scheduler.get_last_lr()[0]
                    optimizer_lr = optimizer.param_groups[0]['lr']
                    print_gpu_memory(f"at step {global_step}")
                    logger.info(f"learning rate: {current_lr:.8f}")
                    logger.info(f"Ep {epoch+1} (Step {global_step:06d}): "
                                f"Avg loss {avg_loss:.3f} | {tokens_seen} tokens seen")
                    logger.info(f"optimizer lr: {optimizer_lr:.8f}")
                    logger.info(f"scheduler lr: {current_lr:.8f}")
                    # Generate sample text
                    encoded_text = tokenizer.encode(start_context, return_tensors="pt")
                    random_topk = np.random.randint(1, 10)
                    logger.info(f"random_topk: {random_topk}")
                    random_temperature = np.random.uniform(0.7, 0.9)
                    logger.info(f"random_temperature: {random_temperature}")
                    logger.info(f"global step {global_step}, batch_idx {batch_idx} => generating text")
                    generated_text = generate(model,
                                              idx=encoded_text,
                                              max_new_tokens=256,
                                              context_length=256,
                                              temperature=random_temperature,
                                              top_k=random_topk,
                                              eos_token=tokenizer.eos_token_id,
                                              device=device)
                    logger.info("+++" * 30)
                    logger.info(tokenizer.decode(generated_text.squeeze(0)))
                    logger.info("+++" * 30)
                    # Save checkpoint
                    model_file_name = f"model_{global_step}_steps_avg_loss_{avg_loss:.5f}_optimizer_lr_{optimizer_lr:.8f}.pth"
                    torch.save({
                        'step': global_step,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'scheduler_state_dict': scheduler.state_dict(),
                        'loss': avg_loss,
                    }, model_file_name)
                    s3_path = upload_file_to_s3(model_file_name,
                                                config['model']['model_config']['s3_bucket'],
                                                config['model']['model_config']['s3_checkpoint_folder'])
                    logger.info(f"Model saved to S3: {s3_path}")
                    log_path = upload_file_to_s3(config['model']['model_config']['s3_log_file_name'],
                                                 config['model']['model_config']['s3_bucket'],
                                                 config['model']['model_config']['s3_log_folder'])
                    logger.info(f"Log saved to S3: {log_path}")
            if batch_idx % 100 == 0:
                logger.info(f"Batch {batch_idx} finished")
                logger.info("+++" * 30)
    logger.info("Training complete")
| if __name__ == "__main__": | |
| config = yaml.load(open("config_smollm2_135M.yaml", "r"), Loader=yaml.FullLoader) | |
| logger.info(config) | |
| # Set memory efficient settings | |
| torch.set_float32_matmul_precision('high') | |
| torch.backends.cudnn.benchmark = True | |
| torch.backends.cuda.matmul.allow_tf32 = True | |
| # Empty cache before model creation | |
| torch.cuda.empty_cache() | |
| model = LlamaModel(config['model']) | |
| device = 'cuda' if torch.cuda.is_available() else 'cpu' | |
| # Enable gradient checkpointing for memory efficiency | |
| # model.gradient_checkpointing_enable() | |
| model.to(device) | |
| model = torch.compile(model) | |
| logger.info(model) | |
| logger.info("++"*30) | |
| optimizer = torch.optim.AdamW( | |
| model.parameters(), | |
| lr=3e-4, | |
| weight_decay=0.15, | |
| betas=(0.9, 0.95) | |
| ) | |
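    # betas=(0.9, 0.95) and a relatively high weight decay are common choices for transformer
    # language-model pretraining; the base lr of 3e-4 is the max_lr that the scheduler in
    # train_model scales up and down.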
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
    tokenizer.pad_token = tokenizer.eos_token
    vocab_size = tokenizer.vocab_size

    train_loader = load_cosmopedia_dataset(
        batch_size=16,
        seq_length=1024,
        tokenizer=tokenizer
    )

    import time
    t1 = time.time()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Set environment variable for memory allocation
    # (note: this is normally only honored if set before the first CUDA allocation)
    import os
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
    train_model(
        config,
        model,
        train_loader,
        train_loader,  # also passed as test_loader (unused inside train_model)
        optimizer=optimizer,
        device=device,
        num_epochs=1,
        eval_freq=1000,  # Evaluate, sample and checkpoint every 1000 optimizer steps
        eval_iter=1000,
        start_context="Once Upon a Time far far away in a galaxy",
        tokenizer=tokenizer
    )
    t2 = time.time()
    logger.info(f"Time taken for training: {t2 - t1:.2f} seconds")