{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.8930602957906713,
  "eval_steps": 100000,
  "global_step": 5200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 8.332918167114258,
      "learning_rate": 9.999e-07,
      "loss": 0.256,
      "step": 100
    },
    {
      "epoch": 0.07,
      "grad_norm": 6.65649938583374,
      "learning_rate": 9.998e-07,
      "loss": 0.174,
      "step": 200
    },
    {
      "epoch": 0.11,
      "grad_norm": 5.654027462005615,
      "learning_rate": 9.997e-07,
      "loss": 0.1674,
      "step": 300
    },
    {
      "epoch": 0.15,
      "grad_norm": 6.793820858001709,
      "learning_rate": 9.996e-07,
      "loss": 0.1538,
      "step": 400
    },
    {
      "epoch": 0.18,
      "grad_norm": 5.857839107513428,
      "learning_rate": 9.995e-07,
      "loss": 0.1507,
      "step": 500
    },
    {
      "epoch": 0.22,
      "grad_norm": 6.979122638702393,
      "learning_rate": 9.994e-07,
      "loss": 0.1413,
      "step": 600
    },
    {
      "epoch": 0.25,
      "grad_norm": 3.4787421226501465,
      "learning_rate": 9.993e-07,
      "loss": 0.1426,
      "step": 700
    },
    {
      "epoch": 0.29,
      "grad_norm": 5.989820957183838,
      "learning_rate": 9.992e-07,
      "loss": 0.139,
      "step": 800
    },
    {
      "epoch": 0.33,
      "grad_norm": 4.455531597137451,
      "learning_rate": 9.990999999999999e-07,
      "loss": 0.145,
      "step": 900
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.2097902297973633,
      "learning_rate": 9.989999999999999e-07,
      "loss": 0.1349,
      "step": 1000
    },
    {
      "epoch": 0.4,
      "grad_norm": 4.64670467376709,
      "learning_rate": 9.988999999999999e-07,
      "loss": 0.1394,
      "step": 1100
    },
    {
      "epoch": 0.44,
      "grad_norm": 6.975039958953857,
      "learning_rate": 9.988e-07,
      "loss": 0.1346,
      "step": 1200
    },
    {
      "epoch": 0.47,
      "grad_norm": 7.464960098266602,
      "learning_rate": 9.987e-07,
      "loss": 0.1314,
      "step": 1300
    },
    {
      "epoch": 0.51,
      "grad_norm": 4.806921482086182,
      "learning_rate": 9.986e-07,
      "loss": 0.131,
      "step": 1400
    },
    {
      "epoch": 0.55,
      "grad_norm": 6.7189531326293945,
      "learning_rate": 9.985e-07,
      "loss": 0.1262,
      "step": 1500
    },
    {
      "epoch": 0.58,
      "grad_norm": 5.656557559967041,
      "learning_rate": 9.983999999999998e-07,
      "loss": 0.1223,
      "step": 1600
    },
    {
      "epoch": 0.62,
      "grad_norm": 4.686679363250732,
      "learning_rate": 9.982999999999998e-07,
      "loss": 0.1247,
      "step": 1700
    },
    {
      "epoch": 0.66,
      "grad_norm": 5.541558265686035,
      "learning_rate": 9.982e-07,
      "loss": 0.1269,
      "step": 1800
    },
    {
      "epoch": 0.69,
      "grad_norm": 6.699037551879883,
      "learning_rate": 9.981e-07,
      "loss": 0.1212,
      "step": 1900
    },
    {
      "epoch": 0.73,
      "grad_norm": 7.294734001159668,
      "learning_rate": 9.98e-07,
      "loss": 0.1288,
      "step": 2000
    },
    {
      "epoch": 0.76,
      "grad_norm": 6.782406806945801,
      "learning_rate": 9.979e-07,
      "loss": 0.1282,
      "step": 2100
    },
    {
      "epoch": 0.8,
      "grad_norm": 7.686770439147949,
      "learning_rate": 9.978e-07,
      "loss": 0.1252,
      "step": 2200
    },
    {
      "epoch": 0.84,
      "grad_norm": 5.651573181152344,
      "learning_rate": 9.977e-07,
      "loss": 0.128,
      "step": 2300
    },
    {
      "epoch": 0.87,
      "grad_norm": 3.8114349842071533,
      "learning_rate": 9.976e-07,
      "loss": 0.1189,
      "step": 2400
    },
    {
      "epoch": 0.91,
      "grad_norm": 4.4628777503967285,
      "learning_rate": 9.975e-07,
      "loss": 0.1227,
      "step": 2500
    },
    {
      "epoch": 0.95,
      "grad_norm": 3.973808526992798,
      "learning_rate": 9.974e-07,
      "loss": 0.1225,
      "step": 2600
    },
    {
      "epoch": 0.98,
      "grad_norm": 7.216485977172852,
      "learning_rate": 9.973e-07,
      "loss": 0.1171,
      "step": 2700
    },
    {
      "epoch": 1.02,
      "grad_norm": 4.588983058929443,
      "learning_rate": 9.972e-07,
      "loss": 0.1152,
      "step": 2800
    },
    {
      "epoch": 1.06,
      "grad_norm": 2.4740259647369385,
      "learning_rate": 9.971e-07,
      "loss": 0.096,
      "step": 2900
    },
    {
      "epoch": 1.09,
      "grad_norm": 4.7505903244018555,
      "learning_rate": 9.97e-07,
      "loss": 0.0986,
      "step": 3000
    },
    {
      "epoch": 1.13,
      "grad_norm": 5.158182621002197,
      "learning_rate": 9.969e-07,
      "loss": 0.1006,
      "step": 3100
    },
    {
      "epoch": 1.16,
      "grad_norm": 6.2976202964782715,
      "learning_rate": 9.968e-07,
      "loss": 0.0979,
      "step": 3200
    },
    {
      "epoch": 1.2,
      "grad_norm": 5.02843713760376,
      "learning_rate": 9.967e-07,
      "loss": 0.0923,
      "step": 3300
    },
    {
      "epoch": 1.24,
      "grad_norm": 5.816647052764893,
      "learning_rate": 9.966e-07,
      "loss": 0.1005,
      "step": 3400
    },
    {
      "epoch": 1.27,
      "grad_norm": 5.387178421020508,
      "learning_rate": 9.965e-07,
      "loss": 0.0992,
      "step": 3500
    },
    {
      "epoch": 1.31,
      "grad_norm": 5.27618932723999,
      "learning_rate": 9.964e-07,
      "loss": 0.0945,
      "step": 3600
    },
    {
      "epoch": 1.35,
      "grad_norm": 2.8429291248321533,
      "learning_rate": 9.962999999999999e-07,
      "loss": 0.0999,
      "step": 3700
    },
    {
      "epoch": 1.38,
      "grad_norm": 3.8993031978607178,
      "learning_rate": 9.961999999999999e-07,
      "loss": 0.0946,
      "step": 3800
    },
    {
      "epoch": 1.42,
      "grad_norm": 4.275607585906982,
      "learning_rate": 9.960999999999999e-07,
      "loss": 0.0943,
      "step": 3900
    },
    {
      "epoch": 1.46,
      "grad_norm": 4.941762447357178,
      "learning_rate": 9.959999999999999e-07,
      "loss": 0.1011,
      "step": 4000
    },
    {
      "epoch": 1.49,
      "grad_norm": 3.800781726837158,
      "learning_rate": 9.958999999999999e-07,
      "loss": 0.096,
      "step": 4100
    },
    {
      "epoch": 1.53,
      "grad_norm": 3.8520452976226807,
      "learning_rate": 9.958e-07,
      "loss": 0.0986,
      "step": 4200
    },
    {
      "epoch": 1.57,
      "grad_norm": 8.225783348083496,
      "learning_rate": 9.957e-07,
      "loss": 0.0935,
      "step": 4300
    },
    {
      "epoch": 1.6,
      "grad_norm": 3.4622890949249268,
      "learning_rate": 9.956e-07,
      "loss": 0.0964,
      "step": 4400
    },
    {
      "epoch": 1.64,
      "grad_norm": 4.632036209106445,
      "learning_rate": 9.955e-07,
      "loss": 0.0899,
      "step": 4500
    },
    {
      "epoch": 1.67,
      "grad_norm": 4.176944732666016,
      "learning_rate": 9.953999999999998e-07,
      "loss": 0.0947,
      "step": 4600
    },
    {
      "epoch": 1.71,
      "grad_norm": 4.445220947265625,
      "learning_rate": 9.952999999999998e-07,
      "loss": 0.0882,
      "step": 4700
    },
    {
      "epoch": 1.75,
      "grad_norm": 4.21484375,
      "learning_rate": 9.952e-07,
      "loss": 0.0987,
      "step": 4800
    },
    {
      "epoch": 1.78,
      "grad_norm": 7.656230449676514,
      "learning_rate": 9.951e-07,
      "loss": 0.0995,
      "step": 4900
    },
    {
      "epoch": 1.82,
      "grad_norm": 4.51482629776001,
      "learning_rate": 9.95e-07,
      "loss": 0.092,
      "step": 5000
    },
    {
      "epoch": 1.86,
      "grad_norm": 4.313094139099121,
      "learning_rate": 9.949e-07,
      "loss": 0.0935,
      "step": 5100
    },
    {
      "epoch": 1.89,
      "grad_norm": 5.037979602813721,
      "learning_rate": 9.948e-07,
      "loss": 0.0998,
      "step": 5200
    }
  ],
  "logging_steps": 100,
  "max_steps": 1000000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 365,
  "save_steps": 200,
  "total_flos": 3.542617035836621e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}