{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 615,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008146639511201629,
"grad_norm": 0.4904627755196518,
"learning_rate": 0.0,
"loss": 0.0296,
"num_tokens": 468319.0,
"step": 1
},
{
"epoch": 0.016293279022403257,
"grad_norm": 0.44639106202217055,
"learning_rate": 5.263157894736843e-07,
"loss": 0.0278,
"num_tokens": 931744.0,
"step": 2
},
{
"epoch": 0.024439918533604887,
"grad_norm": 0.5235906052705608,
"learning_rate": 1.0526315789473685e-06,
"loss": 0.0346,
"num_tokens": 1382492.0,
"step": 3
},
{
"epoch": 0.032586558044806514,
"grad_norm": 0.48829378532794426,
"learning_rate": 1.5789473684210526e-06,
"loss": 0.0298,
"num_tokens": 1822837.0,
"step": 4
},
{
"epoch": 0.04073319755600815,
"grad_norm": 0.46192310755459265,
"learning_rate": 2.105263157894737e-06,
"loss": 0.0282,
"num_tokens": 2324341.0,
"step": 5
},
{
"epoch": 0.048879837067209775,
"grad_norm": 0.42590423154372875,
"learning_rate": 2.631578947368421e-06,
"loss": 0.0259,
"num_tokens": 2786402.0,
"step": 6
},
{
"epoch": 0.05702647657841141,
"grad_norm": 0.3780878258784539,
"learning_rate": 3.157894736842105e-06,
"loss": 0.0257,
"num_tokens": 3249490.0,
"step": 7
},
{
"epoch": 0.06517311608961303,
"grad_norm": 0.3621520375009199,
"learning_rate": 3.6842105263157896e-06,
"loss": 0.0244,
"num_tokens": 3691588.0,
"step": 8
},
{
"epoch": 0.07331975560081466,
"grad_norm": 0.34785014992590463,
"learning_rate": 4.210526315789474e-06,
"loss": 0.0243,
"num_tokens": 4145266.0,
"step": 9
},
{
"epoch": 0.0814663951120163,
"grad_norm": 0.4866514034362246,
"learning_rate": 4.736842105263158e-06,
"loss": 0.0283,
"num_tokens": 4589804.0,
"step": 10
},
{
"epoch": 0.08961303462321792,
"grad_norm": 0.6175459481284201,
"learning_rate": 5.263157894736842e-06,
"loss": 0.0314,
"num_tokens": 5028241.0,
"step": 11
},
{
"epoch": 0.09775967413441955,
"grad_norm": 0.6496287922379511,
"learning_rate": 5.789473684210527e-06,
"loss": 0.0345,
"num_tokens": 5485301.0,
"step": 12
},
{
"epoch": 0.10590631364562118,
"grad_norm": 0.6359630764639106,
"learning_rate": 6.31578947368421e-06,
"loss": 0.0325,
"num_tokens": 5919605.0,
"step": 13
},
{
"epoch": 0.11405295315682282,
"grad_norm": 0.5165849518665486,
"learning_rate": 6.842105263157896e-06,
"loss": 0.027,
"num_tokens": 6397578.0,
"step": 14
},
{
"epoch": 0.12219959266802444,
"grad_norm": 0.5201523052382824,
"learning_rate": 7.368421052631579e-06,
"loss": 0.034,
"num_tokens": 6859276.0,
"step": 15
},
{
"epoch": 0.13034623217922606,
"grad_norm": 0.46155792727423955,
"learning_rate": 7.894736842105265e-06,
"loss": 0.0301,
"num_tokens": 7339093.0,
"step": 16
},
{
"epoch": 0.1384928716904277,
"grad_norm": 0.42614925852564395,
"learning_rate": 8.421052631578948e-06,
"loss": 0.03,
"num_tokens": 7829130.0,
"step": 17
},
{
"epoch": 0.14663951120162932,
"grad_norm": 0.4234114672689651,
"learning_rate": 8.947368421052632e-06,
"loss": 0.0318,
"num_tokens": 8284433.0,
"step": 18
},
{
"epoch": 0.15478615071283094,
"grad_norm": 0.4251114371201452,
"learning_rate": 9.473684210526315e-06,
"loss": 0.0303,
"num_tokens": 8720891.0,
"step": 19
},
{
"epoch": 0.1629327902240326,
"grad_norm": 0.40955870197241373,
"learning_rate": 1e-05,
"loss": 0.0297,
"num_tokens": 9262899.0,
"step": 20
},
{
"epoch": 0.1710794297352342,
"grad_norm": 0.4087963920152161,
"learning_rate": 9.999937484351817e-06,
"loss": 0.0306,
"num_tokens": 9716643.0,
"step": 21
},
{
"epoch": 0.17922606924643583,
"grad_norm": 0.3693398530680141,
"learning_rate": 9.999749939144244e-06,
"loss": 0.0298,
"num_tokens": 10180183.0,
"step": 22
},
{
"epoch": 0.18737270875763748,
"grad_norm": 0.41806848178827605,
"learning_rate": 9.99943736958818e-06,
"loss": 0.0321,
"num_tokens": 10631870.0,
"step": 23
},
{
"epoch": 0.1955193482688391,
"grad_norm": 0.3809861049653307,
"learning_rate": 9.998999784368282e-06,
"loss": 0.0309,
"num_tokens": 11100352.0,
"step": 24
},
{
"epoch": 0.20366598778004075,
"grad_norm": 0.39880645680339244,
"learning_rate": 9.99843719564274e-06,
"loss": 0.0352,
"num_tokens": 11561399.0,
"step": 25
},
{
"epoch": 0.21181262729124237,
"grad_norm": 0.35747673914837313,
"learning_rate": 9.997749619042932e-06,
"loss": 0.0302,
"num_tokens": 12052978.0,
"step": 26
},
{
"epoch": 0.219959266802444,
"grad_norm": 0.3646974156800663,
"learning_rate": 9.996937073672988e-06,
"loss": 0.0326,
"num_tokens": 12510505.0,
"step": 27
},
{
"epoch": 0.22810590631364563,
"grad_norm": 0.37781095340314147,
"learning_rate": 9.995999582109266e-06,
"loss": 0.0329,
"num_tokens": 12972726.0,
"step": 28
},
{
"epoch": 0.23625254582484725,
"grad_norm": 0.32601449306874525,
"learning_rate": 9.994937170399715e-06,
"loss": 0.0338,
"num_tokens": 13415015.0,
"step": 29
},
{
"epoch": 0.24439918533604887,
"grad_norm": 0.3124719848123855,
"learning_rate": 9.993749868063162e-06,
"loss": 0.0321,
"num_tokens": 13862924.0,
"step": 30
},
{
"epoch": 0.2525458248472505,
"grad_norm": 0.33624412669801873,
"learning_rate": 9.992437708088487e-06,
"loss": 0.0343,
"num_tokens": 14336744.0,
"step": 31
},
{
"epoch": 0.2606924643584521,
"grad_norm": 0.2822892406548645,
"learning_rate": 9.991000726933702e-06,
"loss": 0.0317,
"num_tokens": 14787461.0,
"step": 32
},
{
"epoch": 0.26883910386965376,
"grad_norm": 0.3141509965138737,
"learning_rate": 9.989438964524943e-06,
"loss": 0.0348,
"num_tokens": 15260166.0,
"step": 33
},
{
"epoch": 0.2769857433808554,
"grad_norm": 0.28289872142233236,
"learning_rate": 9.987752464255365e-06,
"loss": 0.0328,
"num_tokens": 15716455.0,
"step": 34
},
{
"epoch": 0.285132382892057,
"grad_norm": 0.2988023355786341,
"learning_rate": 9.98594127298392e-06,
"loss": 0.0327,
"num_tokens": 16208303.0,
"step": 35
},
{
"epoch": 0.29327902240325865,
"grad_norm": 0.2827377533678326,
"learning_rate": 9.984005441034079e-06,
"loss": 0.0316,
"num_tokens": 16661734.0,
"step": 36
},
{
"epoch": 0.3014256619144603,
"grad_norm": 0.3277891610784485,
"learning_rate": 9.981945022192412e-06,
"loss": 0.0363,
"num_tokens": 17117668.0,
"step": 37
},
{
"epoch": 0.3095723014256619,
"grad_norm": 0.29741031623668746,
"learning_rate": 9.979760073707106e-06,
"loss": 0.0322,
"num_tokens": 17568922.0,
"step": 38
},
{
"epoch": 0.31771894093686354,
"grad_norm": 0.28575184625841077,
"learning_rate": 9.977450656286371e-06,
"loss": 0.0317,
"num_tokens": 18032936.0,
"step": 39
},
{
"epoch": 0.3258655804480652,
"grad_norm": 0.28459652313484274,
"learning_rate": 9.97501683409675e-06,
"loss": 0.0334,
"num_tokens": 18462483.0,
"step": 40
},
{
"epoch": 0.3340122199592668,
"grad_norm": 0.2853320257498706,
"learning_rate": 9.972458674761347e-06,
"loss": 0.0325,
"num_tokens": 18918154.0,
"step": 41
},
{
"epoch": 0.3421588594704684,
"grad_norm": 0.31245376505929573,
"learning_rate": 9.96977624935793e-06,
"loss": 0.0356,
"num_tokens": 19392456.0,
"step": 42
},
{
"epoch": 0.35030549898167007,
"grad_norm": 0.29339121491288905,
"learning_rate": 9.96696963241697e-06,
"loss": 0.0358,
"num_tokens": 19864410.0,
"step": 43
},
{
"epoch": 0.35845213849287166,
"grad_norm": 0.308001575808903,
"learning_rate": 9.964038901919573e-06,
"loss": 0.0344,
"num_tokens": 20325616.0,
"step": 44
},
{
"epoch": 0.3665987780040733,
"grad_norm": 0.29777121599268264,
"learning_rate": 9.9609841392953e-06,
"loss": 0.0361,
"num_tokens": 20754956.0,
"step": 45
},
{
"epoch": 0.37474541751527496,
"grad_norm": 0.27446985734348617,
"learning_rate": 9.95780542941991e-06,
"loss": 0.0367,
"num_tokens": 21197697.0,
"step": 46
},
{
"epoch": 0.38289205702647655,
"grad_norm": 0.2723208448567585,
"learning_rate": 9.954502860613011e-06,
"loss": 0.0355,
"num_tokens": 21644714.0,
"step": 47
},
{
"epoch": 0.3910386965376782,
"grad_norm": 0.34829072831093,
"learning_rate": 9.951076524635593e-06,
"loss": 0.0343,
"num_tokens": 22094029.0,
"step": 48
},
{
"epoch": 0.39918533604887985,
"grad_norm": 0.2633667374393046,
"learning_rate": 9.947526516687484e-06,
"loss": 0.0342,
"num_tokens": 22577438.0,
"step": 49
},
{
"epoch": 0.4073319755600815,
"grad_norm": 0.2781504189612014,
"learning_rate": 9.943852935404706e-06,
"loss": 0.0356,
"num_tokens": 23046436.0,
"step": 50
},
{
"epoch": 0.4154786150712831,
"grad_norm": 0.29581469873784194,
"learning_rate": 9.940055882856734e-06,
"loss": 0.038,
"num_tokens": 23498243.0,
"step": 51
},
{
"epoch": 0.42362525458248473,
"grad_norm": 0.2656899667965322,
"learning_rate": 9.936135464543652e-06,
"loss": 0.0347,
"num_tokens": 23972330.0,
"step": 52
},
{
"epoch": 0.4317718940936864,
"grad_norm": 0.2543418233162407,
"learning_rate": 9.93209178939324e-06,
"loss": 0.0341,
"num_tokens": 24453685.0,
"step": 53
},
{
"epoch": 0.439918533604888,
"grad_norm": 0.25163009959008703,
"learning_rate": 9.927924969757926e-06,
"loss": 0.034,
"num_tokens": 24926242.0,
"step": 54
},
{
"epoch": 0.4480651731160896,
"grad_norm": 0.2530048416696052,
"learning_rate": 9.923635121411683e-06,
"loss": 0.0341,
"num_tokens": 25365241.0,
"step": 55
},
{
"epoch": 0.45621181262729127,
"grad_norm": 0.2591530319599859,
"learning_rate": 9.919222363546797e-06,
"loss": 0.0353,
"num_tokens": 25833971.0,
"step": 56
},
{
"epoch": 0.46435845213849286,
"grad_norm": 0.23005642120058867,
"learning_rate": 9.914686818770567e-06,
"loss": 0.0328,
"num_tokens": 26279628.0,
"step": 57
},
{
"epoch": 0.4725050916496945,
"grad_norm": 0.2612401425726277,
"learning_rate": 9.910028613101888e-06,
"loss": 0.0343,
"num_tokens": 26734776.0,
"step": 58
},
{
"epoch": 0.48065173116089616,
"grad_norm": 0.25501336518012946,
"learning_rate": 9.905247875967764e-06,
"loss": 0.035,
"num_tokens": 27206001.0,
"step": 59
},
{
"epoch": 0.48879837067209775,
"grad_norm": 0.25907516477795234,
"learning_rate": 9.900344740199691e-06,
"loss": 0.0342,
"num_tokens": 27647448.0,
"step": 60
},
{
"epoch": 0.4969450101832994,
"grad_norm": 0.2627756492187737,
"learning_rate": 9.895319342029992e-06,
"loss": 0.0352,
"num_tokens": 28116087.0,
"step": 61
},
{
"epoch": 0.505091649694501,
"grad_norm": 0.2520744974011735,
"learning_rate": 9.890171821088006e-06,
"loss": 0.034,
"num_tokens": 28556029.0,
"step": 62
},
{
"epoch": 0.5132382892057027,
"grad_norm": 0.25566988242695377,
"learning_rate": 9.884902320396228e-06,
"loss": 0.0345,
"num_tokens": 29003546.0,
"step": 63
},
{
"epoch": 0.5213849287169042,
"grad_norm": 0.26761657061201327,
"learning_rate": 9.879510986366321e-06,
"loss": 0.0386,
"num_tokens": 29464833.0,
"step": 64
},
{
"epoch": 0.5295315682281059,
"grad_norm": 0.25151679573138824,
"learning_rate": 9.873997968795066e-06,
"loss": 0.0361,
"num_tokens": 29908906.0,
"step": 65
},
{
"epoch": 0.5376782077393075,
"grad_norm": 0.25192725491977325,
"learning_rate": 9.868363420860176e-06,
"loss": 0.0363,
"num_tokens": 30339618.0,
"step": 66
},
{
"epoch": 0.5458248472505092,
"grad_norm": 0.2558097074022343,
"learning_rate": 9.86260749911606e-06,
"loss": 0.0359,
"num_tokens": 30798302.0,
"step": 67
},
{
"epoch": 0.5539714867617108,
"grad_norm": 0.23903896250926235,
"learning_rate": 9.856730363489465e-06,
"loss": 0.0321,
"num_tokens": 31270382.0,
"step": 68
},
{
"epoch": 0.5621181262729125,
"grad_norm": 0.23678636099022307,
"learning_rate": 9.85073217727503e-06,
"loss": 0.0332,
"num_tokens": 31743990.0,
"step": 69
},
{
"epoch": 0.570264765784114,
"grad_norm": 0.2718182538363666,
"learning_rate": 9.844613107130758e-06,
"loss": 0.0377,
"num_tokens": 32188589.0,
"step": 70
},
{
"epoch": 0.5784114052953157,
"grad_norm": 0.2447806090135222,
"learning_rate": 9.838373323073376e-06,
"loss": 0.0335,
"num_tokens": 32654341.0,
"step": 71
},
{
"epoch": 0.5865580448065173,
"grad_norm": 0.24791886655928558,
"learning_rate": 9.832012998473612e-06,
"loss": 0.0357,
"num_tokens": 33133443.0,
"step": 72
},
{
"epoch": 0.594704684317719,
"grad_norm": 0.2602111918495323,
"learning_rate": 9.825532310051383e-06,
"loss": 0.0369,
"num_tokens": 33600590.0,
"step": 73
},
{
"epoch": 0.6028513238289206,
"grad_norm": 0.23958876317959238,
"learning_rate": 9.818931437870888e-06,
"loss": 0.0347,
"num_tokens": 34081907.0,
"step": 74
},
{
"epoch": 0.6109979633401222,
"grad_norm": 0.24528240723597736,
"learning_rate": 9.812210565335591e-06,
"loss": 0.0347,
"num_tokens": 34528542.0,
"step": 75
},
{
"epoch": 0.6191446028513238,
"grad_norm": 0.2511113811601625,
"learning_rate": 9.805369879183143e-06,
"loss": 0.0358,
"num_tokens": 34984490.0,
"step": 76
},
{
"epoch": 0.6272912423625254,
"grad_norm": 0.23964291648975655,
"learning_rate": 9.798409569480171e-06,
"loss": 0.0368,
"num_tokens": 35438413.0,
"step": 77
},
{
"epoch": 0.6354378818737271,
"grad_norm": 0.22854430928208863,
"learning_rate": 9.791329829617025e-06,
"loss": 0.0329,
"num_tokens": 35861862.0,
"step": 78
},
{
"epoch": 0.6435845213849287,
"grad_norm": 0.25934229180134305,
"learning_rate": 9.784130856302383e-06,
"loss": 0.0352,
"num_tokens": 36334726.0,
"step": 79
},
{
"epoch": 0.6517311608961304,
"grad_norm": 0.249853867356781,
"learning_rate": 9.77681284955779e-06,
"loss": 0.0334,
"num_tokens": 36806966.0,
"step": 80
},
{
"epoch": 0.659877800407332,
"grad_norm": 0.24228111972158922,
"learning_rate": 9.769376012712107e-06,
"loss": 0.0355,
"num_tokens": 37255978.0,
"step": 81
},
{
"epoch": 0.6680244399185336,
"grad_norm": 0.24656941383849604,
"learning_rate": 9.761820552395857e-06,
"loss": 0.0372,
"num_tokens": 37695349.0,
"step": 82
},
{
"epoch": 0.6761710794297352,
"grad_norm": 0.24557463844035055,
"learning_rate": 9.754146678535483e-06,
"loss": 0.0364,
"num_tokens": 38137196.0,
"step": 83
},
{
"epoch": 0.6843177189409368,
"grad_norm": 0.25045832824836683,
"learning_rate": 9.74635460434752e-06,
"loss": 0.036,
"num_tokens": 38601156.0,
"step": 84
},
{
"epoch": 0.6924643584521385,
"grad_norm": 0.23961222253413397,
"learning_rate": 9.738444546332663e-06,
"loss": 0.0348,
"num_tokens": 39098917.0,
"step": 85
},
{
"epoch": 0.7006109979633401,
"grad_norm": 0.21623543203559747,
"learning_rate": 9.73041672426976e-06,
"loss": 0.0313,
"num_tokens": 39589476.0,
"step": 86
},
{
"epoch": 0.7087576374745418,
"grad_norm": 0.2454384444263673,
"learning_rate": 9.722271361209698e-06,
"loss": 0.035,
"num_tokens": 40040757.0,
"step": 87
},
{
"epoch": 0.7169042769857433,
"grad_norm": 0.2514790044121715,
"learning_rate": 9.714008683469212e-06,
"loss": 0.035,
"num_tokens": 40503981.0,
"step": 88
},
{
"epoch": 0.725050916496945,
"grad_norm": 0.2574428715510541,
"learning_rate": 9.705628920624592e-06,
"loss": 0.0365,
"num_tokens": 40969365.0,
"step": 89
},
{
"epoch": 0.7331975560081466,
"grad_norm": 0.25017040048121353,
"learning_rate": 9.69713230550531e-06,
"loss": 0.0349,
"num_tokens": 41427533.0,
"step": 90
},
{
"epoch": 0.7413441955193483,
"grad_norm": 0.2526246003424556,
"learning_rate": 9.68851907418754e-06,
"loss": 0.0385,
"num_tokens": 41894302.0,
"step": 91
},
{
"epoch": 0.7494908350305499,
"grad_norm": 0.2461082056251613,
"learning_rate": 9.679789465987614e-06,
"loss": 0.0357,
"num_tokens": 42349463.0,
"step": 92
},
{
"epoch": 0.7576374745417516,
"grad_norm": 0.2617726018040813,
"learning_rate": 9.67094372345536e-06,
"loss": 0.0389,
"num_tokens": 42774515.0,
"step": 93
},
{
"epoch": 0.7657841140529531,
"grad_norm": 0.24705231631404728,
"learning_rate": 9.661982092367366e-06,
"loss": 0.036,
"num_tokens": 43230624.0,
"step": 94
},
{
"epoch": 0.7739307535641547,
"grad_norm": 0.235679439367168,
"learning_rate": 9.652904821720158e-06,
"loss": 0.0365,
"num_tokens": 43672523.0,
"step": 95
},
{
"epoch": 0.7820773930753564,
"grad_norm": 0.2510768490849978,
"learning_rate": 9.643712163723271e-06,
"loss": 0.0377,
"num_tokens": 44158995.0,
"step": 96
},
{
"epoch": 0.790224032586558,
"grad_norm": 0.2533074838565773,
"learning_rate": 9.63440437379225e-06,
"loss": 0.0376,
"num_tokens": 44636347.0,
"step": 97
},
{
"epoch": 0.7983706720977597,
"grad_norm": 0.23715260979777855,
"learning_rate": 9.624981710541548e-06,
"loss": 0.0356,
"num_tokens": 45086574.0,
"step": 98
},
{
"epoch": 0.8065173116089613,
"grad_norm": 0.23369067636824356,
"learning_rate": 9.615444435777343e-06,
"loss": 0.0357,
"num_tokens": 45541713.0,
"step": 99
},
{
"epoch": 0.814663951120163,
"grad_norm": 0.22571635640078413,
"learning_rate": 9.605792814490263e-06,
"loss": 0.0348,
"num_tokens": 46007566.0,
"step": 100
},
{
"epoch": 0.8228105906313645,
"grad_norm": 0.23077275204681233,
"learning_rate": 9.596027114848025e-06,
"loss": 0.0345,
"num_tokens": 46477746.0,
"step": 101
},
{
"epoch": 0.8309572301425662,
"grad_norm": 0.22566558819394333,
"learning_rate": 9.58614760818798e-06,
"loss": 0.0338,
"num_tokens": 46929999.0,
"step": 102
},
{
"epoch": 0.8391038696537678,
"grad_norm": 0.21695625400644095,
"learning_rate": 9.57615456900958e-06,
"loss": 0.0347,
"num_tokens": 47395766.0,
"step": 103
},
{
"epoch": 0.8472505091649695,
"grad_norm": 0.2620473147070263,
"learning_rate": 9.566048274966745e-06,
"loss": 0.0383,
"num_tokens": 47845971.0,
"step": 104
},
{
"epoch": 0.8553971486761711,
"grad_norm": 0.2410799135804227,
"learning_rate": 9.55582900686015e-06,
"loss": 0.0365,
"num_tokens": 48287919.0,
"step": 105
},
{
"epoch": 0.8635437881873728,
"grad_norm": 0.2396885428184001,
"learning_rate": 9.545497048629427e-06,
"loss": 0.0348,
"num_tokens": 48749479.0,
"step": 106
},
{
"epoch": 0.8716904276985743,
"grad_norm": 0.24688227687368633,
"learning_rate": 9.535052687345273e-06,
"loss": 0.0387,
"num_tokens": 49192411.0,
"step": 107
},
{
"epoch": 0.879837067209776,
"grad_norm": 0.24294158661413467,
"learning_rate": 9.524496213201473e-06,
"loss": 0.0378,
"num_tokens": 49653484.0,
"step": 108
},
{
"epoch": 0.8879837067209776,
"grad_norm": 0.2405101629778957,
"learning_rate": 9.513827919506835e-06,
"loss": 0.0363,
"num_tokens": 50112406.0,
"step": 109
},
{
"epoch": 0.8961303462321792,
"grad_norm": 0.23181354337095814,
"learning_rate": 9.503048102677048e-06,
"loss": 0.0349,
"num_tokens": 50574830.0,
"step": 110
},
{
"epoch": 0.9042769857433809,
"grad_norm": 0.23382747597194983,
"learning_rate": 9.492157062226438e-06,
"loss": 0.0341,
"num_tokens": 51043765.0,
"step": 111
},
{
"epoch": 0.9124236252545825,
"grad_norm": 0.22729966362083456,
"learning_rate": 9.481155100759651e-06,
"loss": 0.0345,
"num_tokens": 51491061.0,
"step": 112
},
{
"epoch": 0.9205702647657841,
"grad_norm": 0.24513175538240015,
"learning_rate": 9.470042523963243e-06,
"loss": 0.039,
"num_tokens": 51927088.0,
"step": 113
},
{
"epoch": 0.9287169042769857,
"grad_norm": 0.24880865741998745,
"learning_rate": 9.458819640597193e-06,
"loss": 0.0379,
"num_tokens": 52372997.0,
"step": 114
},
{
"epoch": 0.9368635437881874,
"grad_norm": 0.2220343898509789,
"learning_rate": 9.447486762486307e-06,
"loss": 0.034,
"num_tokens": 52812484.0,
"step": 115
},
{
"epoch": 0.945010183299389,
"grad_norm": 0.22431667653715365,
"learning_rate": 9.436044204511575e-06,
"loss": 0.0346,
"num_tokens": 53269746.0,
"step": 116
},
{
"epoch": 0.9531568228105907,
"grad_norm": 0.2278604942336719,
"learning_rate": 9.42449228460141e-06,
"loss": 0.0364,
"num_tokens": 53715464.0,
"step": 117
},
{
"epoch": 0.9613034623217923,
"grad_norm": 0.2233927678176066,
"learning_rate": 9.412831323722813e-06,
"loss": 0.0354,
"num_tokens": 54163779.0,
"step": 118
},
{
"epoch": 0.9694501018329938,
"grad_norm": 0.2258095576824266,
"learning_rate": 9.401061645872469e-06,
"loss": 0.0356,
"num_tokens": 54622927.0,
"step": 119
},
{
"epoch": 0.9775967413441955,
"grad_norm": 0.21843742724066828,
"learning_rate": 9.389183578067725e-06,
"loss": 0.0332,
"num_tokens": 55117094.0,
"step": 120
},
{
"epoch": 0.9857433808553971,
"grad_norm": 0.23443623658924626,
"learning_rate": 9.37719745033752e-06,
"loss": 0.0372,
"num_tokens": 55571058.0,
"step": 121
},
{
"epoch": 0.9938900203665988,
"grad_norm": 0.22544218766750995,
"learning_rate": 9.365103595713206e-06,
"loss": 0.0347,
"num_tokens": 56023909.0,
"step": 122
},
{
"epoch": 1.0,
"grad_norm": 0.23694297938614514,
"learning_rate": 9.352902350219298e-06,
"loss": 0.0325,
"num_tokens": 56069607.0,
"step": 123
},
{
"epoch": 1.0,
"eval_loss": 0.041923802345991135,
"eval_num_tokens": 56069607.0,
"eval_runtime": 59.8807,
"eval_samples_per_second": 40.731,
"eval_steps_per_second": 5.093,
"step": 123
},
{
"epoch": 1.0081466395112015,
"grad_norm": 0.18235571925323477,
"learning_rate": 9.34059405286414e-06,
"loss": 0.0242,
"num_tokens": 56508815.0,
"step": 124
},
{
"epoch": 1.0162932790224033,
"grad_norm": 0.19439769536061022,
"learning_rate": 9.32817904563048e-06,
"loss": 0.0234,
"num_tokens": 56965411.0,
"step": 125
},
{
"epoch": 1.0244399185336048,
"grad_norm": 0.1736558218986549,
"learning_rate": 9.315657673465978e-06,
"loss": 0.0225,
"num_tokens": 57414294.0,
"step": 126
},
{
"epoch": 1.0325865580448066,
"grad_norm": 0.19113275019426793,
"learning_rate": 9.303030284273606e-06,
"loss": 0.0225,
"num_tokens": 57877954.0,
"step": 127
},
{
"epoch": 1.0407331975560081,
"grad_norm": 0.19036081030160895,
"learning_rate": 9.290297228901994e-06,
"loss": 0.022,
"num_tokens": 58325030.0,
"step": 128
},
{
"epoch": 1.0488798370672097,
"grad_norm": 0.1984639840701536,
"learning_rate": 9.277458861135684e-06,
"loss": 0.0219,
"num_tokens": 58808552.0,
"step": 129
},
{
"epoch": 1.0570264765784114,
"grad_norm": 0.2043532515942055,
"learning_rate": 9.264515537685289e-06,
"loss": 0.0217,
"num_tokens": 59306149.0,
"step": 130
},
{
"epoch": 1.065173116089613,
"grad_norm": 0.24055798224631966,
"learning_rate": 9.251467618177588e-06,
"loss": 0.0238,
"num_tokens": 59747428.0,
"step": 131
},
{
"epoch": 1.0733197556008147,
"grad_norm": 0.1995629506004207,
"learning_rate": 9.238315465145536e-06,
"loss": 0.0204,
"num_tokens": 60204616.0,
"step": 132
},
{
"epoch": 1.0814663951120163,
"grad_norm": 0.2525900691277178,
"learning_rate": 9.225059444018185e-06,
"loss": 0.0239,
"num_tokens": 60656969.0,
"step": 133
},
{
"epoch": 1.089613034623218,
"grad_norm": 0.2318763327290573,
"learning_rate": 9.21169992311054e-06,
"loss": 0.0218,
"num_tokens": 61138427.0,
"step": 134
},
{
"epoch": 1.0977596741344195,
"grad_norm": 0.24997426536385803,
"learning_rate": 9.198237273613311e-06,
"loss": 0.0249,
"num_tokens": 61577876.0,
"step": 135
},
{
"epoch": 1.105906313645621,
"grad_norm": 0.2271197177471986,
"learning_rate": 9.184671869582617e-06,
"loss": 0.0229,
"num_tokens": 62045028.0,
"step": 136
},
{
"epoch": 1.1140529531568228,
"grad_norm": 0.22400406233634754,
"learning_rate": 9.17100408792958e-06,
"loss": 0.0207,
"num_tokens": 62486192.0,
"step": 137
},
{
"epoch": 1.1221995926680244,
"grad_norm": 0.23845965068678432,
"learning_rate": 9.157234308409859e-06,
"loss": 0.0225,
"num_tokens": 62956027.0,
"step": 138
},
{
"epoch": 1.1303462321792261,
"grad_norm": 0.2106619550266949,
"learning_rate": 9.14336291361309e-06,
"loss": 0.0213,
"num_tokens": 63414690.0,
"step": 139
},
{
"epoch": 1.1384928716904277,
"grad_norm": 0.2187838784331104,
"learning_rate": 9.129390288952273e-06,
"loss": 0.0228,
"num_tokens": 63863726.0,
"step": 140
},
{
"epoch": 1.1466395112016294,
"grad_norm": 0.19858994365963545,
"learning_rate": 9.115316822653043e-06,
"loss": 0.0203,
"num_tokens": 64339457.0,
"step": 141
},
{
"epoch": 1.154786150712831,
"grad_norm": 0.2091640165384244,
"learning_rate": 9.101142905742898e-06,
"loss": 0.0224,
"num_tokens": 64797748.0,
"step": 142
},
{
"epoch": 1.1629327902240325,
"grad_norm": 0.21848028557367125,
"learning_rate": 9.086868932040327e-06,
"loss": 0.0237,
"num_tokens": 65261816.0,
"step": 143
},
{
"epoch": 1.1710794297352343,
"grad_norm": 0.21024580943693202,
"learning_rate": 9.072495298143876e-06,
"loss": 0.0222,
"num_tokens": 65710096.0,
"step": 144
},
{
"epoch": 1.1792260692464358,
"grad_norm": 0.21112539693299767,
"learning_rate": 9.058022403421112e-06,
"loss": 0.0234,
"num_tokens": 66180522.0,
"step": 145
},
{
"epoch": 1.1873727087576376,
"grad_norm": 0.20089428024021022,
"learning_rate": 9.043450649997546e-06,
"loss": 0.0221,
"num_tokens": 66643220.0,
"step": 146
},
{
"epoch": 1.195519348268839,
"grad_norm": 0.21269472349968574,
"learning_rate": 9.028780442745452e-06,
"loss": 0.0236,
"num_tokens": 67103696.0,
"step": 147
},
{
"epoch": 1.2036659877800409,
"grad_norm": 0.19919608780198533,
"learning_rate": 9.014012189272612e-06,
"loss": 0.0215,
"num_tokens": 67578752.0,
"step": 148
},
{
"epoch": 1.2118126272912424,
"grad_norm": 0.19516667942695456,
"learning_rate": 8.999146299911001e-06,
"loss": 0.0226,
"num_tokens": 68024730.0,
"step": 149
},
{
"epoch": 1.219959266802444,
"grad_norm": 0.21431471881204775,
"learning_rate": 8.984183187705376e-06,
"loss": 0.0237,
"num_tokens": 68513599.0,
"step": 150
},
{
"epoch": 1.2281059063136457,
"grad_norm": 0.19529826679401555,
"learning_rate": 8.969123268401807e-06,
"loss": 0.0207,
"num_tokens": 68988237.0,
"step": 151
},
{
"epoch": 1.2362525458248472,
"grad_norm": 0.2014146714986417,
"learning_rate": 8.953966960436125e-06,
"loss": 0.0231,
"num_tokens": 69430574.0,
"step": 152
},
{
"epoch": 1.2443991853360488,
"grad_norm": 0.21239498172005217,
"learning_rate": 8.938714684922294e-06,
"loss": 0.0233,
"num_tokens": 69884264.0,
"step": 153
},
{
"epoch": 1.2525458248472505,
"grad_norm": 0.213004792751643,
"learning_rate": 8.923366865640708e-06,
"loss": 0.0237,
"num_tokens": 70361322.0,
"step": 154
},
{
"epoch": 1.260692464358452,
"grad_norm": 0.21940926870270266,
"learning_rate": 8.90792392902642e-06,
"loss": 0.0224,
"num_tokens": 70825081.0,
"step": 155
},
{
"epoch": 1.2688391038696538,
"grad_norm": 0.21496750501528322,
"learning_rate": 8.892386304157297e-06,
"loss": 0.0221,
"num_tokens": 71283936.0,
"step": 156
},
{
"epoch": 1.2769857433808554,
"grad_norm": 0.21230254367904663,
"learning_rate": 8.876754422742084e-06,
"loss": 0.0246,
"num_tokens": 71732305.0,
"step": 157
},
{
"epoch": 1.2851323828920571,
"grad_norm": 0.2188832765541447,
"learning_rate": 8.861028719108431e-06,
"loss": 0.0224,
"num_tokens": 72199220.0,
"step": 158
},
{
"epoch": 1.2932790224032586,
"grad_norm": 0.215744449219536,
"learning_rate": 8.845209630190804e-06,
"loss": 0.0232,
"num_tokens": 72686777.0,
"step": 159
},
{
"epoch": 1.3014256619144602,
"grad_norm": 0.2027419921199597,
"learning_rate": 8.829297595518357e-06,
"loss": 0.0217,
"num_tokens": 73141281.0,
"step": 160
},
{
"epoch": 1.309572301425662,
"grad_norm": 0.1999503892860215,
"learning_rate": 8.81329305720272e-06,
"loss": 0.0236,
"num_tokens": 73601661.0,
"step": 161
},
{
"epoch": 1.3177189409368635,
"grad_norm": 0.17428881801329021,
"learning_rate": 8.797196459925707e-06,
"loss": 0.0205,
"num_tokens": 74058631.0,
"step": 162
},
{
"epoch": 1.3258655804480652,
"grad_norm": 0.18566703149612335,
"learning_rate": 8.78100825092697e-06,
"loss": 0.0218,
"num_tokens": 74506287.0,
"step": 163
},
{
"epoch": 1.3340122199592668,
"grad_norm": 0.21095321978269194,
"learning_rate": 8.764728879991563e-06,
"loss": 0.0233,
"num_tokens": 74961649.0,
"step": 164
},
{
"epoch": 1.3421588594704685,
"grad_norm": 0.19297708933381486,
"learning_rate": 8.748358799437454e-06,
"loss": 0.0218,
"num_tokens": 75451492.0,
"step": 165
},
{
"epoch": 1.35030549898167,
"grad_norm": 0.21567434563638074,
"learning_rate": 8.731898464102955e-06,
"loss": 0.0233,
"num_tokens": 75906898.0,
"step": 166
},
{
"epoch": 1.3584521384928716,
"grad_norm": 0.20493170237350014,
"learning_rate": 8.715348331334079e-06,
"loss": 0.0225,
"num_tokens": 76352518.0,
"step": 167
},
{
"epoch": 1.3665987780040734,
"grad_norm": 0.20033609244286213,
"learning_rate": 8.698708860971837e-06,
"loss": 0.0217,
"num_tokens": 76833416.0,
"step": 168
},
{
"epoch": 1.374745417515275,
"grad_norm": 0.19488013729722037,
"learning_rate": 8.681980515339464e-06,
"loss": 0.0228,
"num_tokens": 77274089.0,
"step": 169
},
{
"epoch": 1.3828920570264764,
"grad_norm": 0.23159956916525645,
"learning_rate": 8.66516375922957e-06,
"loss": 0.026,
"num_tokens": 77722945.0,
"step": 170
},
{
"epoch": 1.3910386965376782,
"grad_norm": 0.20950731710653633,
"learning_rate": 8.648259059891222e-06,
"loss": 0.0238,
"num_tokens": 78165800.0,
"step": 171
},
{
"epoch": 1.39918533604888,
"grad_norm": 0.20384280771126798,
"learning_rate": 8.631266887016973e-06,
"loss": 0.0234,
"num_tokens": 78606210.0,
"step": 172
},
{
"epoch": 1.4073319755600815,
"grad_norm": 0.2174107340618266,
"learning_rate": 8.614187712729801e-06,
"loss": 0.025,
"num_tokens": 79049600.0,
"step": 173
},
{
"epoch": 1.415478615071283,
"grad_norm": 0.21419711356536544,
"learning_rate": 8.597022011569993e-06,
"loss": 0.0236,
"num_tokens": 79519022.0,
"step": 174
},
{
"epoch": 1.4236252545824848,
"grad_norm": 0.19839711735747953,
"learning_rate": 8.579770260481967e-06,
"loss": 0.0233,
"num_tokens": 79988589.0,
"step": 175
},
{
"epoch": 1.4317718940936863,
"grad_norm": 0.20080459279151233,
"learning_rate": 8.56243293880101e-06,
"loss": 0.0222,
"num_tokens": 80468185.0,
"step": 176
},
{
"epoch": 1.4399185336048879,
"grad_norm": 0.1934313717220866,
"learning_rate": 8.545010528239969e-06,
"loss": 0.0236,
"num_tokens": 80915153.0,
"step": 177
},
{
"epoch": 1.4480651731160896,
"grad_norm": 0.20353533737845392,
"learning_rate": 8.527503512875862e-06,
"loss": 0.023,
"num_tokens": 81406072.0,
"step": 178
},
{
"epoch": 1.4562118126272914,
"grad_norm": 0.1913760568401795,
"learning_rate": 8.509912379136429e-06,
"loss": 0.0213,
"num_tokens": 81861174.0,
"step": 179
},
{
"epoch": 1.464358452138493,
"grad_norm": 0.18760725003184955,
"learning_rate": 8.492237615786613e-06,
"loss": 0.0232,
"num_tokens": 82291515.0,
"step": 180
},
{
"epoch": 1.4725050916496945,
"grad_norm": 0.2073497810013695,
"learning_rate": 8.474479713914985e-06,
"loss": 0.0241,
"num_tokens": 82746207.0,
"step": 181
},
{
"epoch": 1.4806517311608962,
"grad_norm": 0.20252547578412558,
"learning_rate": 8.456639166920104e-06,
"loss": 0.023,
"num_tokens": 83217896.0,
"step": 182
},
{
"epoch": 1.4887983706720977,
"grad_norm": 0.19100843841767107,
"learning_rate": 8.438716470496793e-06,
"loss": 0.0234,
"num_tokens": 83673415.0,
"step": 183
},
{
"epoch": 1.4969450101832993,
"grad_norm": 0.18768759409970084,
"learning_rate": 8.42071212262238e-06,
"loss": 0.0217,
"num_tokens": 84165622.0,
"step": 184
},
{
"epoch": 1.505091649694501,
"grad_norm": 0.21146877851004245,
"learning_rate": 8.402626623542853e-06,
"loss": 0.0236,
"num_tokens": 84623691.0,
"step": 185
},
{
"epoch": 1.5132382892057028,
"grad_norm": 0.209209778931465,
"learning_rate": 8.384460475758967e-06,
"loss": 0.0244,
"num_tokens": 85066604.0,
"step": 186
},
{
"epoch": 1.5213849287169041,
"grad_norm": 0.20794230796465518,
"learning_rate": 8.36621418401228e-06,
"loss": 0.0245,
"num_tokens": 85500800.0,
"step": 187
},
{
"epoch": 1.5295315682281059,
"grad_norm": 0.19401787928805586,
"learning_rate": 8.347888255271126e-06,
"loss": 0.0227,
"num_tokens": 85950718.0,
"step": 188
},
{
"epoch": 1.5376782077393076,
"grad_norm": 0.19358587269712685,
"learning_rate": 8.329483198716536e-06,
"loss": 0.0216,
"num_tokens": 86425214.0,
"step": 189
},
{
"epoch": 1.5458248472505092,
"grad_norm": 0.19988901116993596,
"learning_rate": 8.310999525728083e-06,
"loss": 0.0237,
"num_tokens": 86872612.0,
"step": 190
},
{
"epoch": 1.5539714867617107,
"grad_norm": 0.21347868715899784,
"learning_rate": 8.292437749869676e-06,
"loss": 0.0237,
"num_tokens": 87321247.0,
"step": 191
},
{
"epoch": 1.5621181262729125,
"grad_norm": 0.21370368402938023,
"learning_rate": 8.273798386875292e-06,
"loss": 0.0247,
"num_tokens": 87762936.0,
"step": 192
},
{
"epoch": 1.570264765784114,
"grad_norm": 0.20394116229065584,
"learning_rate": 8.255081954634646e-06,
"loss": 0.0224,
"num_tokens": 88233384.0,
"step": 193
},
{
"epoch": 1.5784114052953155,
"grad_norm": 0.21271701085924696,
"learning_rate": 8.236288973178806e-06,
"loss": 0.024,
"num_tokens": 88702888.0,
"step": 194
},
{
"epoch": 1.5865580448065173,
"grad_norm": 0.20525261813526166,
"learning_rate": 8.217419964665728e-06,
"loss": 0.0228,
"num_tokens": 89157902.0,
"step": 195
},
{
"epoch": 1.594704684317719,
"grad_norm": 0.20518578666067122,
"learning_rate": 8.198475453365772e-06,
"loss": 0.0239,
"num_tokens": 89596892.0,
"step": 196
},
{
"epoch": 1.6028513238289206,
"grad_norm": 0.20424504177429212,
"learning_rate": 8.179455965647117e-06,
"loss": 0.024,
"num_tokens": 90043689.0,
"step": 197
},
{
"epoch": 1.6109979633401221,
"grad_norm": 0.21550795243608867,
"learning_rate": 8.16036202996114e-06,
"loss": 0.0255,
"num_tokens": 90493255.0,
"step": 198
},
{
"epoch": 1.6191446028513239,
"grad_norm": 0.21505565048112654,
"learning_rate": 8.141194176827738e-06,
"loss": 0.0246,
"num_tokens": 90933700.0,
"step": 199
},
{
"epoch": 1.6272912423625254,
"grad_norm": 0.18057787414765422,
"learning_rate": 8.12195293882058e-06,
"loss": 0.0205,
"num_tokens": 91402906.0,
"step": 200
},
{
"epoch": 1.635437881873727,
"grad_norm": 0.2128757560225609,
"learning_rate": 8.102638850552323e-06,
"loss": 0.0236,
"num_tokens": 91854715.0,
"step": 201
},
{
"epoch": 1.6435845213849287,
"grad_norm": 0.1977918829414763,
"learning_rate": 8.083252448659742e-06,
"loss": 0.022,
"num_tokens": 92317914.0,
"step": 202
},
{
"epoch": 1.6517311608961305,
"grad_norm": 0.22533293918121253,
"learning_rate": 8.063794271788826e-06,
"loss": 0.0256,
"num_tokens": 92775730.0,
"step": 203
},
{
"epoch": 1.659877800407332,
"grad_norm": 0.21714220334981602,
"learning_rate": 8.044264860579816e-06,
"loss": 0.0255,
"num_tokens": 93222261.0,
"step": 204
},
{
"epoch": 1.6680244399185336,
"grad_norm": 0.2061132581627763,
"learning_rate": 8.02466475765218e-06,
"loss": 0.0229,
"num_tokens": 93713195.0,
"step": 205
},
{
"epoch": 1.6761710794297353,
"grad_norm": 0.21189634915409705,
"learning_rate": 8.004994507589532e-06,
"loss": 0.0244,
"num_tokens": 94167787.0,
"step": 206
},
{
"epoch": 1.6843177189409368,
"grad_norm": 0.2049250916068622,
"learning_rate": 7.985254656924512e-06,
"loss": 0.0227,
"num_tokens": 94634140.0,
"step": 207
},
{
"epoch": 1.6924643584521384,
"grad_norm": 0.21854416704059987,
"learning_rate": 7.965445754123592e-06,
"loss": 0.0252,
"num_tokens": 95093967.0,
"step": 208
},
{
"epoch": 1.7006109979633401,
"grad_norm": 0.20098034036974133,
"learning_rate": 7.945568349571834e-06,
"loss": 0.0233,
"num_tokens": 95578447.0,
"step": 209
},
{
"epoch": 1.708757637474542,
"grad_norm": 0.19707920391781453,
"learning_rate": 7.925622995557609e-06,
"loss": 0.0234,
"num_tokens": 96028708.0,
"step": 210
},
{
"epoch": 1.7169042769857432,
"grad_norm": 0.183646916505621,
"learning_rate": 7.905610246257243e-06,
"loss": 0.0219,
"num_tokens": 96490579.0,
"step": 211
},
{
"epoch": 1.725050916496945,
"grad_norm": 0.21800938479643353,
"learning_rate": 7.885530657719623e-06,
"loss": 0.0245,
"num_tokens": 96939215.0,
"step": 212
},
{
"epoch": 1.7331975560081467,
"grad_norm": 0.17805921063304794,
"learning_rate": 7.865384787850742e-06,
"loss": 0.0207,
"num_tokens": 97416826.0,
"step": 213
},
{
"epoch": 1.7413441955193483,
"grad_norm": 0.20335070394293855,
"learning_rate": 7.845173196398213e-06,
"loss": 0.023,
"num_tokens": 97870409.0,
"step": 214
},
{
"epoch": 1.7494908350305498,
"grad_norm": 0.2014363721260783,
"learning_rate": 7.824896444935692e-06,
"loss": 0.023,
"num_tokens": 98303923.0,
"step": 215
},
{
"epoch": 1.7576374745417516,
"grad_norm": 0.19767917831916373,
"learning_rate": 7.804555096847298e-06,
"loss": 0.0206,
"num_tokens": 98792735.0,
"step": 216
},
{
"epoch": 1.765784114052953,
"grad_norm": 0.18927709030960627,
"learning_rate": 7.784149717311947e-06,
"loss": 0.0228,
"num_tokens": 99283099.0,
"step": 217
},
{
"epoch": 1.7739307535641546,
"grad_norm": 0.19540533688345146,
"learning_rate": 7.763680873287648e-06,
"loss": 0.0224,
"num_tokens": 99728623.0,
"step": 218
},
{
"epoch": 1.7820773930753564,
"grad_norm": 0.2021434762578394,
"learning_rate": 7.743149133495763e-06,
"loss": 0.0226,
"num_tokens": 100217105.0,
"step": 219
},
{
"epoch": 1.7902240325865582,
"grad_norm": 0.20319556075451253,
"learning_rate": 7.722555068405186e-06,
"loss": 0.024,
"num_tokens": 100658986.0,
"step": 220
},
{
"epoch": 1.7983706720977597,
"grad_norm": 0.2037408366987311,
"learning_rate": 7.70189925021651e-06,
"loss": 0.0243,
"num_tokens": 101137134.0,
"step": 221
},
{
"epoch": 1.8065173116089612,
"grad_norm": 0.21058268386430223,
"learning_rate": 7.681182252846115e-06,
"loss": 0.0241,
"num_tokens": 101594654.0,
"step": 222
},
{
"epoch": 1.814663951120163,
"grad_norm": 0.20499883443387898,
"learning_rate": 7.660404651910236e-06,
"loss": 0.0263,
"num_tokens": 102027887.0,
"step": 223
},
{
"epoch": 1.8228105906313645,
"grad_norm": 0.2084551925346071,
"learning_rate": 7.639567024708953e-06,
"loss": 0.0234,
"num_tokens": 102479243.0,
"step": 224
},
{
"epoch": 1.830957230142566,
"grad_norm": 0.21438521035457928,
"learning_rate": 7.6186699502101676e-06,
"loss": 0.0226,
"num_tokens": 102944020.0,
"step": 225
},
{
"epoch": 1.8391038696537678,
"grad_norm": 0.20743883238353383,
"learning_rate": 7.597714009033505e-06,
"loss": 0.0243,
"num_tokens": 103377204.0,
"step": 226
},
{
"epoch": 1.8472505091649696,
"grad_norm": 0.19590114337198036,
"learning_rate": 7.5766997834341836e-06,
"loss": 0.0229,
"num_tokens": 103836520.0,
"step": 227
},
{
"epoch": 1.8553971486761711,
"grad_norm": 0.2072497473244054,
"learning_rate": 7.555627857286843e-06,
"loss": 0.0247,
"num_tokens": 104285481.0,
"step": 228
},
{
"epoch": 1.8635437881873727,
"grad_norm": 0.18899125629327573,
"learning_rate": 7.534498816069315e-06,
"loss": 0.0213,
"num_tokens": 104746152.0,
"step": 229
},
{
"epoch": 1.8716904276985744,
"grad_norm": 0.21687392806104466,
"learning_rate": 7.513313246846357e-06,
"loss": 0.0232,
"num_tokens": 105207211.0,
"step": 230
},
{
"epoch": 1.879837067209776,
"grad_norm": 0.20114168053955322,
"learning_rate": 7.492071738253343e-06,
"loss": 0.0243,
"num_tokens": 105657445.0,
"step": 231
},
{
"epoch": 1.8879837067209775,
"grad_norm": 0.31880562870408674,
"learning_rate": 7.470774880479909e-06,
"loss": 0.0216,
"num_tokens": 106145000.0,
"step": 232
},
{
"epoch": 1.8961303462321792,
"grad_norm": 0.17709039062644658,
"learning_rate": 7.449423265253551e-06,
"loss": 0.0195,
"num_tokens": 106619177.0,
"step": 233
},
{
"epoch": 1.904276985743381,
"grad_norm": 0.1941234160393901,
"learning_rate": 7.428017485823189e-06,
"loss": 0.0221,
"num_tokens": 107100389.0,
"step": 234
},
{
"epoch": 1.9124236252545825,
"grad_norm": 0.21047496416728861,
"learning_rate": 7.406558136942677e-06,
"loss": 0.0253,
"num_tokens": 107531535.0,
"step": 235
},
{
"epoch": 1.920570264765784,
"grad_norm": 0.1811130030622756,
"learning_rate": 7.3850458148542835e-06,
"loss": 0.0218,
"num_tokens": 108000369.0,
"step": 236
},
{
"epoch": 1.9287169042769858,
"grad_norm": 0.18791035767087905,
"learning_rate": 7.363481117272125e-06,
"loss": 0.0217,
"num_tokens": 108465611.0,
"step": 237
},
{
"epoch": 1.9368635437881874,
"grad_norm": 0.174382304685201,
"learning_rate": 7.341864643365557e-06,
"loss": 0.0214,
"num_tokens": 108923767.0,
"step": 238
},
{
"epoch": 1.945010183299389,
"grad_norm": 0.1996921946422325,
"learning_rate": 7.320196993742522e-06,
"loss": 0.023,
"num_tokens": 109367680.0,
"step": 239
},
{
"epoch": 1.9531568228105907,
"grad_norm": 0.21146568012414002,
"learning_rate": 7.29847877043287e-06,
"loss": 0.0231,
"num_tokens": 109818455.0,
"step": 240
},
{
"epoch": 1.9613034623217924,
"grad_norm": 0.20624057045002148,
"learning_rate": 7.2767105768716295e-06,
"loss": 0.024,
"num_tokens": 110268348.0,
"step": 241
},
{
"epoch": 1.9694501018329937,
"grad_norm": 0.17960428006685406,
"learning_rate": 7.254893017882233e-06,
"loss": 0.0222,
"num_tokens": 110696800.0,
"step": 242
},
{
"epoch": 1.9775967413441955,
"grad_norm": 0.18718314902352962,
"learning_rate": 7.233026699659723e-06,
"loss": 0.0226,
"num_tokens": 111154475.0,
"step": 243
},
{
"epoch": 1.9857433808553973,
"grad_norm": 0.18787650373147796,
"learning_rate": 7.211112229753901e-06,
"loss": 0.0213,
"num_tokens": 111620815.0,
"step": 244
},
{
"epoch": 1.9938900203665988,
"grad_norm": 0.19693361518983973,
"learning_rate": 7.189150217052455e-06,
"loss": 0.0216,
"num_tokens": 112092986.0,
"step": 245
},
{
"epoch": 2.0,
"grad_norm": 0.2093105825813619,
"learning_rate": 7.1671412717640295e-06,
"loss": 0.0201,
"num_tokens": 112131036.0,
"step": 246
},
{
"epoch": 2.0,
"eval_loss": 0.04159076511859894,
"eval_num_tokens": 112131036.0,
"eval_runtime": 57.7607,
"eval_samples_per_second": 42.226,
"eval_steps_per_second": 5.28,
"step": 246
},
{
"epoch": 2.0081466395112018,
"grad_norm": 0.12416538079579213,
"learning_rate": 7.145086005401287e-06,
"loss": 0.0126,
"num_tokens": 112602682.0,
"step": 247
},
{
"epoch": 2.016293279022403,
"grad_norm": 0.15057303383190754,
"learning_rate": 7.122985030763901e-06,
"loss": 0.0145,
"num_tokens": 113073432.0,
"step": 248
},
{
"epoch": 2.024439918533605,
"grad_norm": 0.14759632900226355,
"learning_rate": 7.10083896192154e-06,
"loss": 0.0124,
"num_tokens": 113577827.0,
"step": 249
},
{
"epoch": 2.0325865580448066,
"grad_norm": 0.14133368502923574,
"learning_rate": 7.078648414196805e-06,
"loss": 0.0128,
"num_tokens": 114048831.0,
"step": 250
},
{
"epoch": 2.0407331975560083,
"grad_norm": 0.15715348160815634,
"learning_rate": 7.056414004148128e-06,
"loss": 0.0136,
"num_tokens": 114548364.0,
"step": 251
},
{
"epoch": 2.0488798370672097,
"grad_norm": 0.17716027065421572,
"learning_rate": 7.034136349552647e-06,
"loss": 0.016,
"num_tokens": 114999500.0,
"step": 252
},
{
"epoch": 2.0570264765784114,
"grad_norm": 0.1589991111261928,
"learning_rate": 7.011816069389034e-06,
"loss": 0.0145,
"num_tokens": 115456071.0,
"step": 253
},
{
"epoch": 2.065173116089613,
"grad_norm": 0.1543372807006171,
"learning_rate": 6.989453783820304e-06,
"loss": 0.0134,
"num_tokens": 115926758.0,
"step": 254
},
{
"epoch": 2.0733197556008145,
"grad_norm": 0.1691364992847739,
"learning_rate": 6.9670501141765825e-06,
"loss": 0.014,
"num_tokens": 116385952.0,
"step": 255
},
{
"epoch": 2.0814663951120163,
"grad_norm": 0.16602983431455004,
"learning_rate": 6.944605682937834e-06,
"loss": 0.0137,
"num_tokens": 116820035.0,
"step": 256
},
{
"epoch": 2.089613034623218,
"grad_norm": 0.18962015294617535,
"learning_rate": 6.92212111371658e-06,
"loss": 0.0143,
"num_tokens": 117297850.0,
"step": 257
},
{
"epoch": 2.0977596741344193,
"grad_norm": 0.17122221487492462,
"learning_rate": 6.8995970312405615e-06,
"loss": 0.0126,
"num_tokens": 117759960.0,
"step": 258
},
{
"epoch": 2.105906313645621,
"grad_norm": 0.17247701616442646,
"learning_rate": 6.877034061335384e-06,
"loss": 0.0139,
"num_tokens": 118229929.0,
"step": 259
},
{
"epoch": 2.114052953156823,
"grad_norm": 0.18706937438179935,
"learning_rate": 6.854432830907135e-06,
"loss": 0.0151,
"num_tokens": 118689637.0,
"step": 260
},
{
"epoch": 2.1221995926680246,
"grad_norm": 0.1756410658036281,
"learning_rate": 6.831793967924953e-06,
"loss": 0.0136,
"num_tokens": 119159530.0,
"step": 261
},
{
"epoch": 2.130346232179226,
"grad_norm": 0.17325809294266983,
"learning_rate": 6.8091181014035935e-06,
"loss": 0.014,
"num_tokens": 119598302.0,
"step": 262
},
{
"epoch": 2.1384928716904277,
"grad_norm": 0.1831164025049776,
"learning_rate": 6.7864058613859395e-06,
"loss": 0.0138,
"num_tokens": 120108425.0,
"step": 263
},
{
"epoch": 2.1466395112016294,
"grad_norm": 0.18048260933108903,
"learning_rate": 6.763657878925508e-06,
"loss": 0.015,
"num_tokens": 120578186.0,
"step": 264
},
{
"epoch": 2.1547861507128308,
"grad_norm": 0.16327229194519086,
"learning_rate": 6.740874786068906e-06,
"loss": 0.0126,
"num_tokens": 121055989.0,
"step": 265
},
{
"epoch": 2.1629327902240325,
"grad_norm": 0.16958620370157418,
"learning_rate": 6.718057215838274e-06,
"loss": 0.0144,
"num_tokens": 121502528.0,
"step": 266
},
{
"epoch": 2.1710794297352343,
"grad_norm": 0.1792898292473741,
"learning_rate": 6.695205802213699e-06,
"loss": 0.0136,
"num_tokens": 121956627.0,
"step": 267
},
{
"epoch": 2.179226069246436,
"grad_norm": 0.15481947737459167,
"learning_rate": 6.672321180115595e-06,
"loss": 0.0125,
"num_tokens": 122426773.0,
"step": 268
},
{
"epoch": 2.1873727087576373,
"grad_norm": 0.16707758315087737,
"learning_rate": 6.6494039853870676e-06,
"loss": 0.0132,
"num_tokens": 122875336.0,
"step": 269
},
{
"epoch": 2.195519348268839,
"grad_norm": 0.16476693800658634,
"learning_rate": 6.6264548547762395e-06,
"loss": 0.0138,
"num_tokens": 123320079.0,
"step": 270
},
{
"epoch": 2.203665987780041,
"grad_norm": 0.16291392396662507,
"learning_rate": 6.603474425918573e-06,
"loss": 0.0136,
"num_tokens": 123791870.0,
"step": 271
},
{
"epoch": 2.211812627291242,
"grad_norm": 0.1703687751088918,
"learning_rate": 6.580463337319128e-06,
"loss": 0.0133,
"num_tokens": 124260736.0,
"step": 272
},
{
"epoch": 2.219959266802444,
"grad_norm": 0.17901374374090187,
"learning_rate": 6.557422228334852e-06,
"loss": 0.0159,
"num_tokens": 124681807.0,
"step": 273
},
{
"epoch": 2.2281059063136457,
"grad_norm": 0.16798711219930113,
"learning_rate": 6.534351739156797e-06,
"loss": 0.0142,
"num_tokens": 125127429.0,
"step": 274
},
{
"epoch": 2.2362525458248474,
"grad_norm": 0.16305596345258705,
"learning_rate": 6.5112525107923296e-06,
"loss": 0.0135,
"num_tokens": 125567336.0,
"step": 275
},
{
"epoch": 2.2443991853360488,
"grad_norm": 0.17643316822000632,
"learning_rate": 6.488125185047334e-06,
"loss": 0.0147,
"num_tokens": 126021900.0,
"step": 276
},
{
"epoch": 2.2525458248472505,
"grad_norm": 0.167997975045288,
"learning_rate": 6.464970404508369e-06,
"loss": 0.0139,
"num_tokens": 126491133.0,
"step": 277
},
{
"epoch": 2.2606924643584523,
"grad_norm": 0.1808990629197575,
"learning_rate": 6.4417888125248195e-06,
"loss": 0.0153,
"num_tokens": 126949660.0,
"step": 278
},
{
"epoch": 2.2688391038696536,
"grad_norm": 0.18179273918150798,
"learning_rate": 6.418581053191017e-06,
"loss": 0.0155,
"num_tokens": 127395046.0,
"step": 279
},
{
"epoch": 2.2769857433808554,
"grad_norm": 0.16186916571289603,
"learning_rate": 6.39534777132835e-06,
"loss": 0.0141,
"num_tokens": 127879266.0,
"step": 280
},
{
"epoch": 2.285132382892057,
"grad_norm": 0.1687611769820901,
"learning_rate": 6.3720896124673356e-06,
"loss": 0.0142,
"num_tokens": 128345971.0,
"step": 281
},
{
"epoch": 2.293279022403259,
"grad_norm": 0.18415607421229815,
"learning_rate": 6.348807222829704e-06,
"loss": 0.0155,
"num_tokens": 128804402.0,
"step": 282
},
{
"epoch": 2.30142566191446,
"grad_norm": 0.16514691991418554,
"learning_rate": 6.325501249310416e-06,
"loss": 0.0146,
"num_tokens": 129261613.0,
"step": 283
},
{
"epoch": 2.309572301425662,
"grad_norm": 0.16769380960540944,
"learning_rate": 6.302172339459717e-06,
"loss": 0.0136,
"num_tokens": 129748258.0,
"step": 284
},
{
"epoch": 2.3177189409368637,
"grad_norm": 0.17542238238137692,
"learning_rate": 6.278821141465126e-06,
"loss": 0.0147,
"num_tokens": 130203139.0,
"step": 285
},
{
"epoch": 2.325865580448065,
"grad_norm": 0.1703028823912319,
"learning_rate": 6.255448304133435e-06,
"loss": 0.0144,
"num_tokens": 130680052.0,
"step": 286
},
{
"epoch": 2.3340122199592668,
"grad_norm": 0.15875518919149162,
"learning_rate": 6.232054476872674e-06,
"loss": 0.013,
"num_tokens": 131145142.0,
"step": 287
},
{
"epoch": 2.3421588594704685,
"grad_norm": 0.1559999046320083,
"learning_rate": 6.208640309674081e-06,
"loss": 0.0138,
"num_tokens": 131606714.0,
"step": 288
},
{
"epoch": 2.35030549898167,
"grad_norm": 0.16638792870478772,
"learning_rate": 6.185206453094026e-06,
"loss": 0.0133,
"num_tokens": 132070874.0,
"step": 289
},
{
"epoch": 2.3584521384928716,
"grad_norm": 0.16556273278032177,
"learning_rate": 6.161753558235945e-06,
"loss": 0.0144,
"num_tokens": 132523899.0,
"step": 290
},
{
"epoch": 2.3665987780040734,
"grad_norm": 0.1627153835397699,
"learning_rate": 6.138282276732251e-06,
"loss": 0.0141,
"num_tokens": 132984150.0,
"step": 291
},
{
"epoch": 2.374745417515275,
"grad_norm": 0.17420180567604815,
"learning_rate": 6.1147932607262215e-06,
"loss": 0.0153,
"num_tokens": 133423004.0,
"step": 292
},
{
"epoch": 2.3828920570264764,
"grad_norm": 0.1715872000392912,
"learning_rate": 6.091287162853883e-06,
"loss": 0.0143,
"num_tokens": 133885515.0,
"step": 293
},
{
"epoch": 2.391038696537678,
"grad_norm": 0.15875189010502294,
"learning_rate": 6.067764636225881e-06,
"loss": 0.0144,
"num_tokens": 134334800.0,
"step": 294
},
{
"epoch": 2.39918533604888,
"grad_norm": 0.14222500494759646,
"learning_rate": 6.0442263344093224e-06,
"loss": 0.0128,
"num_tokens": 134794203.0,
"step": 295
},
{
"epoch": 2.4073319755600817,
"grad_norm": 0.16587875726539164,
"learning_rate": 6.020672911409626e-06,
"loss": 0.014,
"num_tokens": 135246488.0,
"step": 296
},
{
"epoch": 2.415478615071283,
"grad_norm": 0.16945883731715217,
"learning_rate": 5.997105021652355e-06,
"loss": 0.0142,
"num_tokens": 135734483.0,
"step": 297
},
{
"epoch": 2.423625254582485,
"grad_norm": 0.17856893575225632,
"learning_rate": 5.97352331996502e-06,
"loss": 0.0149,
"num_tokens": 136180989.0,
"step": 298
},
{
"epoch": 2.4317718940936865,
"grad_norm": 0.16901738376392064,
"learning_rate": 5.949928461558894e-06,
"loss": 0.0145,
"num_tokens": 136633463.0,
"step": 299
},
{
"epoch": 2.439918533604888,
"grad_norm": 0.19440740998217734,
"learning_rate": 5.926321102010808e-06,
"loss": 0.0153,
"num_tokens": 137065466.0,
"step": 300
},
{
"epoch": 2.4480651731160896,
"grad_norm": 0.17396280168075312,
"learning_rate": 5.902701897244932e-06,
"loss": 0.014,
"num_tokens": 137519052.0,
"step": 301
},
{
"epoch": 2.4562118126272914,
"grad_norm": 0.1962070967726784,
"learning_rate": 5.879071503514555e-06,
"loss": 0.0167,
"num_tokens": 137969737.0,
"step": 302
},
{
"epoch": 2.4643584521384927,
"grad_norm": 0.17287895679065615,
"learning_rate": 5.855430577383842e-06,
"loss": 0.0148,
"num_tokens": 138433151.0,
"step": 303
},
{
"epoch": 2.4725050916496945,
"grad_norm": 0.19400622443946244,
"learning_rate": 5.831779775709606e-06,
"loss": 0.0148,
"num_tokens": 138875359.0,
"step": 304
},
{
"epoch": 2.480651731160896,
"grad_norm": 0.16969104274852342,
"learning_rate": 5.808119755623045e-06,
"loss": 0.0141,
"num_tokens": 139333435.0,
"step": 305
},
{
"epoch": 2.4887983706720975,
"grad_norm": 0.17975044746142824,
"learning_rate": 5.784451174511486e-06,
"loss": 0.0155,
"num_tokens": 139787251.0,
"step": 306
},
{
"epoch": 2.4969450101832993,
"grad_norm": 0.18637909822915394,
"learning_rate": 5.760774690000128e-06,
"loss": 0.014,
"num_tokens": 140263010.0,
"step": 307
},
{
"epoch": 2.505091649694501,
"grad_norm": 0.1755752695664621,
"learning_rate": 5.7370909599337585e-06,
"loss": 0.0143,
"num_tokens": 140730852.0,
"step": 308
},
{
"epoch": 2.513238289205703,
"grad_norm": 0.17738520787824683,
"learning_rate": 5.713400642358483e-06,
"loss": 0.015,
"num_tokens": 141160459.0,
"step": 309
},
{
"epoch": 2.521384928716904,
"grad_norm": 0.17114933786372763,
"learning_rate": 5.689704395503438e-06,
"loss": 0.0137,
"num_tokens": 141652980.0,
"step": 310
},
{
"epoch": 2.529531568228106,
"grad_norm": 0.1702830061303869,
"learning_rate": 5.666002877762506e-06,
"loss": 0.0153,
"num_tokens": 142092423.0,
"step": 311
},
{
"epoch": 2.5376782077393076,
"grad_norm": 0.17360071510124675,
"learning_rate": 5.642296747676016e-06,
"loss": 0.0145,
"num_tokens": 142533489.0,
"step": 312
},
{
"epoch": 2.5458248472505094,
"grad_norm": 0.1607500590426996,
"learning_rate": 5.618586663912452e-06,
"loss": 0.0133,
"num_tokens": 142991787.0,
"step": 313
},
{
"epoch": 2.5539714867617107,
"grad_norm": 0.16048833714516317,
"learning_rate": 5.594873285250151e-06,
"loss": 0.0136,
"num_tokens": 143468508.0,
"step": 314
},
{
"epoch": 2.5621181262729125,
"grad_norm": 0.1838591156346174,
"learning_rate": 5.571157270558995e-06,
"loss": 0.0163,
"num_tokens": 143916886.0,
"step": 315
},
{
"epoch": 2.5702647657841142,
"grad_norm": 0.162005395980572,
"learning_rate": 5.5474392787821096e-06,
"loss": 0.0135,
"num_tokens": 144388134.0,
"step": 316
},
{
"epoch": 2.5784114052953155,
"grad_norm": 0.14863787001529957,
"learning_rate": 5.52371996891755e-06,
"loss": 0.0132,
"num_tokens": 144871370.0,
"step": 317
},
{
"epoch": 2.5865580448065173,
"grad_norm": 0.16754180761222826,
"learning_rate": 5.500000000000001e-06,
"loss": 0.0146,
"num_tokens": 145320563.0,
"step": 318
},
{
"epoch": 2.594704684317719,
"grad_norm": 0.18005757817722826,
"learning_rate": 5.476280031082451e-06,
"loss": 0.016,
"num_tokens": 145758817.0,
"step": 319
},
{
"epoch": 2.6028513238289204,
"grad_norm": 0.17034049730069928,
"learning_rate": 5.452560721217892e-06,
"loss": 0.0155,
"num_tokens": 146189214.0,
"step": 320
},
{
"epoch": 2.610997963340122,
"grad_norm": 0.15413011304140098,
"learning_rate": 5.428842729441008e-06,
"loss": 0.0144,
"num_tokens": 146640888.0,
"step": 321
},
{
"epoch": 2.619144602851324,
"grad_norm": 0.17242345415805765,
"learning_rate": 5.405126714749852e-06,
"loss": 0.0144,
"num_tokens": 147089993.0,
"step": 322
},
{
"epoch": 2.627291242362525,
"grad_norm": 0.15793761105384327,
"learning_rate": 5.38141333608755e-06,
"loss": 0.0137,
"num_tokens": 147549085.0,
"step": 323
},
{
"epoch": 2.635437881873727,
"grad_norm": 0.15260230173501832,
"learning_rate": 5.357703252323985e-06,
"loss": 0.0127,
"num_tokens": 148018238.0,
"step": 324
},
{
"epoch": 2.6435845213849287,
"grad_norm": 0.17616115019719872,
"learning_rate": 5.333997122237497e-06,
"loss": 0.0142,
"num_tokens": 148467378.0,
"step": 325
},
{
"epoch": 2.6517311608961305,
"grad_norm": 0.16869830739625263,
"learning_rate": 5.310295604496563e-06,
"loss": 0.0145,
"num_tokens": 148924273.0,
"step": 326
},
{
"epoch": 2.6598778004073322,
"grad_norm": 0.1516947132562575,
"learning_rate": 5.286599357641519e-06,
"loss": 0.0132,
"num_tokens": 149394678.0,
"step": 327
},
{
"epoch": 2.6680244399185336,
"grad_norm": 0.1644528806031863,
"learning_rate": 5.262909040066243e-06,
"loss": 0.0138,
"num_tokens": 149841850.0,
"step": 328
},
{
"epoch": 2.6761710794297353,
"grad_norm": 0.1958369178369615,
"learning_rate": 5.239225309999875e-06,
"loss": 0.0156,
"num_tokens": 150282571.0,
"step": 329
},
{
"epoch": 2.684317718940937,
"grad_norm": 0.18244134325631398,
"learning_rate": 5.215548825488514e-06,
"loss": 0.0148,
"num_tokens": 150723879.0,
"step": 330
},
{
"epoch": 2.6924643584521384,
"grad_norm": 0.16252807203895167,
"learning_rate": 5.191880244376957e-06,
"loss": 0.015,
"num_tokens": 151164471.0,
"step": 331
},
{
"epoch": 2.70061099796334,
"grad_norm": 0.16907582749071554,
"learning_rate": 5.168220224290395e-06,
"loss": 0.013,
"num_tokens": 151627236.0,
"step": 332
},
{
"epoch": 2.708757637474542,
"grad_norm": 0.1521247341456988,
"learning_rate": 5.144569422616159e-06,
"loss": 0.0128,
"num_tokens": 152112152.0,
"step": 333
},
{
"epoch": 2.716904276985743,
"grad_norm": 0.15854466142189536,
"learning_rate": 5.120928496485448e-06,
"loss": 0.013,
"num_tokens": 152585932.0,
"step": 334
},
{
"epoch": 2.725050916496945,
"grad_norm": 0.17382518355000084,
"learning_rate": 5.097298102755069e-06,
"loss": 0.0139,
"num_tokens": 153055065.0,
"step": 335
},
{
"epoch": 2.7331975560081467,
"grad_norm": 0.17769717946639274,
"learning_rate": 5.073678897989194e-06,
"loss": 0.0146,
"num_tokens": 153518977.0,
"step": 336
},
{
"epoch": 2.741344195519348,
"grad_norm": 0.16803081279999066,
"learning_rate": 5.050071538441107e-06,
"loss": 0.0138,
"num_tokens": 153976769.0,
"step": 337
},
{
"epoch": 2.74949083503055,
"grad_norm": 0.17280566271506004,
"learning_rate": 5.026476680034983e-06,
"loss": 0.0154,
"num_tokens": 154408635.0,
"step": 338
},
{
"epoch": 2.7576374745417516,
"grad_norm": 0.169314934778943,
"learning_rate": 5.002894978347646e-06,
"loss": 0.0145,
"num_tokens": 154856201.0,
"step": 339
},
{
"epoch": 2.765784114052953,
"grad_norm": 0.18005778973651862,
"learning_rate": 4.979327088590375e-06,
"loss": 0.0144,
"num_tokens": 155310653.0,
"step": 340
},
{
"epoch": 2.7739307535641546,
"grad_norm": 0.16841593789310932,
"learning_rate": 4.95577366559068e-06,
"loss": 0.0136,
"num_tokens": 155788563.0,
"step": 341
},
{
"epoch": 2.7820773930753564,
"grad_norm": 0.17699144806638442,
"learning_rate": 4.932235363774121e-06,
"loss": 0.015,
"num_tokens": 156228468.0,
"step": 342
},
{
"epoch": 2.790224032586558,
"grad_norm": 0.15791462498013234,
"learning_rate": 4.908712837146118e-06,
"loss": 0.014,
"num_tokens": 156664176.0,
"step": 343
},
{
"epoch": 2.79837067209776,
"grad_norm": 0.15227035101116576,
"learning_rate": 4.88520673927378e-06,
"loss": 0.0128,
"num_tokens": 157134252.0,
"step": 344
},
{
"epoch": 2.8065173116089612,
"grad_norm": 0.1594189404919013,
"learning_rate": 4.861717723267752e-06,
"loss": 0.0136,
"num_tokens": 157599805.0,
"step": 345
},
{
"epoch": 2.814663951120163,
"grad_norm": 0.15995395220658057,
"learning_rate": 4.838246441764056e-06,
"loss": 0.0134,
"num_tokens": 158053673.0,
"step": 346
},
{
"epoch": 2.8228105906313647,
"grad_norm": 0.18202172640131933,
"learning_rate": 4.814793546905977e-06,
"loss": 0.0157,
"num_tokens": 158485241.0,
"step": 347
},
{
"epoch": 2.830957230142566,
"grad_norm": 0.15339369785350124,
"learning_rate": 4.791359690325921e-06,
"loss": 0.0123,
"num_tokens": 158947625.0,
"step": 348
},
{
"epoch": 2.839103869653768,
"grad_norm": 0.16788836990713416,
"learning_rate": 4.767945523127327e-06,
"loss": 0.0137,
"num_tokens": 159423146.0,
"step": 349
},
{
"epoch": 2.8472505091649696,
"grad_norm": 0.16165924770039114,
"learning_rate": 4.744551695866567e-06,
"loss": 0.0148,
"num_tokens": 159861129.0,
"step": 350
},
{
"epoch": 2.855397148676171,
"grad_norm": 0.1774028782674121,
"learning_rate": 4.721178858534876e-06,
"loss": 0.0148,
"num_tokens": 160329576.0,
"step": 351
},
{
"epoch": 2.8635437881873727,
"grad_norm": 0.16285854302808034,
"learning_rate": 4.697827660540285e-06,
"loss": 0.014,
"num_tokens": 160797840.0,
"step": 352
},
{
"epoch": 2.8716904276985744,
"grad_norm": 0.18877633260447374,
"learning_rate": 4.674498750689585e-06,
"loss": 0.0147,
"num_tokens": 161243065.0,
"step": 353
},
{
"epoch": 2.8798370672097757,
"grad_norm": 0.1601336012550065,
"learning_rate": 4.651192777170298e-06,
"loss": 0.0143,
"num_tokens": 161699619.0,
"step": 354
},
{
"epoch": 2.8879837067209775,
"grad_norm": 0.16259855324262715,
"learning_rate": 4.627910387532663e-06,
"loss": 0.014,
"num_tokens": 162166184.0,
"step": 355
},
{
"epoch": 2.8961303462321792,
"grad_norm": 0.17610385289208558,
"learning_rate": 4.604652228671653e-06,
"loss": 0.0147,
"num_tokens": 162610492.0,
"step": 356
},
{
"epoch": 2.904276985743381,
"grad_norm": 0.1838783740248808,
"learning_rate": 4.581418946808983e-06,
"loss": 0.0153,
"num_tokens": 163056383.0,
"step": 357
},
{
"epoch": 2.9124236252545828,
"grad_norm": 0.15216837256215965,
"learning_rate": 4.558211187475181e-06,
"loss": 0.0123,
"num_tokens": 163543282.0,
"step": 358
},
{
"epoch": 2.920570264765784,
"grad_norm": 0.15127415963746377,
"learning_rate": 4.535029595491632e-06,
"loss": 0.0125,
"num_tokens": 163999105.0,
"step": 359
},
{
"epoch": 2.928716904276986,
"grad_norm": 0.17498160266417795,
"learning_rate": 4.511874814952668e-06,
"loss": 0.0136,
"num_tokens": 164458000.0,
"step": 360
},
{
"epoch": 2.9368635437881876,
"grad_norm": 0.16423843849992176,
"learning_rate": 4.488747489207672e-06,
"loss": 0.0133,
"num_tokens": 164928642.0,
"step": 361
},
{
"epoch": 2.945010183299389,
"grad_norm": 0.15573818980541582,
"learning_rate": 4.4656482608432054e-06,
"loss": 0.0123,
"num_tokens": 165408976.0,
"step": 362
},
{
"epoch": 2.9531568228105907,
"grad_norm": 0.17543688765013044,
"learning_rate": 4.442577771665147e-06,
"loss": 0.014,
"num_tokens": 165886616.0,
"step": 363
},
{
"epoch": 2.9613034623217924,
"grad_norm": 0.16511980145949437,
"learning_rate": 4.419536662680873e-06,
"loss": 0.0127,
"num_tokens": 166343018.0,
"step": 364
},
{
"epoch": 2.9694501018329937,
"grad_norm": 0.1587473778216488,
"learning_rate": 4.39652557408143e-06,
"loss": 0.0124,
"num_tokens": 166801376.0,
"step": 365
},
{
"epoch": 2.9775967413441955,
"grad_norm": 0.17009893552558653,
"learning_rate": 4.373545145223761e-06,
"loss": 0.0145,
"num_tokens": 167280403.0,
"step": 366
},
{
"epoch": 2.9857433808553973,
"grad_norm": 0.15750436171587542,
"learning_rate": 4.350596014612935e-06,
"loss": 0.0134,
"num_tokens": 167726691.0,
"step": 367
},
{
"epoch": 2.9938900203665986,
"grad_norm": 0.17197192413672613,
"learning_rate": 4.327678819884405e-06,
"loss": 0.0143,
"num_tokens": 168173644.0,
"step": 368
},
{
"epoch": 3.0,
"grad_norm": 0.19850738254699854,
"learning_rate": 4.304794197786304e-06,
"loss": 0.0141,
"num_tokens": 168216390.0,
"step": 369
},
{
"epoch": 3.0,
"eval_loss": 0.041459400206804276,
"eval_num_tokens": 168216390.0,
"eval_runtime": 58.1871,
"eval_samples_per_second": 41.917,
"eval_steps_per_second": 5.242,
"step": 369
},
{
"epoch": 3.0081466395112018,
"grad_norm": 0.12585174258784562,
"learning_rate": 4.281942784161728e-06,
"loss": 0.0099,
"num_tokens": 168660422.0,
"step": 370
},
{
"epoch": 3.016293279022403,
"grad_norm": 0.11765413035696883,
"learning_rate": 4.2591252139310945e-06,
"loss": 0.0082,
"num_tokens": 169121635.0,
"step": 371
},
{
"epoch": 3.024439918533605,
"grad_norm": 0.11700242994990097,
"learning_rate": 4.2363421210744925e-06,
"loss": 0.0083,
"num_tokens": 169588292.0,
"step": 372
},
{
"epoch": 3.0325865580448066,
"grad_norm": 0.13410847188727293,
"learning_rate": 4.213594138614062e-06,
"loss": 0.0097,
"num_tokens": 170048576.0,
"step": 373
},
{
"epoch": 3.0407331975560083,
"grad_norm": 0.11184500956394558,
"learning_rate": 4.190881898596409e-06,
"loss": 0.0079,
"num_tokens": 170553649.0,
"step": 374
},
{
"epoch": 3.0488798370672097,
"grad_norm": 0.12083327220094565,
"learning_rate": 4.168206032075048e-06,
"loss": 0.0086,
"num_tokens": 171011806.0,
"step": 375
},
{
"epoch": 3.0570264765784114,
"grad_norm": 0.13145187085930216,
"learning_rate": 4.1455671690928666e-06,
"loss": 0.009,
"num_tokens": 171488462.0,
"step": 376
},
{
"epoch": 3.065173116089613,
"grad_norm": 0.13334793710473314,
"learning_rate": 4.122965938664616e-06,
"loss": 0.0086,
"num_tokens": 171943130.0,
"step": 377
},
{
"epoch": 3.0733197556008145,
"grad_norm": 0.1332625062123775,
"learning_rate": 4.100402968759441e-06,
"loss": 0.0093,
"num_tokens": 172384061.0,
"step": 378
},
{
"epoch": 3.0814663951120163,
"grad_norm": 0.13147800386811567,
"learning_rate": 4.077878886283422e-06,
"loss": 0.0085,
"num_tokens": 172832702.0,
"step": 379
},
{
"epoch": 3.089613034623218,
"grad_norm": 0.1411078689570707,
"learning_rate": 4.055394317062168e-06,
"loss": 0.0104,
"num_tokens": 173290817.0,
"step": 380
},
{
"epoch": 3.0977596741344193,
"grad_norm": 0.1284905098348191,
"learning_rate": 4.03294988582342e-06,
"loss": 0.0079,
"num_tokens": 173766754.0,
"step": 381
},
{
"epoch": 3.105906313645621,
"grad_norm": 0.13291783263584392,
"learning_rate": 4.010546216179697e-06,
"loss": 0.008,
"num_tokens": 174227586.0,
"step": 382
},
{
"epoch": 3.114052953156823,
"grad_norm": 0.13439803780962148,
"learning_rate": 3.988183930610967e-06,
"loss": 0.0084,
"num_tokens": 174684443.0,
"step": 383
},
{
"epoch": 3.1221995926680246,
"grad_norm": 0.1318097744846226,
"learning_rate": 3.965863650447355e-06,
"loss": 0.0081,
"num_tokens": 175153040.0,
"step": 384
},
{
"epoch": 3.130346232179226,
"grad_norm": 0.14505278918262016,
"learning_rate": 3.943585995851872e-06,
"loss": 0.0088,
"num_tokens": 175616900.0,
"step": 385
},
{
"epoch": 3.1384928716904277,
"grad_norm": 0.143736668078946,
"learning_rate": 3.9213515858031984e-06,
"loss": 0.0085,
"num_tokens": 176098251.0,
"step": 386
},
{
"epoch": 3.1466395112016294,
"grad_norm": 0.13749127082571724,
"learning_rate": 3.8991610380784626e-06,
"loss": 0.0076,
"num_tokens": 176570672.0,
"step": 387
},
{
"epoch": 3.1547861507128308,
"grad_norm": 0.15661494242610496,
"learning_rate": 3.877014969236102e-06,
"loss": 0.0101,
"num_tokens": 177008465.0,
"step": 388
},
{
"epoch": 3.1629327902240325,
"grad_norm": 0.15062683514898298,
"learning_rate": 3.854913994598715e-06,
"loss": 0.0089,
"num_tokens": 177466175.0,
"step": 389
},
{
"epoch": 3.1710794297352343,
"grad_norm": 0.1391922011105707,
"learning_rate": 3.832858728235971e-06,
"loss": 0.0093,
"num_tokens": 177917874.0,
"step": 390
},
{
"epoch": 3.179226069246436,
"grad_norm": 0.1552031660404893,
"learning_rate": 3.8108497829475465e-06,
"loss": 0.0105,
"num_tokens": 178367628.0,
"step": 391
},
{
"epoch": 3.1873727087576373,
"grad_norm": 0.13811754646428342,
"learning_rate": 3.7888877702460992e-06,
"loss": 0.0091,
"num_tokens": 178825445.0,
"step": 392
},
{
"epoch": 3.195519348268839,
"grad_norm": 0.12162345237220032,
"learning_rate": 3.7669733003402775e-06,
"loss": 0.0073,
"num_tokens": 179301109.0,
"step": 393
},
{
"epoch": 3.203665987780041,
"grad_norm": 0.13707719742366498,
"learning_rate": 3.7451069821177677e-06,
"loss": 0.0092,
"num_tokens": 179757593.0,
"step": 394
},
{
"epoch": 3.211812627291242,
"grad_norm": 0.13095735092161556,
"learning_rate": 3.7232894231283724e-06,
"loss": 0.0092,
"num_tokens": 180213993.0,
"step": 395
},
{
"epoch": 3.219959266802444,
"grad_norm": 0.13262472070811615,
"learning_rate": 3.701521229567131e-06,
"loss": 0.0085,
"num_tokens": 180668901.0,
"step": 396
},
{
"epoch": 3.2281059063136457,
"grad_norm": 0.13971045948367564,
"learning_rate": 3.6798030062574807e-06,
"loss": 0.0088,
"num_tokens": 181137029.0,
"step": 397
},
{
"epoch": 3.2362525458248474,
"grad_norm": 0.15719898296312626,
"learning_rate": 3.6581353566344447e-06,
"loss": 0.0091,
"num_tokens": 181583795.0,
"step": 398
},
{
"epoch": 3.2443991853360488,
"grad_norm": 0.13349745981088976,
"learning_rate": 3.6365188827278752e-06,
"loss": 0.0083,
"num_tokens": 182040738.0,
"step": 399
},
{
"epoch": 3.2525458248472505,
"grad_norm": 0.1507228385771512,
"learning_rate": 3.6149541851457183e-06,
"loss": 0.0093,
"num_tokens": 182494412.0,
"step": 400
},
{
"epoch": 3.2606924643584523,
"grad_norm": 0.13598098409095466,
"learning_rate": 3.593441863057325e-06,
"loss": 0.0092,
"num_tokens": 182943146.0,
"step": 401
},
{
"epoch": 3.2688391038696536,
"grad_norm": 0.13606743657097284,
"learning_rate": 3.5719825141768128e-06,
"loss": 0.0092,
"num_tokens": 183393591.0,
"step": 402
},
{
"epoch": 3.2769857433808554,
"grad_norm": 0.14156987679154379,
"learning_rate": 3.5505767347464504e-06,
"loss": 0.009,
"num_tokens": 183862449.0,
"step": 403
},
{
"epoch": 3.285132382892057,
"grad_norm": 0.13512553050700174,
"learning_rate": 3.5292251195200932e-06,
"loss": 0.0093,
"num_tokens": 184305229.0,
"step": 404
},
{
"epoch": 3.293279022403259,
"grad_norm": 0.11472791583197466,
"learning_rate": 3.5079282617466594e-06,
"loss": 0.0078,
"num_tokens": 184802522.0,
"step": 405
},
{
"epoch": 3.30142566191446,
"grad_norm": 0.12789474002800086,
"learning_rate": 3.486686753153645e-06,
"loss": 0.0083,
"num_tokens": 185274960.0,
"step": 406
},
{
"epoch": 3.309572301425662,
"grad_norm": 0.1275610588019882,
"learning_rate": 3.4655011839306866e-06,
"loss": 0.009,
"num_tokens": 185709382.0,
"step": 407
},
{
"epoch": 3.3177189409368637,
"grad_norm": 0.1404980269677411,
"learning_rate": 3.4443721427131593e-06,
"loss": 0.0095,
"num_tokens": 186161144.0,
"step": 408
},
{
"epoch": 3.325865580448065,
"grad_norm": 0.13529566839707055,
"learning_rate": 3.423300216565819e-06,
"loss": 0.0086,
"num_tokens": 186619778.0,
"step": 409
},
{
"epoch": 3.3340122199592668,
"grad_norm": 0.1387178170918977,
"learning_rate": 3.4022859909664957e-06,
"loss": 0.0098,
"num_tokens": 187041856.0,
"step": 410
},
{
"epoch": 3.3421588594704685,
"grad_norm": 0.13789162045155967,
"learning_rate": 3.3813300497898326e-06,
"loss": 0.0083,
"num_tokens": 187505631.0,
"step": 411
},
{
"epoch": 3.35030549898167,
"grad_norm": 0.137718313724877,
"learning_rate": 3.3604329752910468e-06,
"loss": 0.0095,
"num_tokens": 187962839.0,
"step": 412
},
{
"epoch": 3.3584521384928716,
"grad_norm": 0.13226236747300735,
"learning_rate": 3.339595348089767e-06,
"loss": 0.0095,
"num_tokens": 188406846.0,
"step": 413
},
{
"epoch": 3.3665987780040734,
"grad_norm": 0.13283015288873243,
"learning_rate": 3.3188177471538864e-06,
"loss": 0.0088,
"num_tokens": 188859539.0,
"step": 414
},
{
"epoch": 3.374745417515275,
"grad_norm": 0.13902664596528255,
"learning_rate": 3.2981007497834922e-06,
"loss": 0.0085,
"num_tokens": 189323101.0,
"step": 415
},
{
"epoch": 3.3828920570264764,
"grad_norm": 0.13509517554370873,
"learning_rate": 3.2774449315948147e-06,
"loss": 0.0089,
"num_tokens": 189823493.0,
"step": 416
},
{
"epoch": 3.391038696537678,
"grad_norm": 0.1366523338854662,
"learning_rate": 3.2568508665042383e-06,
"loss": 0.0084,
"num_tokens": 190301541.0,
"step": 417
},
{
"epoch": 3.39918533604888,
"grad_norm": 0.125577137562613,
"learning_rate": 3.2363191267123517e-06,
"loss": 0.0072,
"num_tokens": 190798114.0,
"step": 418
},
{
"epoch": 3.4073319755600817,
"grad_norm": 0.14591111241424826,
"learning_rate": 3.215850282688055e-06,
"loss": 0.0098,
"num_tokens": 191261005.0,
"step": 419
},
{
"epoch": 3.415478615071283,
"grad_norm": 0.12604467726858234,
"learning_rate": 3.195444903152703e-06,
"loss": 0.008,
"num_tokens": 191709305.0,
"step": 420
},
{
"epoch": 3.423625254582485,
"grad_norm": 0.13382954324399682,
"learning_rate": 3.1751035550643107e-06,
"loss": 0.0084,
"num_tokens": 192209220.0,
"step": 421
},
{
"epoch": 3.4317718940936865,
"grad_norm": 0.13698395980312603,
"learning_rate": 3.1548268036017904e-06,
"loss": 0.0091,
"num_tokens": 192639412.0,
"step": 422
},
{
"epoch": 3.439918533604888,
"grad_norm": 0.13829425626998468,
"learning_rate": 3.134615212149258e-06,
"loss": 0.0092,
"num_tokens": 193098241.0,
"step": 423
},
{
"epoch": 3.4480651731160896,
"grad_norm": 0.11711892810797479,
"learning_rate": 3.114469342280379e-06,
"loss": 0.0084,
"num_tokens": 193574245.0,
"step": 424
},
{
"epoch": 3.4562118126272914,
"grad_norm": 0.1309214084812048,
"learning_rate": 3.094389753742758e-06,
"loss": 0.0088,
"num_tokens": 194017166.0,
"step": 425
},
{
"epoch": 3.4643584521384927,
"grad_norm": 0.14133229462166405,
"learning_rate": 3.0743770044423936e-06,
"loss": 0.0093,
"num_tokens": 194461022.0,
"step": 426
},
{
"epoch": 3.4725050916496945,
"grad_norm": 0.12915594606644895,
"learning_rate": 3.0544316504281677e-06,
"loss": 0.0084,
"num_tokens": 194921886.0,
"step": 427
},
{
"epoch": 3.480651731160896,
"grad_norm": 0.13019588847393995,
"learning_rate": 3.03455424587641e-06,
"loss": 0.0082,
"num_tokens": 195394552.0,
"step": 428
},
{
"epoch": 3.4887983706720975,
"grad_norm": 0.12493252602627915,
"learning_rate": 3.014745343075488e-06,
"loss": 0.009,
"num_tokens": 195853843.0,
"step": 429
},
{
"epoch": 3.4969450101832993,
"grad_norm": 0.13292973796735513,
"learning_rate": 2.995005492410469e-06,
"loss": 0.0085,
"num_tokens": 196316073.0,
"step": 430
},
{
"epoch": 3.505091649694501,
"grad_norm": 0.15361936626468706,
"learning_rate": 2.975335242347822e-06,
"loss": 0.0097,
"num_tokens": 196747650.0,
"step": 431
},
{
"epoch": 3.513238289205703,
"grad_norm": 0.12126261520512835,
"learning_rate": 2.9557351394201855e-06,
"loss": 0.0078,
"num_tokens": 197222644.0,
"step": 432
},
{
"epoch": 3.521384928716904,
"grad_norm": 0.14364063312304898,
"learning_rate": 2.9362057282111754e-06,
"loss": 0.0084,
"num_tokens": 197703977.0,
"step": 433
},
{
"epoch": 3.529531568228106,
"grad_norm": 0.1285606277274214,
"learning_rate": 2.9167475513402592e-06,
"loss": 0.0085,
"num_tokens": 198159184.0,
"step": 434
},
{
"epoch": 3.5376782077393076,
"grad_norm": 0.12784246623295054,
"learning_rate": 2.897361149447679e-06,
"loss": 0.0086,
"num_tokens": 198611287.0,
"step": 435
},
{
"epoch": 3.5458248472505094,
"grad_norm": 0.1297694309800873,
"learning_rate": 2.878047061179422e-06,
"loss": 0.0082,
"num_tokens": 199069757.0,
"step": 436
},
{
"epoch": 3.5539714867617107,
"grad_norm": 0.1292114725276358,
"learning_rate": 2.858805823172264e-06,
"loss": 0.0088,
"num_tokens": 199540737.0,
"step": 437
},
{
"epoch": 3.5621181262729125,
"grad_norm": 0.12887249746822058,
"learning_rate": 2.839637970038861e-06,
"loss": 0.009,
"num_tokens": 199982367.0,
"step": 438
},
{
"epoch": 3.5702647657841142,
"grad_norm": 0.1280981502556342,
"learning_rate": 2.8205440343528856e-06,
"loss": 0.0089,
"num_tokens": 200427445.0,
"step": 439
},
{
"epoch": 3.5784114052953155,
"grad_norm": 0.13979378072527007,
"learning_rate": 2.8015245466342287e-06,
"loss": 0.0089,
"num_tokens": 200889454.0,
"step": 440
},
{
"epoch": 3.5865580448065173,
"grad_norm": 0.14089395360902868,
"learning_rate": 2.7825800353342734e-06,
"loss": 0.0089,
"num_tokens": 201331340.0,
"step": 441
},
{
"epoch": 3.594704684317719,
"grad_norm": 0.1380485690052255,
"learning_rate": 2.763711026821196e-06,
"loss": 0.0087,
"num_tokens": 201788908.0,
"step": 442
},
{
"epoch": 3.6028513238289204,
"grad_norm": 0.13663809301177426,
"learning_rate": 2.7449180453653544e-06,
"loss": 0.009,
"num_tokens": 202225257.0,
"step": 443
},
{
"epoch": 3.610997963340122,
"grad_norm": 0.1473692732003636,
"learning_rate": 2.72620161312471e-06,
"loss": 0.0085,
"num_tokens": 202692568.0,
"step": 444
},
{
"epoch": 3.619144602851324,
"grad_norm": 0.1257774235275037,
"learning_rate": 2.7075622501303255e-06,
"loss": 0.0093,
"num_tokens": 203149741.0,
"step": 445
},
{
"epoch": 3.627291242362525,
"grad_norm": 0.13888313118631118,
"learning_rate": 2.689000474271918e-06,
"loss": 0.008,
"num_tokens": 203602311.0,
"step": 446
},
{
"epoch": 3.635437881873727,
"grad_norm": 0.15749316142966002,
"learning_rate": 2.670516801283464e-06,
"loss": 0.0108,
"num_tokens": 204036522.0,
"step": 447
},
{
"epoch": 3.6435845213849287,
"grad_norm": 0.12907115857092855,
"learning_rate": 2.652111744728876e-06,
"loss": 0.0084,
"num_tokens": 204486691.0,
"step": 448
},
{
"epoch": 3.6517311608961305,
"grad_norm": 0.13596062968350994,
"learning_rate": 2.6337858159877226e-06,
"loss": 0.0081,
"num_tokens": 204952023.0,
"step": 449
},
{
"epoch": 3.6598778004073322,
"grad_norm": 0.13346166766765533,
"learning_rate": 2.615539524241036e-06,
"loss": 0.0081,
"num_tokens": 205402274.0,
"step": 450
},
{
"epoch": 3.6680244399185336,
"grad_norm": 0.12166137102621093,
"learning_rate": 2.5973733764571486e-06,
"loss": 0.0077,
"num_tokens": 205859233.0,
"step": 451
},
{
"epoch": 3.6761710794297353,
"grad_norm": 0.13150089757352357,
"learning_rate": 2.5792878773776225e-06,
"loss": 0.0086,
"num_tokens": 206314665.0,
"step": 452
},
{
"epoch": 3.684317718940937,
"grad_norm": 0.137357932504932,
"learning_rate": 2.561283529503208e-06,
"loss": 0.0094,
"num_tokens": 206766146.0,
"step": 453
},
{
"epoch": 3.6924643584521384,
"grad_norm": 0.13479268397128444,
"learning_rate": 2.5433608330798974e-06,
"loss": 0.0094,
"num_tokens": 207200864.0,
"step": 454
},
{
"epoch": 3.70061099796334,
"grad_norm": 0.11930317957334262,
"learning_rate": 2.5255202860850157e-06,
"loss": 0.0081,
"num_tokens": 207685884.0,
"step": 455
},
{
"epoch": 3.708757637474542,
"grad_norm": 0.13838840583020326,
"learning_rate": 2.5077623842133895e-06,
"loss": 0.0086,
"num_tokens": 208130253.0,
"step": 456
},
{
"epoch": 3.716904276985743,
"grad_norm": 0.11517621045103824,
"learning_rate": 2.490087620863573e-06,
"loss": 0.0069,
"num_tokens": 208602611.0,
"step": 457
},
{
"epoch": 3.725050916496945,
"grad_norm": 0.13060665615710568,
"learning_rate": 2.4724964871241387e-06,
"loss": 0.0091,
"num_tokens": 209056174.0,
"step": 458
},
{
"epoch": 3.7331975560081467,
"grad_norm": 0.12008096972228131,
"learning_rate": 2.454989471760031e-06,
"loss": 0.0081,
"num_tokens": 209524636.0,
"step": 459
},
{
"epoch": 3.741344195519348,
"grad_norm": 0.12402851148890304,
"learning_rate": 2.437567061198991e-06,
"loss": 0.0079,
"num_tokens": 209994196.0,
"step": 460
},
{
"epoch": 3.74949083503055,
"grad_norm": 0.1386116097689955,
"learning_rate": 2.4202297395180353e-06,
"loss": 0.0086,
"num_tokens": 210466756.0,
"step": 461
},
{
"epoch": 3.7576374745417516,
"grad_norm": 0.13027448435015335,
"learning_rate": 2.4029779884300084e-06,
"loss": 0.0075,
"num_tokens": 210950806.0,
"step": 462
},
{
"epoch": 3.765784114052953,
"grad_norm": 0.13406681887661104,
"learning_rate": 2.3858122872702004e-06,
"loss": 0.0085,
"num_tokens": 211404708.0,
"step": 463
},
{
"epoch": 3.7739307535641546,
"grad_norm": 0.12564076418855794,
"learning_rate": 2.3687331129830276e-06,
"loss": 0.0078,
"num_tokens": 211866245.0,
"step": 464
},
{
"epoch": 3.7820773930753564,
"grad_norm": 0.14390639481706682,
"learning_rate": 2.3517409401087787e-06,
"loss": 0.01,
"num_tokens": 212305458.0,
"step": 465
},
{
"epoch": 3.790224032586558,
"grad_norm": 0.12121296971812623,
"learning_rate": 2.3348362407704313e-06,
"loss": 0.0083,
"num_tokens": 212752369.0,
"step": 466
},
{
"epoch": 3.79837067209776,
"grad_norm": 0.13138683401901344,
"learning_rate": 2.3180194846605367e-06,
"loss": 0.0082,
"num_tokens": 213222393.0,
"step": 467
},
{
"epoch": 3.8065173116089612,
"grad_norm": 0.12175266765217344,
"learning_rate": 2.301291139028164e-06,
"loss": 0.0079,
"num_tokens": 213681819.0,
"step": 468
},
{
"epoch": 3.814663951120163,
"grad_norm": 0.1371049624563703,
"learning_rate": 2.284651668665923e-06,
"loss": 0.0086,
"num_tokens": 214137525.0,
"step": 469
},
{
"epoch": 3.8228105906313647,
"grad_norm": 0.12055815969853237,
"learning_rate": 2.268101535897046e-06,
"loss": 0.0083,
"num_tokens": 214589391.0,
"step": 470
},
{
"epoch": 3.830957230142566,
"grad_norm": 0.14290097179006628,
"learning_rate": 2.2516412005625465e-06,
"loss": 0.0097,
"num_tokens": 215032404.0,
"step": 471
},
{
"epoch": 3.839103869653768,
"grad_norm": 0.12498594646510086,
"learning_rate": 2.235271120008439e-06,
"loss": 0.008,
"num_tokens": 215507249.0,
"step": 472
},
{
"epoch": 3.8472505091649696,
"grad_norm": 0.12951287934361264,
"learning_rate": 2.218991749073032e-06,
"loss": 0.0081,
"num_tokens": 215963900.0,
"step": 473
},
{
"epoch": 3.855397148676171,
"grad_norm": 0.1340360089239412,
"learning_rate": 2.2028035400742946e-06,
"loss": 0.0086,
"num_tokens": 216413372.0,
"step": 474
},
{
"epoch": 3.8635437881873727,
"grad_norm": 0.1428815749513004,
"learning_rate": 2.1867069427972814e-06,
"loss": 0.0091,
"num_tokens": 216874891.0,
"step": 475
},
{
"epoch": 3.8716904276985744,
"grad_norm": 0.15258894927804814,
"learning_rate": 2.1707024044816433e-06,
"loss": 0.01,
"num_tokens": 217335057.0,
"step": 476
},
{
"epoch": 3.8798370672097757,
"grad_norm": 0.11854572238956909,
"learning_rate": 2.1547903698091975e-06,
"loss": 0.0083,
"num_tokens": 217810990.0,
"step": 477
},
{
"epoch": 3.8879837067209775,
"grad_norm": 0.12413149337289436,
"learning_rate": 2.13897128089157e-06,
"loss": 0.0078,
"num_tokens": 218271262.0,
"step": 478
},
{
"epoch": 3.8961303462321792,
"grad_norm": 0.13054286386457706,
"learning_rate": 2.1232455772579164e-06,
"loss": 0.0088,
"num_tokens": 218733996.0,
"step": 479
},
{
"epoch": 3.904276985743381,
"grad_norm": 0.14174809290893123,
"learning_rate": 2.107613695842705e-06,
"loss": 0.0083,
"num_tokens": 219193703.0,
"step": 480
},
{
"epoch": 3.9124236252545828,
"grad_norm": 0.13172558951478341,
"learning_rate": 2.09207607097358e-06,
"loss": 0.0091,
"num_tokens": 219637585.0,
"step": 481
},
{
"epoch": 3.920570264765784,
"grad_norm": 0.12825966468997463,
"learning_rate": 2.0766331343592935e-06,
"loss": 0.0087,
"num_tokens": 220100782.0,
"step": 482
},
{
"epoch": 3.928716904276986,
"grad_norm": 0.11619016881857674,
"learning_rate": 2.0612853150777083e-06,
"loss": 0.0074,
"num_tokens": 220548817.0,
"step": 483
},
{
"epoch": 3.9368635437881876,
"grad_norm": 0.12824773954267013,
"learning_rate": 2.0460330395638754e-06,
"loss": 0.0089,
"num_tokens": 220986452.0,
"step": 484
},
{
"epoch": 3.945010183299389,
"grad_norm": 0.12829553056162407,
"learning_rate": 2.030876731598194e-06,
"loss": 0.0083,
"num_tokens": 221480796.0,
"step": 485
},
{
"epoch": 3.9531568228105907,
"grad_norm": 0.12618210454698364,
"learning_rate": 2.0158168122946254e-06,
"loss": 0.0082,
"num_tokens": 221927605.0,
"step": 486
},
{
"epoch": 3.9613034623217924,
"grad_norm": 0.1351329578005386,
"learning_rate": 2.000853700089001e-06,
"loss": 0.0089,
"num_tokens": 222378433.0,
"step": 487
},
{
"epoch": 3.9694501018329937,
"grad_norm": 0.12217015553170964,
"learning_rate": 1.9859878107273884e-06,
"loss": 0.0083,
"num_tokens": 222829736.0,
"step": 488
},
{
"epoch": 3.9775967413441955,
"grad_norm": 0.132799994524403,
"learning_rate": 1.971219557254548e-06,
"loss": 0.0095,
"num_tokens": 223285731.0,
"step": 489
},
{
"epoch": 3.9857433808553973,
"grad_norm": 0.12499217681603624,
"learning_rate": 1.956549350002454e-06,
"loss": 0.0069,
"num_tokens": 223757013.0,
"step": 490
},
{
"epoch": 3.9938900203665986,
"grad_norm": 0.1140878012499302,
"learning_rate": 1.9419775965788897e-06,
"loss": 0.0073,
"num_tokens": 224255873.0,
"step": 491
},
{
"epoch": 4.0,
"grad_norm": 0.16568246569300987,
"learning_rate": 1.9275047018561265e-06,
"loss": 0.0087,
"num_tokens": 224299419.0,
"step": 492
},
{
"epoch": 4.0,
"eval_loss": 0.04265177622437477,
"eval_num_tokens": 224299419.0,
"eval_runtime": 57.842,
"eval_samples_per_second": 42.167,
"eval_steps_per_second": 5.273,
"step": 492
},
{
"epoch": 4.008146639511201,
"grad_norm": 0.10953124483033916,
"learning_rate": 1.913131067959673e-06,
"loss": 0.0064,
"num_tokens": 224740301.0,
"step": 493
},
{
"epoch": 4.0162932790224035,
"grad_norm": 0.09661175616513212,
"learning_rate": 1.8988570942571039e-06,
"loss": 0.0064,
"num_tokens": 225184668.0,
"step": 494
},
{
"epoch": 4.024439918533605,
"grad_norm": 0.09108581236399259,
"learning_rate": 1.8846831773469587e-06,
"loss": 0.0054,
"num_tokens": 225644004.0,
"step": 495
},
{
"epoch": 4.032586558044806,
"grad_norm": 0.08286087324083283,
"learning_rate": 1.8706097110477298e-06,
"loss": 0.0049,
"num_tokens": 226150717.0,
"step": 496
},
{
"epoch": 4.040733197556008,
"grad_norm": 0.0987575217688521,
"learning_rate": 1.8566370863869122e-06,
"loss": 0.0058,
"num_tokens": 226596638.0,
"step": 497
},
{
"epoch": 4.04887983706721,
"grad_norm": 0.10313893410726134,
"learning_rate": 1.8427656915901428e-06,
"loss": 0.0063,
"num_tokens": 227070697.0,
"step": 498
},
{
"epoch": 4.057026476578411,
"grad_norm": 0.08409803434100602,
"learning_rate": 1.8289959120704204e-06,
"loss": 0.0045,
"num_tokens": 227563263.0,
"step": 499
},
{
"epoch": 4.065173116089613,
"grad_norm": 0.10477823554325051,
"learning_rate": 1.8153281304173842e-06,
"loss": 0.0059,
"num_tokens": 228039640.0,
"step": 500
},
{
"epoch": 4.0733197556008145,
"grad_norm": 0.09829199382018614,
"learning_rate": 1.801762726386691e-06,
"loss": 0.0056,
"num_tokens": 228524467.0,
"step": 501
},
{
"epoch": 4.081466395112017,
"grad_norm": 0.09331418832378849,
"learning_rate": 1.7883000768894627e-06,
"loss": 0.0047,
"num_tokens": 228994748.0,
"step": 502
},
{
"epoch": 4.089613034623218,
"grad_norm": 0.09581281591436303,
"learning_rate": 1.7749405559818162e-06,
"loss": 0.0053,
"num_tokens": 229450908.0,
"step": 503
},
{
"epoch": 4.097759674134419,
"grad_norm": 0.10440722069541235,
"learning_rate": 1.7616845348544657e-06,
"loss": 0.0065,
"num_tokens": 229910862.0,
"step": 504
},
{
"epoch": 4.1059063136456215,
"grad_norm": 0.09700791936550231,
"learning_rate": 1.7485323818224126e-06,
"loss": 0.0051,
"num_tokens": 230369276.0,
"step": 505
},
{
"epoch": 4.114052953156823,
"grad_norm": 0.10857998726786411,
"learning_rate": 1.7354844623147116e-06,
"loss": 0.0059,
"num_tokens": 230827234.0,
"step": 506
},
{
"epoch": 4.122199592668024,
"grad_norm": 0.10148060818665218,
"learning_rate": 1.722541138864316e-06,
"loss": 0.0056,
"num_tokens": 231311328.0,
"step": 507
},
{
"epoch": 4.130346232179226,
"grad_norm": 0.09790332100802439,
"learning_rate": 1.7097027710980059e-06,
"loss": 0.0054,
"num_tokens": 231774619.0,
"step": 508
},
{
"epoch": 4.138492871690428,
"grad_norm": 0.11303318804836798,
"learning_rate": 1.6969697157263968e-06,
"loss": 0.0063,
"num_tokens": 232234778.0,
"step": 509
},
{
"epoch": 4.146639511201629,
"grad_norm": 0.11928599820659892,
"learning_rate": 1.6843423265340241e-06,
"loss": 0.0066,
"num_tokens": 232685797.0,
"step": 510
},
{
"epoch": 4.154786150712831,
"grad_norm": 0.10102982747758138,
"learning_rate": 1.6718209543695198e-06,
"loss": 0.0053,
"num_tokens": 233131575.0,
"step": 511
},
{
"epoch": 4.1629327902240325,
"grad_norm": 0.10103824755044703,
"learning_rate": 1.6594059471358603e-06,
"loss": 0.005,
"num_tokens": 233592206.0,
"step": 512
},
{
"epoch": 4.171079429735234,
"grad_norm": 0.09473975634726714,
"learning_rate": 1.6470976497807028e-06,
"loss": 0.0049,
"num_tokens": 234073717.0,
"step": 513
},
{
"epoch": 4.179226069246436,
"grad_norm": 0.10317754634136525,
"learning_rate": 1.6348964042867963e-06,
"loss": 0.0053,
"num_tokens": 234525493.0,
"step": 514
},
{
"epoch": 4.187372708757637,
"grad_norm": 0.10883171792223603,
"learning_rate": 1.6228025496624816e-06,
"loss": 0.0062,
"num_tokens": 234975032.0,
"step": 515
},
{
"epoch": 4.195519348268839,
"grad_norm": 0.10753806313999263,
"learning_rate": 1.6108164219322759e-06,
"loss": 0.0053,
"num_tokens": 235438383.0,
"step": 516
},
{
"epoch": 4.203665987780041,
"grad_norm": 0.1064236411620558,
"learning_rate": 1.598938354127532e-06,
"loss": 0.0059,
"num_tokens": 235879893.0,
"step": 517
},
{
"epoch": 4.211812627291242,
"grad_norm": 0.12022958396721184,
"learning_rate": 1.5871686762771876e-06,
"loss": 0.0059,
"num_tokens": 236349201.0,
"step": 518
},
{
"epoch": 4.219959266802444,
"grad_norm": 0.11124601452546444,
"learning_rate": 1.5755077153985927e-06,
"loss": 0.0057,
"num_tokens": 236800777.0,
"step": 519
},
{
"epoch": 4.228105906313646,
"grad_norm": 0.09236406495488149,
"learning_rate": 1.5639557954884263e-06,
"loss": 0.0043,
"num_tokens": 237293264.0,
"step": 520
},
{
"epoch": 4.236252545824847,
"grad_norm": 0.1273600213134377,
"learning_rate": 1.552513237513694e-06,
"loss": 0.0073,
"num_tokens": 237724964.0,
"step": 521
},
{
"epoch": 4.244399185336049,
"grad_norm": 0.11769756147992531,
"learning_rate": 1.541180359402809e-06,
"loss": 0.0059,
"num_tokens": 238180300.0,
"step": 522
},
{
"epoch": 4.2525458248472505,
"grad_norm": 0.1217965830011373,
"learning_rate": 1.5299574760367564e-06,
"loss": 0.0061,
"num_tokens": 238636931.0,
"step": 523
},
{
"epoch": 4.260692464358452,
"grad_norm": 0.11472619272622367,
"learning_rate": 1.5188448992403504e-06,
"loss": 0.0058,
"num_tokens": 239086905.0,
"step": 524
},
{
"epoch": 4.268839103869654,
"grad_norm": 0.11529863540901476,
"learning_rate": 1.5078429377735626e-06,
"loss": 0.0063,
"num_tokens": 239550473.0,
"step": 525
},
{
"epoch": 4.276985743380855,
"grad_norm": 0.1156481007594638,
"learning_rate": 1.4969518973229526e-06,
"loss": 0.0059,
"num_tokens": 239995374.0,
"step": 526
},
{
"epoch": 4.285132382892057,
"grad_norm": 0.10698685435134675,
"learning_rate": 1.4861720804931665e-06,
"loss": 0.0058,
"num_tokens": 240466754.0,
"step": 527
},
{
"epoch": 4.293279022403259,
"grad_norm": 0.11289580051998427,
"learning_rate": 1.4755037867985285e-06,
"loss": 0.006,
"num_tokens": 240906071.0,
"step": 528
},
{
"epoch": 4.30142566191446,
"grad_norm": 0.11536050235837439,
"learning_rate": 1.4649473126547273e-06,
"loss": 0.0054,
"num_tokens": 241355455.0,
"step": 529
},
{
"epoch": 4.3095723014256615,
"grad_norm": 0.11636487088267386,
"learning_rate": 1.4545029513705735e-06,
"loss": 0.0058,
"num_tokens": 241836525.0,
"step": 530
},
{
"epoch": 4.317718940936864,
"grad_norm": 0.09846554835421734,
"learning_rate": 1.4441709931398513e-06,
"loss": 0.0051,
"num_tokens": 242307462.0,
"step": 531
},
{
"epoch": 4.325865580448065,
"grad_norm": 0.1120813571543054,
"learning_rate": 1.4339517250332565e-06,
"loss": 0.0061,
"num_tokens": 242741978.0,
"step": 532
},
{
"epoch": 4.334012219959266,
"grad_norm": 0.1113819187138935,
"learning_rate": 1.4238454309904205e-06,
"loss": 0.0055,
"num_tokens": 243192201.0,
"step": 533
},
{
"epoch": 4.3421588594704685,
"grad_norm": 0.11764577339647353,
"learning_rate": 1.4138523918120201e-06,
"loss": 0.0065,
"num_tokens": 243636087.0,
"step": 534
},
{
"epoch": 4.35030549898167,
"grad_norm": 0.11164487804753273,
"learning_rate": 1.4039728851519764e-06,
"loss": 0.0055,
"num_tokens": 244110581.0,
"step": 535
},
{
"epoch": 4.358452138492872,
"grad_norm": 0.09698712924798691,
"learning_rate": 1.3942071855097381e-06,
"loss": 0.0049,
"num_tokens": 244572435.0,
"step": 536
},
{
"epoch": 4.366598778004073,
"grad_norm": 0.1104930978310767,
"learning_rate": 1.3845555642226583e-06,
"loss": 0.0056,
"num_tokens": 245032371.0,
"step": 537
},
{
"epoch": 4.374745417515275,
"grad_norm": 0.11183186111310507,
"learning_rate": 1.375018289458453e-06,
"loss": 0.0055,
"num_tokens": 245488372.0,
"step": 538
},
{
"epoch": 4.382892057026477,
"grad_norm": 0.11660880918067139,
"learning_rate": 1.3655956262077502e-06,
"loss": 0.0063,
"num_tokens": 245947576.0,
"step": 539
},
{
"epoch": 4.391038696537678,
"grad_norm": 0.11734436147080707,
"learning_rate": 1.3562878362767296e-06,
"loss": 0.006,
"num_tokens": 246410789.0,
"step": 540
},
{
"epoch": 4.3991853360488795,
"grad_norm": 0.11187947506861028,
"learning_rate": 1.3470951782798432e-06,
"loss": 0.0053,
"num_tokens": 246885080.0,
"step": 541
},
{
"epoch": 4.407331975560082,
"grad_norm": 0.10682796561668163,
"learning_rate": 1.338017907632635e-06,
"loss": 0.0054,
"num_tokens": 247344383.0,
"step": 542
},
{
"epoch": 4.415478615071283,
"grad_norm": 0.11487602768278418,
"learning_rate": 1.329056276544642e-06,
"loss": 0.0054,
"num_tokens": 247825702.0,
"step": 543
},
{
"epoch": 4.423625254582484,
"grad_norm": 0.10954303849780199,
"learning_rate": 1.320210534012388e-06,
"loss": 0.0059,
"num_tokens": 248301334.0,
"step": 544
},
{
"epoch": 4.4317718940936865,
"grad_norm": 0.1065560110571518,
"learning_rate": 1.311480925812461e-06,
"loss": 0.0057,
"num_tokens": 248770660.0,
"step": 545
},
{
"epoch": 4.439918533604888,
"grad_norm": 0.12112306787916738,
"learning_rate": 1.3028676944946916e-06,
"loss": 0.0067,
"num_tokens": 249197698.0,
"step": 546
},
{
"epoch": 4.44806517311609,
"grad_norm": 0.10503970639083068,
"learning_rate": 1.2943710793754082e-06,
"loss": 0.0049,
"num_tokens": 249659509.0,
"step": 547
},
{
"epoch": 4.456211812627291,
"grad_norm": 0.11924557580218739,
"learning_rate": 1.2859913165307886e-06,
"loss": 0.0063,
"num_tokens": 250110156.0,
"step": 548
},
{
"epoch": 4.464358452138493,
"grad_norm": 0.11442982117714874,
"learning_rate": 1.277728638790303e-06,
"loss": 0.0063,
"num_tokens": 250550111.0,
"step": 549
},
{
"epoch": 4.472505091649695,
"grad_norm": 0.12110395302590302,
"learning_rate": 1.2695832757302412e-06,
"loss": 0.0065,
"num_tokens": 251002357.0,
"step": 550
},
{
"epoch": 4.480651731160896,
"grad_norm": 0.11664662464057247,
"learning_rate": 1.2615554536673377e-06,
"loss": 0.0062,
"num_tokens": 251458462.0,
"step": 551
},
{
"epoch": 4.4887983706720975,
"grad_norm": 0.11645513165539287,
"learning_rate": 1.253645395652481e-06,
"loss": 0.0061,
"num_tokens": 251902226.0,
"step": 552
},
{
"epoch": 4.4969450101833,
"grad_norm": 0.11363438791067745,
"learning_rate": 1.2458533214645175e-06,
"loss": 0.0056,
"num_tokens": 252346885.0,
"step": 553
},
{
"epoch": 4.505091649694501,
"grad_norm": 0.12965647026273558,
"learning_rate": 1.2381794476041447e-06,
"loss": 0.0064,
"num_tokens": 252804103.0,
"step": 554
},
{
"epoch": 4.513238289205702,
"grad_norm": 0.10419635456766704,
"learning_rate": 1.2306239872878946e-06,
"loss": 0.0059,
"num_tokens": 253273586.0,
"step": 555
},
{
"epoch": 4.521384928716905,
"grad_norm": 0.11350584533770305,
"learning_rate": 1.2231871504422117e-06,
"loss": 0.0059,
"num_tokens": 253725593.0,
"step": 556
},
{
"epoch": 4.529531568228106,
"grad_norm": 0.13468868599441702,
"learning_rate": 1.215869143697619e-06,
"loss": 0.0073,
"num_tokens": 254156458.0,
"step": 557
},
{
"epoch": 4.537678207739307,
"grad_norm": 0.10259852383741634,
"learning_rate": 1.2086701703829755e-06,
"loss": 0.0054,
"num_tokens": 254617846.0,
"step": 558
},
{
"epoch": 4.545824847250509,
"grad_norm": 0.11651453346375099,
"learning_rate": 1.2015904305198286e-06,
"loss": 0.0063,
"num_tokens": 255052922.0,
"step": 559
},
{
"epoch": 4.553971486761711,
"grad_norm": 0.10384453182105129,
"learning_rate": 1.1946301208168593e-06,
"loss": 0.0051,
"num_tokens": 255534554.0,
"step": 560
},
{
"epoch": 4.562118126272912,
"grad_norm": 0.12828945094057975,
"learning_rate": 1.1877894346644085e-06,
"loss": 0.007,
"num_tokens": 255986625.0,
"step": 561
},
{
"epoch": 4.570264765784114,
"grad_norm": 0.10166841643303247,
"learning_rate": 1.1810685621291135e-06,
"loss": 0.0055,
"num_tokens": 256440817.0,
"step": 562
},
{
"epoch": 4.5784114052953155,
"grad_norm": 0.12163643122042941,
"learning_rate": 1.174467689948618e-06,
"loss": 0.007,
"num_tokens": 256883913.0,
"step": 563
},
{
"epoch": 4.586558044806518,
"grad_norm": 0.11612572338384212,
"learning_rate": 1.1679870015263908e-06,
"loss": 0.0061,
"num_tokens": 257340848.0,
"step": 564
},
{
"epoch": 4.594704684317719,
"grad_norm": 0.09659828775248515,
"learning_rate": 1.1616266769266263e-06,
"loss": 0.0052,
"num_tokens": 257795593.0,
"step": 565
},
{
"epoch": 4.60285132382892,
"grad_norm": 0.10140831312358678,
"learning_rate": 1.1553868928692422e-06,
"loss": 0.0048,
"num_tokens": 258288534.0,
"step": 566
},
{
"epoch": 4.610997963340123,
"grad_norm": 0.11217052895153468,
"learning_rate": 1.1492678227249695e-06,
"loss": 0.0059,
"num_tokens": 258741097.0,
"step": 567
},
{
"epoch": 4.619144602851324,
"grad_norm": 0.1126933577651828,
"learning_rate": 1.143269636510536e-06,
"loss": 0.0061,
"num_tokens": 259193501.0,
"step": 568
},
{
"epoch": 4.627291242362525,
"grad_norm": 0.11797745986694334,
"learning_rate": 1.1373925008839403e-06,
"loss": 0.0063,
"num_tokens": 259649197.0,
"step": 569
},
{
"epoch": 4.635437881873727,
"grad_norm": 0.11303980738140469,
"learning_rate": 1.1316365791398251e-06,
"loss": 0.0061,
"num_tokens": 260088831.0,
"step": 570
},
{
"epoch": 4.643584521384929,
"grad_norm": 0.10873603489504344,
"learning_rate": 1.1260020312049356e-06,
"loss": 0.006,
"num_tokens": 260555536.0,
"step": 571
},
{
"epoch": 4.65173116089613,
"grad_norm": 0.0920006832828397,
"learning_rate": 1.1204890136336784e-06,
"loss": 0.0052,
"num_tokens": 261048454.0,
"step": 572
},
{
"epoch": 4.659877800407332,
"grad_norm": 0.1255806723199747,
"learning_rate": 1.1150976796037736e-06,
"loss": 0.0068,
"num_tokens": 261480295.0,
"step": 573
},
{
"epoch": 4.6680244399185336,
"grad_norm": 0.11533169004614044,
"learning_rate": 1.1098281789119948e-06,
"loss": 0.0057,
"num_tokens": 261942589.0,
"step": 574
},
{
"epoch": 4.676171079429735,
"grad_norm": 0.10129996781843084,
"learning_rate": 1.104680657970009e-06,
"loss": 0.0057,
"num_tokens": 262393944.0,
"step": 575
},
{
"epoch": 4.684317718940937,
"grad_norm": 0.11015833267592207,
"learning_rate": 1.0996552598003088e-06,
"loss": 0.0059,
"num_tokens": 262882312.0,
"step": 576
},
{
"epoch": 4.692464358452138,
"grad_norm": 0.10314595226042249,
"learning_rate": 1.094752124032238e-06,
"loss": 0.0055,
"num_tokens": 263336673.0,
"step": 577
},
{
"epoch": 4.70061099796334,
"grad_norm": 0.11664841890610124,
"learning_rate": 1.0899713868981123e-06,
"loss": 0.0064,
"num_tokens": 263792010.0,
"step": 578
},
{
"epoch": 4.708757637474542,
"grad_norm": 0.09972137290365708,
"learning_rate": 1.0853131812294355e-06,
"loss": 0.0051,
"num_tokens": 264237484.0,
"step": 579
},
{
"epoch": 4.716904276985743,
"grad_norm": 0.10268566206680875,
"learning_rate": 1.0807776364532044e-06,
"loss": 0.0056,
"num_tokens": 264713321.0,
"step": 580
},
{
"epoch": 4.725050916496945,
"grad_norm": 0.10619035337589804,
"learning_rate": 1.0763648785883186e-06,
"loss": 0.0058,
"num_tokens": 265183724.0,
"step": 581
},
{
"epoch": 4.733197556008147,
"grad_norm": 0.10541962557747203,
"learning_rate": 1.0720750302420745e-06,
"loss": 0.0057,
"num_tokens": 265627643.0,
"step": 582
},
{
"epoch": 4.741344195519348,
"grad_norm": 0.11506033498658928,
"learning_rate": 1.0679082106067618e-06,
"loss": 0.0067,
"num_tokens": 266084878.0,
"step": 583
},
{
"epoch": 4.74949083503055,
"grad_norm": 0.11142067796057883,
"learning_rate": 1.0638645354563488e-06,
"loss": 0.0056,
"num_tokens": 266578362.0,
"step": 584
},
{
"epoch": 4.757637474541752,
"grad_norm": 0.12323031771225379,
"learning_rate": 1.0599441171432685e-06,
"loss": 0.0071,
"num_tokens": 267005793.0,
"step": 585
},
{
"epoch": 4.765784114052953,
"grad_norm": 0.10911498957082988,
"learning_rate": 1.0561470645952939e-06,
"loss": 0.0059,
"num_tokens": 267445983.0,
"step": 586
},
{
"epoch": 4.773930753564155,
"grad_norm": 0.10589151493278187,
"learning_rate": 1.0524734833125155e-06,
"loss": 0.006,
"num_tokens": 267934787.0,
"step": 587
},
{
"epoch": 4.782077393075356,
"grad_norm": 0.0961251286065213,
"learning_rate": 1.0489234753644075e-06,
"loss": 0.0047,
"num_tokens": 268404039.0,
"step": 588
},
{
"epoch": 4.790224032586558,
"grad_norm": 0.11570808115862555,
"learning_rate": 1.0454971393869895e-06,
"loss": 0.0061,
"num_tokens": 268871776.0,
"step": 589
},
{
"epoch": 4.79837067209776,
"grad_norm": 0.11996049644781787,
"learning_rate": 1.0421945705800913e-06,
"loss": 0.006,
"num_tokens": 269329939.0,
"step": 590
},
{
"epoch": 4.806517311608961,
"grad_norm": 0.11015784556640101,
"learning_rate": 1.0390158607047029e-06,
"loss": 0.0059,
"num_tokens": 269796155.0,
"step": 591
},
{
"epoch": 4.814663951120163,
"grad_norm": 0.10516381427732067,
"learning_rate": 1.0359610980804286e-06,
"loss": 0.0051,
"num_tokens": 270260800.0,
"step": 592
},
{
"epoch": 4.822810590631365,
"grad_norm": 0.11057933848917369,
"learning_rate": 1.0330303675830306e-06,
"loss": 0.0054,
"num_tokens": 270718037.0,
"step": 593
},
{
"epoch": 4.830957230142566,
"grad_norm": 0.12034159438309625,
"learning_rate": 1.0302237506420722e-06,
"loss": 0.0063,
"num_tokens": 271163129.0,
"step": 594
},
{
"epoch": 4.839103869653767,
"grad_norm": 0.1298369000893159,
"learning_rate": 1.0275413252386545e-06,
"loss": 0.0077,
"num_tokens": 271586088.0,
"step": 595
},
{
"epoch": 4.84725050916497,
"grad_norm": 0.11485648605447368,
"learning_rate": 1.0249831659032494e-06,
"loss": 0.0067,
"num_tokens": 272031287.0,
"step": 596
},
{
"epoch": 4.855397148676171,
"grad_norm": 0.11585325382556429,
"learning_rate": 1.0225493437136302e-06,
"loss": 0.0067,
"num_tokens": 272474742.0,
"step": 597
},
{
"epoch": 4.863543788187373,
"grad_norm": 0.1239008691750004,
"learning_rate": 1.020239926292895e-06,
"loss": 0.0067,
"num_tokens": 272932607.0,
"step": 598
},
{
"epoch": 4.871690427698574,
"grad_norm": 0.10254555243859467,
"learning_rate": 1.018054977807589e-06,
"loss": 0.0057,
"num_tokens": 273415530.0,
"step": 599
},
{
"epoch": 4.879837067209776,
"grad_norm": 0.10546701888018833,
"learning_rate": 1.0159945589659223e-06,
"loss": 0.0056,
"num_tokens": 273885366.0,
"step": 600
},
{
"epoch": 4.887983706720978,
"grad_norm": 0.12031804835663963,
"learning_rate": 1.0140587270160806e-06,
"loss": 0.0066,
"num_tokens": 274335421.0,
"step": 601
},
{
"epoch": 4.896130346232179,
"grad_norm": 0.11002140545903802,
"learning_rate": 1.0122475357446372e-06,
"loss": 0.0061,
"num_tokens": 274789915.0,
"step": 602
},
{
"epoch": 4.904276985743381,
"grad_norm": 0.10524124599370216,
"learning_rate": 1.0105610354750566e-06,
"loss": 0.0055,
"num_tokens": 275269107.0,
"step": 603
},
{
"epoch": 4.912423625254583,
"grad_norm": 0.1279839524575316,
"learning_rate": 1.0089992730662983e-06,
"loss": 0.007,
"num_tokens": 275714557.0,
"step": 604
},
{
"epoch": 4.920570264765784,
"grad_norm": 0.11521764229191792,
"learning_rate": 1.0075622919115133e-06,
"loss": 0.0059,
"num_tokens": 276134943.0,
"step": 605
},
{
"epoch": 4.928716904276985,
"grad_norm": 0.10797809839128278,
"learning_rate": 1.0062501319368376e-06,
"loss": 0.005,
"num_tokens": 276628333.0,
"step": 606
},
{
"epoch": 4.936863543788188,
"grad_norm": 0.103832639157195,
"learning_rate": 1.0050628296002864e-06,
"loss": 0.0055,
"num_tokens": 277092549.0,
"step": 607
},
{
"epoch": 4.945010183299389,
"grad_norm": 0.10579829877653406,
"learning_rate": 1.0040004178907364e-06,
"loss": 0.0059,
"num_tokens": 277564414.0,
"step": 608
},
{
"epoch": 4.953156822810591,
"grad_norm": 0.1129191145686251,
"learning_rate": 1.0030629263270133e-06,
"loss": 0.0057,
"num_tokens": 278043267.0,
"step": 609
},
{
"epoch": 4.961303462321792,
"grad_norm": 0.11428351556872687,
"learning_rate": 1.0022503809570692e-06,
"loss": 0.0058,
"num_tokens": 278500208.0,
"step": 610
},
{
"epoch": 4.969450101832994,
"grad_norm": 0.12454757064462266,
"learning_rate": 1.0015628043572607e-06,
"loss": 0.0067,
"num_tokens": 278958350.0,
"step": 611
},
{
"epoch": 4.977596741344195,
"grad_norm": 0.11985284209865818,
"learning_rate": 1.0010002156317187e-06,
"loss": 0.0055,
"num_tokens": 279426149.0,
"step": 612
},
{
"epoch": 4.985743380855397,
"grad_norm": 0.11653540817309618,
"learning_rate": 1.0005626304118208e-06,
"loss": 0.0062,
"num_tokens": 279874989.0,
"step": 613
},
{
"epoch": 4.993890020366599,
"grad_norm": 0.12398585004184347,
"learning_rate": 1.0002500608557558e-06,
"loss": 0.0064,
"num_tokens": 280320581.0,
"step": 614
},
{
"epoch": 5.0,
"grad_norm": 0.12467063210289439,
"learning_rate": 1.0000625156481842e-06,
"loss": 0.0065,
"num_tokens": 280366492.0,
"step": 615
},
{
"epoch": 5.0,
"eval_loss": 0.04499583691358566,
"eval_num_tokens": 280366492.0,
"eval_runtime": 57.8338,
"eval_samples_per_second": 42.173,
"eval_steps_per_second": 5.274,
"step": 615
},
{
"epoch": 5.0,
"step": 615,
"total_flos": 9.471448716243108e+17,
"train_loss": 0.017112477973285245,
"train_runtime": 9496.6509,
"train_samples_per_second": 8.264,
"train_steps_per_second": 0.065
}
],
"logging_steps": 1,
"max_steps": 615,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.471448716243108e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}