hungnm's picture
Model save
ee81b15 verified
{
"best_metric": 0.8012369099843738,
"best_model_checkpoint": "/data/hungnm/unisentiment/modernBERT-base-sentiment/checkpoint-4611",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 7685,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032530904359141183,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.4862,
"step": 5
},
{
"epoch": 0.006506180871828237,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.4619,
"step": 10
},
{
"epoch": 0.009759271307742356,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.474,
"step": 15
},
{
"epoch": 0.013012361743656473,
"grad_norm": 5.975010395050049,
"learning_rate": 2.5974025974025976e-06,
"loss": 2.4748,
"step": 20
},
{
"epoch": 0.01626545217957059,
"grad_norm": 4.729438781738281,
"learning_rate": 5.194805194805195e-06,
"loss": 2.4383,
"step": 25
},
{
"epoch": 0.01951854261548471,
"grad_norm": 4.140679359436035,
"learning_rate": 8.441558441558442e-06,
"loss": 2.2384,
"step": 30
},
{
"epoch": 0.02277163305139883,
"grad_norm": 2.7495357990264893,
"learning_rate": 1.1688311688311688e-05,
"loss": 2.16,
"step": 35
},
{
"epoch": 0.026024723487312947,
"grad_norm": 1.4239497184753418,
"learning_rate": 1.4935064935064936e-05,
"loss": 2.0898,
"step": 40
},
{
"epoch": 0.029277813923227064,
"grad_norm": 1.3778964281082153,
"learning_rate": 1.8181818181818182e-05,
"loss": 2.037,
"step": 45
},
{
"epoch": 0.03253090435914118,
"grad_norm": 1.6160250902175903,
"learning_rate": 2.1428571428571428e-05,
"loss": 2.0056,
"step": 50
},
{
"epoch": 0.035783994795055306,
"grad_norm": 1.090104579925537,
"learning_rate": 2.4675324675324678e-05,
"loss": 1.9513,
"step": 55
},
{
"epoch": 0.03903708523096942,
"grad_norm": 2.1062819957733154,
"learning_rate": 2.792207792207792e-05,
"loss": 1.9023,
"step": 60
},
{
"epoch": 0.04229017566688354,
"grad_norm": 3.310304880142212,
"learning_rate": 3.1168831168831166e-05,
"loss": 1.877,
"step": 65
},
{
"epoch": 0.04554326610279766,
"grad_norm": 5.446138858795166,
"learning_rate": 3.4415584415584416e-05,
"loss": 1.822,
"step": 70
},
{
"epoch": 0.048796356538711776,
"grad_norm": 1.910844087600708,
"learning_rate": 3.7662337662337665e-05,
"loss": 1.7707,
"step": 75
},
{
"epoch": 0.05204944697462589,
"grad_norm": 5.207052707672119,
"learning_rate": 4.0909090909090915e-05,
"loss": 1.7986,
"step": 80
},
{
"epoch": 0.05530253741054001,
"grad_norm": 4.687050819396973,
"learning_rate": 4.415584415584416e-05,
"loss": 1.7189,
"step": 85
},
{
"epoch": 0.05855562784645413,
"grad_norm": 4.655097961425781,
"learning_rate": 4.740259740259741e-05,
"loss": 1.7185,
"step": 90
},
{
"epoch": 0.06180871828236825,
"grad_norm": 5.834875106811523,
"learning_rate": 4.999999786858144e-05,
"loss": 1.6804,
"step": 95
},
{
"epoch": 0.06506180871828236,
"grad_norm": 2.986246109008789,
"learning_rate": 4.99999232689698e-05,
"loss": 1.6772,
"step": 100
},
{
"epoch": 0.06831489915419649,
"grad_norm": 1.4194883108139038,
"learning_rate": 4.999974209879331e-05,
"loss": 1.602,
"step": 105
},
{
"epoch": 0.07156798959011061,
"grad_norm": 3.983574628829956,
"learning_rate": 4.999945435882428e-05,
"loss": 1.5656,
"step": 110
},
{
"epoch": 0.07482108002602472,
"grad_norm": 1.342112421989441,
"learning_rate": 4.9999060050289286e-05,
"loss": 1.511,
"step": 115
},
{
"epoch": 0.07807417046193885,
"grad_norm": 2.197117805480957,
"learning_rate": 4.999855917486921e-05,
"loss": 1.4768,
"step": 120
},
{
"epoch": 0.08132726089785296,
"grad_norm": 1.8786858320236206,
"learning_rate": 4.999795173469919e-05,
"loss": 1.473,
"step": 125
},
{
"epoch": 0.08458035133376708,
"grad_norm": 2.5618531703948975,
"learning_rate": 4.9997237732368645e-05,
"loss": 1.4527,
"step": 130
},
{
"epoch": 0.08783344176968119,
"grad_norm": 1.8612209558486938,
"learning_rate": 4.999641717092126e-05,
"loss": 1.4092,
"step": 135
},
{
"epoch": 0.09108653220559532,
"grad_norm": 1.912489891052246,
"learning_rate": 4.999549005385494e-05,
"loss": 1.3939,
"step": 140
},
{
"epoch": 0.09433962264150944,
"grad_norm": 2.8550467491149902,
"learning_rate": 4.999445638512185e-05,
"loss": 1.3562,
"step": 145
},
{
"epoch": 0.09759271307742355,
"grad_norm": 1.902714729309082,
"learning_rate": 4.9993316169128334e-05,
"loss": 1.3427,
"step": 150
},
{
"epoch": 0.10084580351333768,
"grad_norm": 3.12044620513916,
"learning_rate": 4.999206941073496e-05,
"loss": 1.3634,
"step": 155
},
{
"epoch": 0.10409889394925179,
"grad_norm": 2.6095197200775146,
"learning_rate": 4.999071611525643e-05,
"loss": 1.3605,
"step": 160
},
{
"epoch": 0.10735198438516591,
"grad_norm": 2.5530121326446533,
"learning_rate": 4.998925628846164e-05,
"loss": 1.3444,
"step": 165
},
{
"epoch": 0.11060507482108002,
"grad_norm": 1.9909695386886597,
"learning_rate": 4.99876899365736e-05,
"loss": 1.3192,
"step": 170
},
{
"epoch": 0.11385816525699415,
"grad_norm": 1.21974515914917,
"learning_rate": 4.998601706626938e-05,
"loss": 1.3085,
"step": 175
},
{
"epoch": 0.11711125569290826,
"grad_norm": 1.2985081672668457,
"learning_rate": 4.9984237684680194e-05,
"loss": 1.2848,
"step": 180
},
{
"epoch": 0.12036434612882238,
"grad_norm": 2.141941785812378,
"learning_rate": 4.998235179939122e-05,
"loss": 1.2729,
"step": 185
},
{
"epoch": 0.1236174365647365,
"grad_norm": 1.9323813915252686,
"learning_rate": 4.998035941844167e-05,
"loss": 1.275,
"step": 190
},
{
"epoch": 0.12687052700065063,
"grad_norm": 2.6978371143341064,
"learning_rate": 4.997826055032476e-05,
"loss": 1.2825,
"step": 195
},
{
"epoch": 0.13012361743656473,
"grad_norm": 2.018090009689331,
"learning_rate": 4.997605520398762e-05,
"loss": 1.2656,
"step": 200
},
{
"epoch": 0.13337670787247885,
"grad_norm": 1.0469837188720703,
"learning_rate": 4.997374338883127e-05,
"loss": 1.2584,
"step": 205
},
{
"epoch": 0.13662979830839297,
"grad_norm": 1.2959955930709839,
"learning_rate": 4.99713251147106e-05,
"loss": 1.2494,
"step": 210
},
{
"epoch": 0.1398828887443071,
"grad_norm": 2.215878486633301,
"learning_rate": 4.996880039193431e-05,
"loss": 1.2482,
"step": 215
},
{
"epoch": 0.14313597918022122,
"grad_norm": 1.711484432220459,
"learning_rate": 4.996616923126488e-05,
"loss": 1.2258,
"step": 220
},
{
"epoch": 0.14638906961613532,
"grad_norm": 1.5809857845306396,
"learning_rate": 4.996343164391853e-05,
"loss": 1.223,
"step": 225
},
{
"epoch": 0.14964216005204944,
"grad_norm": 1.6745812892913818,
"learning_rate": 4.9960587641565125e-05,
"loss": 1.2151,
"step": 230
},
{
"epoch": 0.15289525048796357,
"grad_norm": 1.5372675657272339,
"learning_rate": 4.9957637236328195e-05,
"loss": 1.1983,
"step": 235
},
{
"epoch": 0.1561483409238777,
"grad_norm": 1.5290815830230713,
"learning_rate": 4.995458044078482e-05,
"loss": 1.24,
"step": 240
},
{
"epoch": 0.1594014313597918,
"grad_norm": 1.4023972749710083,
"learning_rate": 4.9951417267965626e-05,
"loss": 1.1897,
"step": 245
},
{
"epoch": 0.16265452179570591,
"grad_norm": 1.8283660411834717,
"learning_rate": 4.99481477313547e-05,
"loss": 1.2029,
"step": 250
},
{
"epoch": 0.16590761223162004,
"grad_norm": 1.8741523027420044,
"learning_rate": 4.9944771844889524e-05,
"loss": 1.19,
"step": 255
},
{
"epoch": 0.16916070266753416,
"grad_norm": 1.552556037902832,
"learning_rate": 4.994128962296097e-05,
"loss": 1.1946,
"step": 260
},
{
"epoch": 0.1724137931034483,
"grad_norm": 2.1094107627868652,
"learning_rate": 4.9937701080413165e-05,
"loss": 1.1756,
"step": 265
},
{
"epoch": 0.17566688353936238,
"grad_norm": 1.7123149633407593,
"learning_rate": 4.993400623254347e-05,
"loss": 1.1789,
"step": 270
},
{
"epoch": 0.1789199739752765,
"grad_norm": 1.2891788482666016,
"learning_rate": 4.993020509510243e-05,
"loss": 1.1833,
"step": 275
},
{
"epoch": 0.18217306441119063,
"grad_norm": 1.2659103870391846,
"learning_rate": 4.992629768429367e-05,
"loss": 1.1697,
"step": 280
},
{
"epoch": 0.18542615484710476,
"grad_norm": 1.602931022644043,
"learning_rate": 4.992228401677382e-05,
"loss": 1.16,
"step": 285
},
{
"epoch": 0.18867924528301888,
"grad_norm": 1.1984357833862305,
"learning_rate": 4.99181641096525e-05,
"loss": 1.1415,
"step": 290
},
{
"epoch": 0.19193233571893298,
"grad_norm": 2.036529302597046,
"learning_rate": 4.991393798049219e-05,
"loss": 1.168,
"step": 295
},
{
"epoch": 0.1951854261548471,
"grad_norm": 1.9513144493103027,
"learning_rate": 4.990960564730819e-05,
"loss": 1.1623,
"step": 300
},
{
"epoch": 0.19843851659076123,
"grad_norm": 1.2966268062591553,
"learning_rate": 4.9905167128568516e-05,
"loss": 1.143,
"step": 305
},
{
"epoch": 0.20169160702667535,
"grad_norm": 1.3897426128387451,
"learning_rate": 4.990062244319387e-05,
"loss": 1.1431,
"step": 310
},
{
"epoch": 0.20494469746258945,
"grad_norm": 1.7485623359680176,
"learning_rate": 4.989597161055746e-05,
"loss": 1.1507,
"step": 315
},
{
"epoch": 0.20819778789850357,
"grad_norm": 1.1369644403457642,
"learning_rate": 4.989121465048505e-05,
"loss": 1.1447,
"step": 320
},
{
"epoch": 0.2114508783344177,
"grad_norm": 1.292037844657898,
"learning_rate": 4.988635158325476e-05,
"loss": 1.1289,
"step": 325
},
{
"epoch": 0.21470396877033182,
"grad_norm": 1.1460140943527222,
"learning_rate": 4.988138242959707e-05,
"loss": 1.1314,
"step": 330
},
{
"epoch": 0.21795705920624595,
"grad_norm": 1.9661816358566284,
"learning_rate": 4.987630721069465e-05,
"loss": 1.147,
"step": 335
},
{
"epoch": 0.22121014964216004,
"grad_norm": 1.3988662958145142,
"learning_rate": 4.987112594818232e-05,
"loss": 1.1443,
"step": 340
},
{
"epoch": 0.22446324007807417,
"grad_norm": 1.6520105600357056,
"learning_rate": 4.986583866414696e-05,
"loss": 1.1089,
"step": 345
},
{
"epoch": 0.2277163305139883,
"grad_norm": 1.6153268814086914,
"learning_rate": 4.9860445381127385e-05,
"loss": 1.1279,
"step": 350
},
{
"epoch": 0.23096942094990242,
"grad_norm": 1.0572576522827148,
"learning_rate": 4.985494612211429e-05,
"loss": 1.1073,
"step": 355
},
{
"epoch": 0.2342225113858165,
"grad_norm": 1.1980561017990112,
"learning_rate": 4.984934091055009e-05,
"loss": 1.1161,
"step": 360
},
{
"epoch": 0.23747560182173064,
"grad_norm": 3.1612489223480225,
"learning_rate": 4.98436297703289e-05,
"loss": 1.1473,
"step": 365
},
{
"epoch": 0.24072869225764476,
"grad_norm": 1.7351305484771729,
"learning_rate": 4.983781272579636e-05,
"loss": 1.1282,
"step": 370
},
{
"epoch": 0.24398178269355889,
"grad_norm": 1.4272353649139404,
"learning_rate": 4.983188980174958e-05,
"loss": 1.1486,
"step": 375
},
{
"epoch": 0.247234873129473,
"grad_norm": 1.6868839263916016,
"learning_rate": 4.9825861023437016e-05,
"loss": 1.1224,
"step": 380
},
{
"epoch": 0.2504879635653871,
"grad_norm": 1.1032485961914062,
"learning_rate": 4.981972641655835e-05,
"loss": 1.1186,
"step": 385
},
{
"epoch": 0.25374105400130126,
"grad_norm": 1.0825129747390747,
"learning_rate": 4.981348600726441e-05,
"loss": 1.093,
"step": 390
},
{
"epoch": 0.25699414443721535,
"grad_norm": 1.0156402587890625,
"learning_rate": 4.980713982215703e-05,
"loss": 1.0873,
"step": 395
},
{
"epoch": 0.26024723487312945,
"grad_norm": 2.106105089187622,
"learning_rate": 4.9800687888288964e-05,
"loss": 1.0924,
"step": 400
},
{
"epoch": 0.2635003253090436,
"grad_norm": 1.6301723718643188,
"learning_rate": 4.9794130233163735e-05,
"loss": 1.1063,
"step": 405
},
{
"epoch": 0.2667534157449577,
"grad_norm": 1.30489981174469,
"learning_rate": 4.978746688473556e-05,
"loss": 1.0993,
"step": 410
},
{
"epoch": 0.27000650618087185,
"grad_norm": 1.1064469814300537,
"learning_rate": 4.978069787140919e-05,
"loss": 1.093,
"step": 415
},
{
"epoch": 0.27325959661678595,
"grad_norm": 1.1742445230484009,
"learning_rate": 4.977382322203982e-05,
"loss": 1.0848,
"step": 420
},
{
"epoch": 0.27651268705270005,
"grad_norm": 1.0716508626937866,
"learning_rate": 4.976684296593295e-05,
"loss": 1.1157,
"step": 425
},
{
"epoch": 0.2797657774886142,
"grad_norm": 1.4256720542907715,
"learning_rate": 4.9759757132844256e-05,
"loss": 1.0835,
"step": 430
},
{
"epoch": 0.2830188679245283,
"grad_norm": 1.2922230958938599,
"learning_rate": 4.975256575297949e-05,
"loss": 1.0804,
"step": 435
},
{
"epoch": 0.28627195836044245,
"grad_norm": 1.5222572088241577,
"learning_rate": 4.974526885699432e-05,
"loss": 1.077,
"step": 440
},
{
"epoch": 0.28952504879635654,
"grad_norm": 1.023868441581726,
"learning_rate": 4.973786647599422e-05,
"loss": 1.0782,
"step": 445
},
{
"epoch": 0.29277813923227064,
"grad_norm": 1.7092077732086182,
"learning_rate": 4.9730358641534324e-05,
"loss": 1.1011,
"step": 450
},
{
"epoch": 0.2960312296681848,
"grad_norm": 1.0816203355789185,
"learning_rate": 4.9722745385619285e-05,
"loss": 1.0857,
"step": 455
},
{
"epoch": 0.2992843201040989,
"grad_norm": 0.9598567485809326,
"learning_rate": 4.971502674070317e-05,
"loss": 1.0874,
"step": 460
},
{
"epoch": 0.302537410540013,
"grad_norm": 1.1397418975830078,
"learning_rate": 4.970720273968929e-05,
"loss": 1.0743,
"step": 465
},
{
"epoch": 0.30579050097592714,
"grad_norm": 1.6813876628875732,
"learning_rate": 4.969927341593008e-05,
"loss": 1.0587,
"step": 470
},
{
"epoch": 0.30904359141184123,
"grad_norm": 1.4590063095092773,
"learning_rate": 4.9691238803226944e-05,
"loss": 1.0706,
"step": 475
},
{
"epoch": 0.3122966818477554,
"grad_norm": 0.988750696182251,
"learning_rate": 4.9683098935830115e-05,
"loss": 1.0569,
"step": 480
},
{
"epoch": 0.3155497722836695,
"grad_norm": 1.0971347093582153,
"learning_rate": 4.9674853848438506e-05,
"loss": 1.0441,
"step": 485
},
{
"epoch": 0.3188028627195836,
"grad_norm": 1.0693708658218384,
"learning_rate": 4.9666503576199574e-05,
"loss": 1.0644,
"step": 490
},
{
"epoch": 0.32205595315549773,
"grad_norm": 1.2514370679855347,
"learning_rate": 4.965804815470916e-05,
"loss": 1.0609,
"step": 495
},
{
"epoch": 0.32530904359141183,
"grad_norm": 1.5080784559249878,
"learning_rate": 4.964948762001133e-05,
"loss": 1.0682,
"step": 500
},
{
"epoch": 0.328562134027326,
"grad_norm": 1.1908406019210815,
"learning_rate": 4.964082200859824e-05,
"loss": 1.0418,
"step": 505
},
{
"epoch": 0.3318152244632401,
"grad_norm": 1.6586133241653442,
"learning_rate": 4.963205135740997e-05,
"loss": 1.0668,
"step": 510
},
{
"epoch": 0.3350683148991542,
"grad_norm": 0.7452509999275208,
"learning_rate": 4.962317570383436e-05,
"loss": 1.0508,
"step": 515
},
{
"epoch": 0.3383214053350683,
"grad_norm": 1.3133275508880615,
"learning_rate": 4.961419508570686e-05,
"loss": 1.0543,
"step": 520
},
{
"epoch": 0.3415744957709824,
"grad_norm": 1.1373653411865234,
"learning_rate": 4.960510954131038e-05,
"loss": 1.0711,
"step": 525
},
{
"epoch": 0.3448275862068966,
"grad_norm": 1.12503981590271,
"learning_rate": 4.95959191093751e-05,
"loss": 1.0486,
"step": 530
},
{
"epoch": 0.34808067664281067,
"grad_norm": 0.921503484249115,
"learning_rate": 4.95866238290783e-05,
"loss": 1.0543,
"step": 535
},
{
"epoch": 0.35133376707872477,
"grad_norm": 0.9198605418205261,
"learning_rate": 4.957722374004427e-05,
"loss": 1.0438,
"step": 540
},
{
"epoch": 0.3545868575146389,
"grad_norm": 1.630878210067749,
"learning_rate": 4.9567718882344015e-05,
"loss": 1.0544,
"step": 545
},
{
"epoch": 0.357839947950553,
"grad_norm": 2.2188167572021484,
"learning_rate": 4.95581092964952e-05,
"loss": 1.0541,
"step": 550
},
{
"epoch": 0.36109303838646717,
"grad_norm": 0.9371961355209351,
"learning_rate": 4.95483950234619e-05,
"loss": 1.0723,
"step": 555
},
{
"epoch": 0.36434612882238127,
"grad_norm": 1.0933233499526978,
"learning_rate": 4.9538576104654466e-05,
"loss": 1.052,
"step": 560
},
{
"epoch": 0.36759921925829536,
"grad_norm": 1.1232990026474,
"learning_rate": 4.9528652581929335e-05,
"loss": 1.0354,
"step": 565
},
{
"epoch": 0.3708523096942095,
"grad_norm": 1.000786542892456,
"learning_rate": 4.951862449758885e-05,
"loss": 1.0407,
"step": 570
},
{
"epoch": 0.3741054001301236,
"grad_norm": 0.939582884311676,
"learning_rate": 4.9508491894381104e-05,
"loss": 1.0206,
"step": 575
},
{
"epoch": 0.37735849056603776,
"grad_norm": 1.264381766319275,
"learning_rate": 4.9498254815499694e-05,
"loss": 1.0362,
"step": 580
},
{
"epoch": 0.38061158100195186,
"grad_norm": 0.673314094543457,
"learning_rate": 4.948791330458363e-05,
"loss": 1.0381,
"step": 585
},
{
"epoch": 0.38386467143786596,
"grad_norm": 1.441362738609314,
"learning_rate": 4.947746740571706e-05,
"loss": 1.0354,
"step": 590
},
{
"epoch": 0.3871177618737801,
"grad_norm": 1.1851030588150024,
"learning_rate": 4.9466917163429124e-05,
"loss": 1.0146,
"step": 595
},
{
"epoch": 0.3903708523096942,
"grad_norm": 0.9171844124794006,
"learning_rate": 4.94562626226938e-05,
"loss": 1.0103,
"step": 600
},
{
"epoch": 0.3936239427456083,
"grad_norm": 1.5662965774536133,
"learning_rate": 4.944550382892962e-05,
"loss": 1.0466,
"step": 605
},
{
"epoch": 0.39687703318152245,
"grad_norm": 1.1077489852905273,
"learning_rate": 4.943464082799955e-05,
"loss": 1.0458,
"step": 610
},
{
"epoch": 0.40013012361743655,
"grad_norm": 1.5997633934020996,
"learning_rate": 4.942367366621081e-05,
"loss": 1.0464,
"step": 615
},
{
"epoch": 0.4033832140533507,
"grad_norm": 1.0540611743927002,
"learning_rate": 4.9412602390314585e-05,
"loss": 1.0242,
"step": 620
},
{
"epoch": 0.4066363044892648,
"grad_norm": 1.1247586011886597,
"learning_rate": 4.94014270475059e-05,
"loss": 1.0232,
"step": 625
},
{
"epoch": 0.4098893949251789,
"grad_norm": 1.065820336341858,
"learning_rate": 4.939014768542342e-05,
"loss": 1.0137,
"step": 630
},
{
"epoch": 0.41314248536109305,
"grad_norm": 0.8374763131141663,
"learning_rate": 4.93787643521492e-05,
"loss": 1.0203,
"step": 635
},
{
"epoch": 0.41639557579700714,
"grad_norm": 0.7515140771865845,
"learning_rate": 4.936727709620853e-05,
"loss": 1.0176,
"step": 640
},
{
"epoch": 0.4196486662329213,
"grad_norm": 0.8034088015556335,
"learning_rate": 4.9355685966569684e-05,
"loss": 1.0322,
"step": 645
},
{
"epoch": 0.4229017566688354,
"grad_norm": 1.2314985990524292,
"learning_rate": 4.934399101264375e-05,
"loss": 1.0198,
"step": 650
},
{
"epoch": 0.4261548471047495,
"grad_norm": 1.342058539390564,
"learning_rate": 4.93321922842844e-05,
"loss": 1.0133,
"step": 655
},
{
"epoch": 0.42940793754066364,
"grad_norm": 0.8881794214248657,
"learning_rate": 4.932028983178766e-05,
"loss": 1.0255,
"step": 660
},
{
"epoch": 0.43266102797657774,
"grad_norm": 1.3695508241653442,
"learning_rate": 4.9308283705891736e-05,
"loss": 1.0293,
"step": 665
},
{
"epoch": 0.4359141184124919,
"grad_norm": 0.9350308179855347,
"learning_rate": 4.9296173957776776e-05,
"loss": 1.03,
"step": 670
},
{
"epoch": 0.439167208848406,
"grad_norm": 0.9181856513023376,
"learning_rate": 4.928396063906463e-05,
"loss": 1.0234,
"step": 675
},
{
"epoch": 0.4424202992843201,
"grad_norm": 1.352927803993225,
"learning_rate": 4.927164380181869e-05,
"loss": 1.0474,
"step": 680
},
{
"epoch": 0.44567338972023424,
"grad_norm": 1.176147222518921,
"learning_rate": 4.9259223498543597e-05,
"loss": 1.0329,
"step": 685
},
{
"epoch": 0.44892648015614833,
"grad_norm": 1.0797678232192993,
"learning_rate": 4.9246699782185055e-05,
"loss": 1.0141,
"step": 690
},
{
"epoch": 0.4521795705920625,
"grad_norm": 0.9696300029754639,
"learning_rate": 4.9234072706129627e-05,
"loss": 0.999,
"step": 695
},
{
"epoch": 0.4554326610279766,
"grad_norm": 0.9436845779418945,
"learning_rate": 4.922134232420445e-05,
"loss": 1.0003,
"step": 700
},
{
"epoch": 0.4586857514638907,
"grad_norm": 1.1857705116271973,
"learning_rate": 4.920850869067706e-05,
"loss": 0.9831,
"step": 705
},
{
"epoch": 0.46193884189980483,
"grad_norm": 0.9158900380134583,
"learning_rate": 4.919557186025512e-05,
"loss": 1.0201,
"step": 710
},
{
"epoch": 0.4651919323357189,
"grad_norm": 0.8820152282714844,
"learning_rate": 4.9182531888086205e-05,
"loss": 0.9852,
"step": 715
},
{
"epoch": 0.468445022771633,
"grad_norm": 1.5595647096633911,
"learning_rate": 4.916938882975759e-05,
"loss": 1.0002,
"step": 720
},
{
"epoch": 0.4716981132075472,
"grad_norm": 1.1958764791488647,
"learning_rate": 4.915614274129597e-05,
"loss": 1.0375,
"step": 725
},
{
"epoch": 0.4749512036434613,
"grad_norm": 1.1134103536605835,
"learning_rate": 4.914279367916724e-05,
"loss": 1.0208,
"step": 730
},
{
"epoch": 0.4782042940793754,
"grad_norm": 0.8463726043701172,
"learning_rate": 4.9129341700276266e-05,
"loss": 0.9955,
"step": 735
},
{
"epoch": 0.4814573845152895,
"grad_norm": 0.8405961394309998,
"learning_rate": 4.911578686196661e-05,
"loss": 0.9754,
"step": 740
},
{
"epoch": 0.4847104749512036,
"grad_norm": 1.0310126543045044,
"learning_rate": 4.9102129222020324e-05,
"loss": 1.0213,
"step": 745
},
{
"epoch": 0.48796356538711777,
"grad_norm": 1.058269739151001,
"learning_rate": 4.908836883865768e-05,
"loss": 0.9966,
"step": 750
},
{
"epoch": 0.49121665582303187,
"grad_norm": 0.9762022495269775,
"learning_rate": 4.907450577053694e-05,
"loss": 1.0059,
"step": 755
},
{
"epoch": 0.494469746258946,
"grad_norm": 0.8593292832374573,
"learning_rate": 4.906054007675408e-05,
"loss": 0.9922,
"step": 760
},
{
"epoch": 0.4977228366948601,
"grad_norm": 1.3241448402404785,
"learning_rate": 4.9046471816842565e-05,
"loss": 1.007,
"step": 765
},
{
"epoch": 0.5009759271307742,
"grad_norm": 0.9241655468940735,
"learning_rate": 4.903230105077306e-05,
"loss": 1.0204,
"step": 770
},
{
"epoch": 0.5042290175666884,
"grad_norm": 0.8068680763244629,
"learning_rate": 4.9018027838953226e-05,
"loss": 0.9932,
"step": 775
},
{
"epoch": 0.5074821080026025,
"grad_norm": 1.2541546821594238,
"learning_rate": 4.900365224222742e-05,
"loss": 0.9945,
"step": 780
},
{
"epoch": 0.5107351984385166,
"grad_norm": 0.925835907459259,
"learning_rate": 4.898917432187644e-05,
"loss": 0.9745,
"step": 785
},
{
"epoch": 0.5139882888744307,
"grad_norm": 0.7561518549919128,
"learning_rate": 4.897459413961729e-05,
"loss": 1.0065,
"step": 790
},
{
"epoch": 0.5172413793103449,
"grad_norm": 1.056420922279358,
"learning_rate": 4.8959911757602885e-05,
"loss": 0.974,
"step": 795
},
{
"epoch": 0.5204944697462589,
"grad_norm": 1.219141960144043,
"learning_rate": 4.89451272384218e-05,
"loss": 0.9926,
"step": 800
},
{
"epoch": 0.523747560182173,
"grad_norm": 0.9372319579124451,
"learning_rate": 4.8930240645098027e-05,
"loss": 1.0141,
"step": 805
},
{
"epoch": 0.5270006506180872,
"grad_norm": 1.0118193626403809,
"learning_rate": 4.891525204109065e-05,
"loss": 0.9996,
"step": 810
},
{
"epoch": 0.5302537410540012,
"grad_norm": 0.91470867395401,
"learning_rate": 4.890016149029365e-05,
"loss": 0.9851,
"step": 815
},
{
"epoch": 0.5335068314899154,
"grad_norm": 0.787122368812561,
"learning_rate": 4.888496905703554e-05,
"loss": 0.9969,
"step": 820
},
{
"epoch": 0.5367599219258296,
"grad_norm": 0.8628039956092834,
"learning_rate": 4.886967480607918e-05,
"loss": 1.0024,
"step": 825
},
{
"epoch": 0.5400130123617437,
"grad_norm": 1.450460433959961,
"learning_rate": 4.885427880262144e-05,
"loss": 0.9743,
"step": 830
},
{
"epoch": 0.5432661027976577,
"grad_norm": 1.0362318754196167,
"learning_rate": 4.883878111229296e-05,
"loss": 0.9723,
"step": 835
},
{
"epoch": 0.5465191932335719,
"grad_norm": 0.9855751991271973,
"learning_rate": 4.8823181801157844e-05,
"loss": 0.9898,
"step": 840
},
{
"epoch": 0.549772283669486,
"grad_norm": 1.0782288312911987,
"learning_rate": 4.880748093571339e-05,
"loss": 0.9727,
"step": 845
},
{
"epoch": 0.5530253741054001,
"grad_norm": 1.5194872617721558,
"learning_rate": 4.879167858288982e-05,
"loss": 0.9922,
"step": 850
},
{
"epoch": 0.5562784645413142,
"grad_norm": 1.5501078367233276,
"learning_rate": 4.877577481004995e-05,
"loss": 0.9705,
"step": 855
},
{
"epoch": 0.5595315549772284,
"grad_norm": 1.5971125364303589,
"learning_rate": 4.875976968498895e-05,
"loss": 1.0078,
"step": 860
},
{
"epoch": 0.5627846454131424,
"grad_norm": 0.9124265313148499,
"learning_rate": 4.874366327593406e-05,
"loss": 0.9737,
"step": 865
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.8439720273017883,
"learning_rate": 4.872745565154424e-05,
"loss": 0.9967,
"step": 870
},
{
"epoch": 0.5692908262849707,
"grad_norm": 0.9340474009513855,
"learning_rate": 4.871114688090992e-05,
"loss": 0.9934,
"step": 875
},
{
"epoch": 0.5725439167208849,
"grad_norm": 0.8820469975471497,
"learning_rate": 4.869473703355273e-05,
"loss": 0.9917,
"step": 880
},
{
"epoch": 0.5757970071567989,
"grad_norm": 0.8724156618118286,
"learning_rate": 4.867822617942514e-05,
"loss": 0.9762,
"step": 885
},
{
"epoch": 0.5790500975927131,
"grad_norm": 0.9085761308670044,
"learning_rate": 4.866161438891022e-05,
"loss": 0.9686,
"step": 890
},
{
"epoch": 0.5823031880286272,
"grad_norm": 0.7215405106544495,
"learning_rate": 4.864490173282128e-05,
"loss": 0.9858,
"step": 895
},
{
"epoch": 0.5855562784645413,
"grad_norm": 1.0854041576385498,
"learning_rate": 4.862808828240164e-05,
"loss": 0.9935,
"step": 900
},
{
"epoch": 0.5888093689004554,
"grad_norm": 0.8779392242431641,
"learning_rate": 4.861117410932429e-05,
"loss": 0.9816,
"step": 905
},
{
"epoch": 0.5920624593363696,
"grad_norm": 1.2866002321243286,
"learning_rate": 4.8594159285691546e-05,
"loss": 0.9818,
"step": 910
},
{
"epoch": 0.5953155497722836,
"grad_norm": 0.7991343140602112,
"learning_rate": 4.8577043884034826e-05,
"loss": 0.9592,
"step": 915
},
{
"epoch": 0.5985686402081978,
"grad_norm": 0.9553494453430176,
"learning_rate": 4.8559827977314254e-05,
"loss": 0.9943,
"step": 920
},
{
"epoch": 0.6018217306441119,
"grad_norm": 1.2053009271621704,
"learning_rate": 4.854251163891843e-05,
"loss": 0.946,
"step": 925
},
{
"epoch": 0.605074821080026,
"grad_norm": 0.744791567325592,
"learning_rate": 4.852509494266405e-05,
"loss": 0.9804,
"step": 930
},
{
"epoch": 0.6083279115159401,
"grad_norm": 1.2371433973312378,
"learning_rate": 4.850757796279563e-05,
"loss": 0.9902,
"step": 935
},
{
"epoch": 0.6115810019518543,
"grad_norm": 0.723250150680542,
"learning_rate": 4.8489960773985174e-05,
"loss": 0.9839,
"step": 940
},
{
"epoch": 0.6148340923877684,
"grad_norm": 0.7003908753395081,
"learning_rate": 4.847224345133188e-05,
"loss": 0.9712,
"step": 945
},
{
"epoch": 0.6180871828236825,
"grad_norm": 0.8090314865112305,
"learning_rate": 4.845442607036176e-05,
"loss": 0.9631,
"step": 950
},
{
"epoch": 0.6213402732595966,
"grad_norm": 0.7971912622451782,
"learning_rate": 4.8436508707027384e-05,
"loss": 0.9722,
"step": 955
},
{
"epoch": 0.6245933636955108,
"grad_norm": 0.7696447968482971,
"learning_rate": 4.841849143770754e-05,
"loss": 0.9712,
"step": 960
},
{
"epoch": 0.6278464541314248,
"grad_norm": 0.9497612714767456,
"learning_rate": 4.840037433920688e-05,
"loss": 0.9653,
"step": 965
},
{
"epoch": 0.631099544567339,
"grad_norm": 1.1326346397399902,
"learning_rate": 4.838215748875562e-05,
"loss": 0.9648,
"step": 970
},
{
"epoch": 0.6343526350032531,
"grad_norm": 0.8858407139778137,
"learning_rate": 4.83638409640092e-05,
"loss": 0.9765,
"step": 975
},
{
"epoch": 0.6376057254391672,
"grad_norm": 0.9079559445381165,
"learning_rate": 4.834542484304795e-05,
"loss": 0.958,
"step": 980
},
{
"epoch": 0.6408588158750813,
"grad_norm": 0.9221760630607605,
"learning_rate": 4.8326909204376776e-05,
"loss": 0.9675,
"step": 985
},
{
"epoch": 0.6441119063109955,
"grad_norm": 0.8072174787521362,
"learning_rate": 4.8308294126924794e-05,
"loss": 0.9745,
"step": 990
},
{
"epoch": 0.6473649967469096,
"grad_norm": 0.9354230165481567,
"learning_rate": 4.828957969004502e-05,
"loss": 0.9581,
"step": 995
},
{
"epoch": 0.6506180871828237,
"grad_norm": 0.8067158460617065,
"learning_rate": 4.827076597351403e-05,
"loss": 0.9669,
"step": 1000
},
{
"epoch": 0.6538711776187378,
"grad_norm": 1.0591189861297607,
"learning_rate": 4.825185305753161e-05,
"loss": 0.9682,
"step": 1005
},
{
"epoch": 0.657124268054652,
"grad_norm": 0.7701990604400635,
"learning_rate": 4.823284102272041e-05,
"loss": 0.9756,
"step": 1010
},
{
"epoch": 0.660377358490566,
"grad_norm": 0.9886049628257751,
"learning_rate": 4.82137299501256e-05,
"loss": 0.9646,
"step": 1015
},
{
"epoch": 0.6636304489264802,
"grad_norm": 0.966618537902832,
"learning_rate": 4.819451992121454e-05,
"loss": 0.9673,
"step": 1020
},
{
"epoch": 0.6668835393623943,
"grad_norm": 0.987940788269043,
"learning_rate": 4.817521101787646e-05,
"loss": 0.9647,
"step": 1025
},
{
"epoch": 0.6701366297983083,
"grad_norm": 0.752627432346344,
"learning_rate": 4.815580332242199e-05,
"loss": 0.9545,
"step": 1030
},
{
"epoch": 0.6733897202342225,
"grad_norm": 1.0263205766677856,
"learning_rate": 4.813629691758299e-05,
"loss": 0.9479,
"step": 1035
},
{
"epoch": 0.6766428106701367,
"grad_norm": 0.8434374332427979,
"learning_rate": 4.811669188651204e-05,
"loss": 0.9747,
"step": 1040
},
{
"epoch": 0.6798959011060507,
"grad_norm": 0.8626881837844849,
"learning_rate": 4.8096988312782174e-05,
"loss": 0.9713,
"step": 1045
},
{
"epoch": 0.6831489915419648,
"grad_norm": 0.8781446814537048,
"learning_rate": 4.8077186280386475e-05,
"loss": 0.964,
"step": 1050
},
{
"epoch": 0.686402081977879,
"grad_norm": 0.8338606953620911,
"learning_rate": 4.8057285873737765e-05,
"loss": 0.9916,
"step": 1055
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.8619135022163391,
"learning_rate": 4.803728717766821e-05,
"loss": 0.9562,
"step": 1060
},
{
"epoch": 0.6929082628497072,
"grad_norm": 0.8325028419494629,
"learning_rate": 4.8017190277428956e-05,
"loss": 0.9494,
"step": 1065
},
{
"epoch": 0.6961613532856213,
"grad_norm": 0.772607684135437,
"learning_rate": 4.799699525868979e-05,
"loss": 0.9783,
"step": 1070
},
{
"epoch": 0.6994144437215355,
"grad_norm": 0.7735521793365479,
"learning_rate": 4.797670220753876e-05,
"loss": 0.966,
"step": 1075
},
{
"epoch": 0.7026675341574495,
"grad_norm": 0.8032121062278748,
"learning_rate": 4.79563112104818e-05,
"loss": 0.9569,
"step": 1080
},
{
"epoch": 0.7059206245933637,
"grad_norm": 0.9248620271682739,
"learning_rate": 4.7935822354442397e-05,
"loss": 0.9676,
"step": 1085
},
{
"epoch": 0.7091737150292778,
"grad_norm": 0.6317049264907837,
"learning_rate": 4.7915235726761154e-05,
"loss": 0.9443,
"step": 1090
},
{
"epoch": 0.7124268054651919,
"grad_norm": 0.9738350510597229,
"learning_rate": 4.789455141519551e-05,
"loss": 0.9693,
"step": 1095
},
{
"epoch": 0.715679895901106,
"grad_norm": 0.7499257922172546,
"learning_rate": 4.7873769507919266e-05,
"loss": 0.958,
"step": 1100
},
{
"epoch": 0.7189329863370202,
"grad_norm": 0.8857749700546265,
"learning_rate": 4.785289009352227e-05,
"loss": 0.9596,
"step": 1105
},
{
"epoch": 0.7221860767729343,
"grad_norm": 0.7081575393676758,
"learning_rate": 4.7831913261010066e-05,
"loss": 0.9454,
"step": 1110
},
{
"epoch": 0.7254391672088484,
"grad_norm": 0.8387717604637146,
"learning_rate": 4.781083909980342e-05,
"loss": 0.9472,
"step": 1115
},
{
"epoch": 0.7286922576447625,
"grad_norm": 0.9755154848098755,
"learning_rate": 4.778966769973802e-05,
"loss": 0.9668,
"step": 1120
},
{
"epoch": 0.7319453480806767,
"grad_norm": 0.7101641893386841,
"learning_rate": 4.7768399151064076e-05,
"loss": 0.9457,
"step": 1125
},
{
"epoch": 0.7351984385165907,
"grad_norm": 0.9372628331184387,
"learning_rate": 4.774703354444591e-05,
"loss": 0.9709,
"step": 1130
},
{
"epoch": 0.7384515289525049,
"grad_norm": 0.9276643991470337,
"learning_rate": 4.7725570970961586e-05,
"loss": 0.9586,
"step": 1135
},
{
"epoch": 0.741704619388419,
"grad_norm": 0.7329192757606506,
"learning_rate": 4.770401152210253e-05,
"loss": 0.9608,
"step": 1140
},
{
"epoch": 0.7449577098243331,
"grad_norm": 0.7759012579917908,
"learning_rate": 4.768235528977314e-05,
"loss": 0.9469,
"step": 1145
},
{
"epoch": 0.7482108002602472,
"grad_norm": 1.2127937078475952,
"learning_rate": 4.766060236629037e-05,
"loss": 0.9542,
"step": 1150
},
{
"epoch": 0.7514638906961614,
"grad_norm": 0.7369085550308228,
"learning_rate": 4.763875284438336e-05,
"loss": 0.9643,
"step": 1155
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.7963067293167114,
"learning_rate": 4.7616806817193024e-05,
"loss": 0.9678,
"step": 1160
},
{
"epoch": 0.7579700715679896,
"grad_norm": 0.7773886919021606,
"learning_rate": 4.759476437827168e-05,
"loss": 0.9603,
"step": 1165
},
{
"epoch": 0.7612231620039037,
"grad_norm": 0.8198060393333435,
"learning_rate": 4.757262562158262e-05,
"loss": 0.9759,
"step": 1170
},
{
"epoch": 0.7644762524398179,
"grad_norm": 0.7127149701118469,
"learning_rate": 4.7550390641499715e-05,
"loss": 0.9244,
"step": 1175
},
{
"epoch": 0.7677293428757319,
"grad_norm": 1.236286997795105,
"learning_rate": 4.7528059532807045e-05,
"loss": 0.9313,
"step": 1180
},
{
"epoch": 0.7709824333116461,
"grad_norm": 0.6795628070831299,
"learning_rate": 4.750563239069845e-05,
"loss": 0.9586,
"step": 1185
},
{
"epoch": 0.7742355237475602,
"grad_norm": 0.8040820956230164,
"learning_rate": 4.7483109310777165e-05,
"loss": 0.9483,
"step": 1190
},
{
"epoch": 0.7774886141834743,
"grad_norm": 0.8001431226730347,
"learning_rate": 4.7460490389055355e-05,
"loss": 0.9408,
"step": 1195
},
{
"epoch": 0.7807417046193884,
"grad_norm": 0.969782292842865,
"learning_rate": 4.743777572195378e-05,
"loss": 0.9778,
"step": 1200
},
{
"epoch": 0.7839947950553026,
"grad_norm": 1.0955541133880615,
"learning_rate": 4.741496540630134e-05,
"loss": 0.9385,
"step": 1205
},
{
"epoch": 0.7872478854912166,
"grad_norm": 0.7429236173629761,
"learning_rate": 4.739205953933464e-05,
"loss": 0.9642,
"step": 1210
},
{
"epoch": 0.7905009759271308,
"grad_norm": 1.0475250482559204,
"learning_rate": 4.736905821869765e-05,
"loss": 0.9437,
"step": 1215
},
{
"epoch": 0.7937540663630449,
"grad_norm": 0.7216660380363464,
"learning_rate": 4.734596154244121e-05,
"loss": 0.9289,
"step": 1220
},
{
"epoch": 0.7970071567989591,
"grad_norm": 0.8584089279174805,
"learning_rate": 4.732276960902267e-05,
"loss": 0.9246,
"step": 1225
},
{
"epoch": 0.8002602472348731,
"grad_norm": 0.8769578337669373,
"learning_rate": 4.7299482517305404e-05,
"loss": 0.9298,
"step": 1230
},
{
"epoch": 0.8035133376707873,
"grad_norm": 0.7453442811965942,
"learning_rate": 4.7276100366558474e-05,
"loss": 0.9491,
"step": 1235
},
{
"epoch": 0.8067664281067014,
"grad_norm": 0.906287431716919,
"learning_rate": 4.7252623256456144e-05,
"loss": 0.9539,
"step": 1240
},
{
"epoch": 0.8100195185426154,
"grad_norm": 1.0656296014785767,
"learning_rate": 4.722905128707749e-05,
"loss": 0.9405,
"step": 1245
},
{
"epoch": 0.8132726089785296,
"grad_norm": 0.6985450983047485,
"learning_rate": 4.720538455890591e-05,
"loss": 0.9369,
"step": 1250
},
{
"epoch": 0.8165256994144438,
"grad_norm": 0.6577023267745972,
"learning_rate": 4.718162317282882e-05,
"loss": 0.9346,
"step": 1255
},
{
"epoch": 0.8197787898503578,
"grad_norm": 0.7832421064376831,
"learning_rate": 4.7157767230137064e-05,
"loss": 0.9256,
"step": 1260
},
{
"epoch": 0.8230318802862719,
"grad_norm": 0.7928493618965149,
"learning_rate": 4.713381683252463e-05,
"loss": 0.9477,
"step": 1265
},
{
"epoch": 0.8262849707221861,
"grad_norm": 0.8775043487548828,
"learning_rate": 4.710977208208812e-05,
"loss": 0.9313,
"step": 1270
},
{
"epoch": 0.8295380611581002,
"grad_norm": 0.7714875936508179,
"learning_rate": 4.708563308132636e-05,
"loss": 0.9469,
"step": 1275
},
{
"epoch": 0.8327911515940143,
"grad_norm": 0.7258083820343018,
"learning_rate": 4.706139993313994e-05,
"loss": 0.9294,
"step": 1280
},
{
"epoch": 0.8360442420299284,
"grad_norm": 0.7745918035507202,
"learning_rate": 4.7037072740830785e-05,
"loss": 0.9365,
"step": 1285
},
{
"epoch": 0.8392973324658426,
"grad_norm": 0.7213959097862244,
"learning_rate": 4.701265160810172e-05,
"loss": 0.947,
"step": 1290
},
{
"epoch": 0.8425504229017566,
"grad_norm": 0.825713038444519,
"learning_rate": 4.6988136639056025e-05,
"loss": 0.9404,
"step": 1295
},
{
"epoch": 0.8458035133376708,
"grad_norm": 0.6750174164772034,
"learning_rate": 4.696352793819698e-05,
"loss": 0.9364,
"step": 1300
},
{
"epoch": 0.8490566037735849,
"grad_norm": 0.8314560055732727,
"learning_rate": 4.693882561042743e-05,
"loss": 0.9521,
"step": 1305
},
{
"epoch": 0.852309694209499,
"grad_norm": 1.0009961128234863,
"learning_rate": 4.6914029761049357e-05,
"loss": 0.9297,
"step": 1310
},
{
"epoch": 0.8555627846454131,
"grad_norm": 0.7527256011962891,
"learning_rate": 4.688914049576337e-05,
"loss": 0.9269,
"step": 1315
},
{
"epoch": 0.8588158750813273,
"grad_norm": 0.9169411659240723,
"learning_rate": 4.686415792066833e-05,
"loss": 0.9312,
"step": 1320
},
{
"epoch": 0.8620689655172413,
"grad_norm": 0.9165216088294983,
"learning_rate": 4.683908214226084e-05,
"loss": 0.9524,
"step": 1325
},
{
"epoch": 0.8653220559531555,
"grad_norm": 0.9357953071594238,
"learning_rate": 4.6813913267434835e-05,
"loss": 0.9245,
"step": 1330
},
{
"epoch": 0.8685751463890696,
"grad_norm": 0.6473081707954407,
"learning_rate": 4.678865140348108e-05,
"loss": 0.9584,
"step": 1335
},
{
"epoch": 0.8718282368249838,
"grad_norm": 0.884191632270813,
"learning_rate": 4.676329665808677e-05,
"loss": 0.9569,
"step": 1340
},
{
"epoch": 0.8750813272608978,
"grad_norm": 1.0534435510635376,
"learning_rate": 4.673784913933499e-05,
"loss": 0.9178,
"step": 1345
},
{
"epoch": 0.878334417696812,
"grad_norm": 0.8140066266059875,
"learning_rate": 4.6712308955704346e-05,
"loss": 0.9536,
"step": 1350
},
{
"epoch": 0.8815875081327261,
"grad_norm": 0.71702641248703,
"learning_rate": 4.668667621606845e-05,
"loss": 0.947,
"step": 1355
},
{
"epoch": 0.8848405985686402,
"grad_norm": 0.6529531478881836,
"learning_rate": 4.666095102969544e-05,
"loss": 0.9107,
"step": 1360
},
{
"epoch": 0.8880936890045543,
"grad_norm": 0.9059852957725525,
"learning_rate": 4.6635133506247585e-05,
"loss": 0.9399,
"step": 1365
},
{
"epoch": 0.8913467794404685,
"grad_norm": 0.8972651958465576,
"learning_rate": 4.660922375578073e-05,
"loss": 0.9511,
"step": 1370
},
{
"epoch": 0.8945998698763825,
"grad_norm": 1.0316717624664307,
"learning_rate": 4.658322188874388e-05,
"loss": 0.9335,
"step": 1375
},
{
"epoch": 0.8978529603122967,
"grad_norm": 0.7475149035453796,
"learning_rate": 4.6557128015978726e-05,
"loss": 0.9262,
"step": 1380
},
{
"epoch": 0.9011060507482108,
"grad_norm": 1.035979986190796,
"learning_rate": 4.653094224871916e-05,
"loss": 0.9115,
"step": 1385
},
{
"epoch": 0.904359141184125,
"grad_norm": 0.8210706114768982,
"learning_rate": 4.650466469859079e-05,
"loss": 0.9535,
"step": 1390
},
{
"epoch": 0.907612231620039,
"grad_norm": 0.9931228160858154,
"learning_rate": 4.647829547761053e-05,
"loss": 0.9335,
"step": 1395
},
{
"epoch": 0.9108653220559532,
"grad_norm": 0.7681549191474915,
"learning_rate": 4.6451834698186e-05,
"loss": 0.9434,
"step": 1400
},
{
"epoch": 0.9141184124918673,
"grad_norm": 0.7461596727371216,
"learning_rate": 4.642528247311518e-05,
"loss": 0.9487,
"step": 1405
},
{
"epoch": 0.9173715029277814,
"grad_norm": 1.4867486953735352,
"learning_rate": 4.6398638915585835e-05,
"loss": 0.9074,
"step": 1410
},
{
"epoch": 0.9206245933636955,
"grad_norm": 0.890620231628418,
"learning_rate": 4.637190413917506e-05,
"loss": 0.9467,
"step": 1415
},
{
"epoch": 0.9238776837996097,
"grad_norm": 0.6205281615257263,
"learning_rate": 4.634507825784882e-05,
"loss": 0.9242,
"step": 1420
},
{
"epoch": 0.9271307742355237,
"grad_norm": 0.8957470655441284,
"learning_rate": 4.631816138596145e-05,
"loss": 0.94,
"step": 1425
},
{
"epoch": 0.9303838646714379,
"grad_norm": 0.8642396330833435,
"learning_rate": 4.629115363825514e-05,
"loss": 0.9142,
"step": 1430
},
{
"epoch": 0.933636955107352,
"grad_norm": 0.6721086502075195,
"learning_rate": 4.626405512985948e-05,
"loss": 0.9205,
"step": 1435
},
{
"epoch": 0.936890045543266,
"grad_norm": 0.8930765986442566,
"learning_rate": 4.623686597629098e-05,
"loss": 0.9235,
"step": 1440
},
{
"epoch": 0.9401431359791802,
"grad_norm": 0.9480865597724915,
"learning_rate": 4.62095862934525e-05,
"loss": 0.9309,
"step": 1445
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.9130436778068542,
"learning_rate": 4.618221619763287e-05,
"loss": 0.9257,
"step": 1450
},
{
"epoch": 0.9466493168510085,
"grad_norm": 0.63996821641922,
"learning_rate": 4.6154755805506294e-05,
"loss": 0.9364,
"step": 1455
},
{
"epoch": 0.9499024072869225,
"grad_norm": 0.786276638507843,
"learning_rate": 4.612720523413193e-05,
"loss": 0.9389,
"step": 1460
},
{
"epoch": 0.9531554977228367,
"grad_norm": 0.8122700452804565,
"learning_rate": 4.609956460095332e-05,
"loss": 0.9296,
"step": 1465
},
{
"epoch": 0.9564085881587508,
"grad_norm": 1.0054434537887573,
"learning_rate": 4.607183402379794e-05,
"loss": 0.9118,
"step": 1470
},
{
"epoch": 0.9596616785946649,
"grad_norm": 0.9399415850639343,
"learning_rate": 4.6044013620876706e-05,
"loss": 0.9311,
"step": 1475
},
{
"epoch": 0.962914769030579,
"grad_norm": 0.6693314909934998,
"learning_rate": 4.60161035107834e-05,
"loss": 0.9322,
"step": 1480
},
{
"epoch": 0.9661678594664932,
"grad_norm": 0.7549735903739929,
"learning_rate": 4.598810381249425e-05,
"loss": 0.9246,
"step": 1485
},
{
"epoch": 0.9694209499024072,
"grad_norm": 0.8314823508262634,
"learning_rate": 4.596001464536737e-05,
"loss": 0.9335,
"step": 1490
},
{
"epoch": 0.9726740403383214,
"grad_norm": 0.7478086948394775,
"learning_rate": 4.593183612914225e-05,
"loss": 0.9341,
"step": 1495
},
{
"epoch": 0.9759271307742355,
"grad_norm": 0.9777085185050964,
"learning_rate": 4.5903568383939284e-05,
"loss": 0.9323,
"step": 1500
},
{
"epoch": 0.9791802212101497,
"grad_norm": 0.893374502658844,
"learning_rate": 4.587521153025922e-05,
"loss": 0.939,
"step": 1505
},
{
"epoch": 0.9824333116460637,
"grad_norm": 0.6938668489456177,
"learning_rate": 4.584676568898267e-05,
"loss": 0.9437,
"step": 1510
},
{
"epoch": 0.9856864020819779,
"grad_norm": 0.6903214454650879,
"learning_rate": 4.5818230981369584e-05,
"loss": 0.9332,
"step": 1515
},
{
"epoch": 0.988939492517892,
"grad_norm": 0.817034125328064,
"learning_rate": 4.5789607529058715e-05,
"loss": 0.9375,
"step": 1520
},
{
"epoch": 0.9921925829538061,
"grad_norm": 0.8222942352294922,
"learning_rate": 4.5760895454067154e-05,
"loss": 0.9316,
"step": 1525
},
{
"epoch": 0.9954456733897202,
"grad_norm": 0.7549692392349243,
"learning_rate": 4.5732094878789756e-05,
"loss": 0.9221,
"step": 1530
},
{
"epoch": 0.9986987638256344,
"grad_norm": 0.8544319868087769,
"learning_rate": 4.570320592599863e-05,
"loss": 0.9287,
"step": 1535
},
{
"epoch": 1.0,
"eval_f1": 0.7910057808991992,
"eval_loss": 0.462646484375,
"eval_precision": 0.7940469727119374,
"eval_recall": 0.7896973937143991,
"eval_runtime": 247.1562,
"eval_samples_per_second": 1591.847,
"eval_steps_per_second": 1.558,
"step": 1537
},
{
"epoch": 1.0019518542615484,
"grad_norm": 0.7457589507102966,
"learning_rate": 4.567422871884265e-05,
"loss": 0.9279,
"step": 1540
},
{
"epoch": 1.0052049446974627,
"grad_norm": 0.8609625697135925,
"learning_rate": 4.564516338084688e-05,
"loss": 0.8765,
"step": 1545
},
{
"epoch": 1.0084580351333767,
"grad_norm": 0.8822636008262634,
"learning_rate": 4.561601003591208e-05,
"loss": 0.8427,
"step": 1550
},
{
"epoch": 1.0117111255692908,
"grad_norm": 0.7266067266464233,
"learning_rate": 4.558676880831417e-05,
"loss": 0.8828,
"step": 1555
},
{
"epoch": 1.014964216005205,
"grad_norm": 0.6970102787017822,
"learning_rate": 4.555743982270369e-05,
"loss": 0.8842,
"step": 1560
},
{
"epoch": 1.018217306441119,
"grad_norm": 0.6802201867103577,
"learning_rate": 4.5528023204105306e-05,
"loss": 0.872,
"step": 1565
},
{
"epoch": 1.0214703968770331,
"grad_norm": 0.7830452919006348,
"learning_rate": 4.549851907791722e-05,
"loss": 0.8624,
"step": 1570
},
{
"epoch": 1.0247234873129474,
"grad_norm": 0.6845102906227112,
"learning_rate": 4.5468927569910663e-05,
"loss": 0.8744,
"step": 1575
},
{
"epoch": 1.0279765777488614,
"grad_norm": 0.8832181692123413,
"learning_rate": 4.5439248806229386e-05,
"loss": 0.8722,
"step": 1580
},
{
"epoch": 1.0312296681847755,
"grad_norm": 0.7359802722930908,
"learning_rate": 4.5409482913389065e-05,
"loss": 0.8567,
"step": 1585
},
{
"epoch": 1.0344827586206897,
"grad_norm": 0.7686721086502075,
"learning_rate": 4.5379630018276834e-05,
"loss": 0.8509,
"step": 1590
},
{
"epoch": 1.0377358490566038,
"grad_norm": 0.77400141954422,
"learning_rate": 4.534969024815066e-05,
"loss": 0.8676,
"step": 1595
},
{
"epoch": 1.0409889394925178,
"grad_norm": 0.8024744987487793,
"learning_rate": 4.531966373063886e-05,
"loss": 0.8772,
"step": 1600
},
{
"epoch": 1.044242029928432,
"grad_norm": 0.7155640721321106,
"learning_rate": 4.528955059373956e-05,
"loss": 0.8608,
"step": 1605
},
{
"epoch": 1.047495120364346,
"grad_norm": 0.8553564548492432,
"learning_rate": 4.52593509658201e-05,
"loss": 0.8614,
"step": 1610
},
{
"epoch": 1.0507482108002602,
"grad_norm": 0.6926222443580627,
"learning_rate": 4.522906497561655e-05,
"loss": 0.8582,
"step": 1615
},
{
"epoch": 1.0540013012361744,
"grad_norm": 0.8300968408584595,
"learning_rate": 4.519869275223309e-05,
"loss": 0.8838,
"step": 1620
},
{
"epoch": 1.0572543916720885,
"grad_norm": 0.8907480835914612,
"learning_rate": 4.516823442514153e-05,
"loss": 0.8656,
"step": 1625
},
{
"epoch": 1.0605074821080025,
"grad_norm": 1.035863995552063,
"learning_rate": 4.513769012418071e-05,
"loss": 0.8814,
"step": 1630
},
{
"epoch": 1.0637605725439168,
"grad_norm": 0.9308491945266724,
"learning_rate": 4.510705997955596e-05,
"loss": 0.8831,
"step": 1635
},
{
"epoch": 1.0670136629798308,
"grad_norm": 1.0290710926055908,
"learning_rate": 4.507634412183856e-05,
"loss": 0.8566,
"step": 1640
},
{
"epoch": 1.070266753415745,
"grad_norm": 0.9163823127746582,
"learning_rate": 4.504554268196516e-05,
"loss": 0.8646,
"step": 1645
},
{
"epoch": 1.073519843851659,
"grad_norm": 0.7528260946273804,
"learning_rate": 4.5014655791237245e-05,
"loss": 0.8681,
"step": 1650
},
{
"epoch": 1.0767729342875731,
"grad_norm": 0.9018992781639099,
"learning_rate": 4.498368358132055e-05,
"loss": 0.8667,
"step": 1655
},
{
"epoch": 1.0800260247234874,
"grad_norm": 1.000990390777588,
"learning_rate": 4.4952626184244504e-05,
"loss": 0.8627,
"step": 1660
},
{
"epoch": 1.0832791151594015,
"grad_norm": 1.1555023193359375,
"learning_rate": 4.492148373240171e-05,
"loss": 0.8488,
"step": 1665
},
{
"epoch": 1.0865322055953155,
"grad_norm": 0.9759275913238525,
"learning_rate": 4.4890256358547304e-05,
"loss": 0.8775,
"step": 1670
},
{
"epoch": 1.0897852960312298,
"grad_norm": 0.7439780235290527,
"learning_rate": 4.485894419579846e-05,
"loss": 0.8758,
"step": 1675
},
{
"epoch": 1.0930383864671438,
"grad_norm": 0.8394938111305237,
"learning_rate": 4.482754737763378e-05,
"loss": 0.8797,
"step": 1680
},
{
"epoch": 1.0962914769030578,
"grad_norm": 0.8299522399902344,
"learning_rate": 4.4796066037892734e-05,
"loss": 0.864,
"step": 1685
},
{
"epoch": 1.099544567338972,
"grad_norm": 0.8585712909698486,
"learning_rate": 4.4764500310775116e-05,
"loss": 0.8586,
"step": 1690
},
{
"epoch": 1.1027976577748861,
"grad_norm": 1.0859423875808716,
"learning_rate": 4.473285033084043e-05,
"loss": 0.8773,
"step": 1695
},
{
"epoch": 1.1060507482108002,
"grad_norm": 0.7827959060668945,
"learning_rate": 4.4701116233007314e-05,
"loss": 0.8423,
"step": 1700
},
{
"epoch": 1.1093038386467144,
"grad_norm": 0.7498010993003845,
"learning_rate": 4.466929815255304e-05,
"loss": 0.884,
"step": 1705
},
{
"epoch": 1.1125569290826285,
"grad_norm": 0.7543908357620239,
"learning_rate": 4.4637396225112846e-05,
"loss": 0.8606,
"step": 1710
},
{
"epoch": 1.1158100195185425,
"grad_norm": 1.3613898754119873,
"learning_rate": 4.460541058667942e-05,
"loss": 0.8909,
"step": 1715
},
{
"epoch": 1.1190631099544568,
"grad_norm": 0.8409460783004761,
"learning_rate": 4.457334137360226e-05,
"loss": 0.8892,
"step": 1720
},
{
"epoch": 1.1223162003903708,
"grad_norm": 0.9072450995445251,
"learning_rate": 4.4541188722587165e-05,
"loss": 0.8714,
"step": 1725
},
{
"epoch": 1.1255692908262849,
"grad_norm": 1.02306067943573,
"learning_rate": 4.450895277069561e-05,
"loss": 0.8813,
"step": 1730
},
{
"epoch": 1.1288223812621991,
"grad_norm": 1.0199263095855713,
"learning_rate": 4.4476633655344144e-05,
"loss": 0.8693,
"step": 1735
},
{
"epoch": 1.1320754716981132,
"grad_norm": 0.7447525262832642,
"learning_rate": 4.444423151430386e-05,
"loss": 0.8894,
"step": 1740
},
{
"epoch": 1.1353285621340272,
"grad_norm": 1.062179446220398,
"learning_rate": 4.4411746485699744e-05,
"loss": 0.8425,
"step": 1745
},
{
"epoch": 1.1385816525699415,
"grad_norm": 0.7509242296218872,
"learning_rate": 4.437917870801015e-05,
"loss": 0.8666,
"step": 1750
},
{
"epoch": 1.1418347430058555,
"grad_norm": 1.1955047845840454,
"learning_rate": 4.434652832006616e-05,
"loss": 0.8798,
"step": 1755
},
{
"epoch": 1.1450878334417696,
"grad_norm": 1.1089417934417725,
"learning_rate": 4.431379546105101e-05,
"loss": 0.8808,
"step": 1760
},
{
"epoch": 1.1483409238776838,
"grad_norm": 0.7296579480171204,
"learning_rate": 4.4280980270499494e-05,
"loss": 0.854,
"step": 1765
},
{
"epoch": 1.1515940143135979,
"grad_norm": 1.0274302959442139,
"learning_rate": 4.424808288829739e-05,
"loss": 0.8775,
"step": 1770
},
{
"epoch": 1.1548471047495121,
"grad_norm": 0.8249827027320862,
"learning_rate": 4.421510345468082e-05,
"loss": 0.8825,
"step": 1775
},
{
"epoch": 1.1581001951854262,
"grad_norm": 0.814564049243927,
"learning_rate": 4.4182042110235686e-05,
"loss": 0.8354,
"step": 1780
},
{
"epoch": 1.1613532856213402,
"grad_norm": 0.8738640546798706,
"learning_rate": 4.414889899589709e-05,
"loss": 0.8667,
"step": 1785
},
{
"epoch": 1.1646063760572545,
"grad_norm": 0.873928427696228,
"learning_rate": 4.411567425294867e-05,
"loss": 0.8589,
"step": 1790
},
{
"epoch": 1.1678594664931685,
"grad_norm": 1.0771477222442627,
"learning_rate": 4.408236802302203e-05,
"loss": 0.8677,
"step": 1795
},
{
"epoch": 1.1711125569290826,
"grad_norm": 1.026843786239624,
"learning_rate": 4.404898044809618e-05,
"loss": 0.8613,
"step": 1800
},
{
"epoch": 1.1743656473649968,
"grad_norm": 1.2807365655899048,
"learning_rate": 4.401551167049686e-05,
"loss": 0.8612,
"step": 1805
},
{
"epoch": 1.1776187378009109,
"grad_norm": 1.086053729057312,
"learning_rate": 4.398196183289595e-05,
"loss": 0.8679,
"step": 1810
},
{
"epoch": 1.180871828236825,
"grad_norm": 1.2245922088623047,
"learning_rate": 4.394833107831091e-05,
"loss": 0.8666,
"step": 1815
},
{
"epoch": 1.1841249186727392,
"grad_norm": 0.788972020149231,
"learning_rate": 4.3914619550104125e-05,
"loss": 0.8549,
"step": 1820
},
{
"epoch": 1.1873780091086532,
"grad_norm": 0.7560495734214783,
"learning_rate": 4.388082739198229e-05,
"loss": 0.8689,
"step": 1825
},
{
"epoch": 1.1906310995445673,
"grad_norm": 0.9753955006599426,
"learning_rate": 4.3846954747995825e-05,
"loss": 0.8676,
"step": 1830
},
{
"epoch": 1.1938841899804815,
"grad_norm": 0.7910217642784119,
"learning_rate": 4.381300176253825e-05,
"loss": 0.872,
"step": 1835
},
{
"epoch": 1.1971372804163956,
"grad_norm": 0.9588011503219604,
"learning_rate": 4.377896858034557e-05,
"loss": 0.8903,
"step": 1840
},
{
"epoch": 1.2003903708523098,
"grad_norm": 0.9886934757232666,
"learning_rate": 4.374485534649562e-05,
"loss": 0.879,
"step": 1845
},
{
"epoch": 1.2036434612882239,
"grad_norm": 0.896848738193512,
"learning_rate": 4.371066220640754e-05,
"loss": 0.854,
"step": 1850
},
{
"epoch": 1.206896551724138,
"grad_norm": 1.7082849740982056,
"learning_rate": 4.367638930584105e-05,
"loss": 0.8877,
"step": 1855
},
{
"epoch": 1.2101496421600522,
"grad_norm": 1.307518482208252,
"learning_rate": 4.36420367908959e-05,
"loss": 0.8637,
"step": 1860
},
{
"epoch": 1.2134027325959662,
"grad_norm": 0.9649641513824463,
"learning_rate": 4.3607604808011213e-05,
"loss": 0.8644,
"step": 1865
},
{
"epoch": 1.2166558230318802,
"grad_norm": 0.958816409111023,
"learning_rate": 4.357309350396488e-05,
"loss": 0.8771,
"step": 1870
},
{
"epoch": 1.2199089134677945,
"grad_norm": 0.7665415406227112,
"learning_rate": 4.353850302587291e-05,
"loss": 0.8559,
"step": 1875
},
{
"epoch": 1.2231620039037086,
"grad_norm": 0.8145641088485718,
"learning_rate": 4.3503833521188844e-05,
"loss": 0.8776,
"step": 1880
},
{
"epoch": 1.2264150943396226,
"grad_norm": 1.0663881301879883,
"learning_rate": 4.346908513770306e-05,
"loss": 0.8643,
"step": 1885
},
{
"epoch": 1.2296681847755369,
"grad_norm": 0.7401409149169922,
"learning_rate": 4.343425802354222e-05,
"loss": 0.8646,
"step": 1890
},
{
"epoch": 1.232921275211451,
"grad_norm": 0.7239570021629333,
"learning_rate": 4.3399352327168595e-05,
"loss": 0.8885,
"step": 1895
},
{
"epoch": 1.236174365647365,
"grad_norm": 1.0525251626968384,
"learning_rate": 4.3364368197379426e-05,
"loss": 0.8817,
"step": 1900
},
{
"epoch": 1.2394274560832792,
"grad_norm": 0.8934289813041687,
"learning_rate": 4.33293057833063e-05,
"loss": 0.8699,
"step": 1905
},
{
"epoch": 1.2426805465191932,
"grad_norm": 0.8614199757575989,
"learning_rate": 4.329416523441454e-05,
"loss": 0.866,
"step": 1910
},
{
"epoch": 1.2459336369551073,
"grad_norm": 0.884955644607544,
"learning_rate": 4.3258946700502535e-05,
"loss": 0.8641,
"step": 1915
},
{
"epoch": 1.2491867273910215,
"grad_norm": 0.8655734062194824,
"learning_rate": 4.322365033170109e-05,
"loss": 0.8393,
"step": 1920
},
{
"epoch": 1.2524398178269356,
"grad_norm": 1.0718590021133423,
"learning_rate": 4.318827627847284e-05,
"loss": 0.8788,
"step": 1925
},
{
"epoch": 1.2556929082628496,
"grad_norm": 0.9467219710350037,
"learning_rate": 4.315282469161156e-05,
"loss": 0.8758,
"step": 1930
},
{
"epoch": 1.258945998698764,
"grad_norm": 1.0598018169403076,
"learning_rate": 4.311729572224153e-05,
"loss": 0.8872,
"step": 1935
},
{
"epoch": 1.262199089134678,
"grad_norm": 0.7586490511894226,
"learning_rate": 4.308168952181691e-05,
"loss": 0.8749,
"step": 1940
},
{
"epoch": 1.265452179570592,
"grad_norm": 0.8791137933731079,
"learning_rate": 4.304600624212109e-05,
"loss": 0.8833,
"step": 1945
},
{
"epoch": 1.2687052700065062,
"grad_norm": 1.0280482769012451,
"learning_rate": 4.3017404223497385e-05,
"loss": 0.893,
"step": 1950
},
{
"epoch": 1.2719583604424203,
"grad_norm": 0.8759311437606812,
"learning_rate": 4.298158258465592e-05,
"loss": 0.8833,
"step": 1955
},
{
"epoch": 1.2752114508783343,
"grad_norm": 0.8623502850532532,
"learning_rate": 4.2945684293282685e-05,
"loss": 0.8533,
"step": 1960
},
{
"epoch": 1.2784645413142486,
"grad_norm": 0.9812124967575073,
"learning_rate": 4.290970950240617e-05,
"loss": 0.8832,
"step": 1965
},
{
"epoch": 1.2817176317501626,
"grad_norm": 0.8114174008369446,
"learning_rate": 4.2873658365381026e-05,
"loss": 0.8657,
"step": 1970
},
{
"epoch": 1.2849707221860767,
"grad_norm": 0.7681922912597656,
"learning_rate": 4.2837531035887305e-05,
"loss": 0.8563,
"step": 1975
},
{
"epoch": 1.288223812621991,
"grad_norm": 0.9911778569221497,
"learning_rate": 4.280132766792989e-05,
"loss": 0.8401,
"step": 1980
},
{
"epoch": 1.291476903057905,
"grad_norm": 0.7618448138237,
"learning_rate": 4.276504841583778e-05,
"loss": 0.8727,
"step": 1985
},
{
"epoch": 1.294729993493819,
"grad_norm": 0.7748595476150513,
"learning_rate": 4.2728693434263476e-05,
"loss": 0.8726,
"step": 1990
},
{
"epoch": 1.2979830839297333,
"grad_norm": 0.995187520980835,
"learning_rate": 4.269226287818228e-05,
"loss": 0.8606,
"step": 1995
},
{
"epoch": 1.3012361743656473,
"grad_norm": 0.9184800386428833,
"learning_rate": 4.2655756902891665e-05,
"loss": 0.8881,
"step": 2000
},
{
"epoch": 1.3044892648015614,
"grad_norm": 0.6605210304260254,
"learning_rate": 4.261917566401061e-05,
"loss": 0.8452,
"step": 2005
},
{
"epoch": 1.3077423552374756,
"grad_norm": 0.9930521249771118,
"learning_rate": 4.258251931747893e-05,
"loss": 0.8661,
"step": 2010
},
{
"epoch": 1.3109954456733897,
"grad_norm": 0.6971027255058289,
"learning_rate": 4.25457880195566e-05,
"loss": 0.8607,
"step": 2015
},
{
"epoch": 1.3142485361093037,
"grad_norm": 0.8052083253860474,
"learning_rate": 4.250898192682311e-05,
"loss": 0.8407,
"step": 2020
},
{
"epoch": 1.317501626545218,
"grad_norm": 0.7318537831306458,
"learning_rate": 4.247210119617679e-05,
"loss": 0.8703,
"step": 2025
},
{
"epoch": 1.320754716981132,
"grad_norm": 1.0614877939224243,
"learning_rate": 4.243514598483412e-05,
"loss": 0.854,
"step": 2030
},
{
"epoch": 1.3240078074170463,
"grad_norm": 1.2773613929748535,
"learning_rate": 4.23981164503291e-05,
"loss": 0.8728,
"step": 2035
},
{
"epoch": 1.3272608978529603,
"grad_norm": 1.41408371925354,
"learning_rate": 4.236101275051256e-05,
"loss": 0.859,
"step": 2040
},
{
"epoch": 1.3305139882888743,
"grad_norm": 0.7571334838867188,
"learning_rate": 4.232383504355147e-05,
"loss": 0.8588,
"step": 2045
},
{
"epoch": 1.3337670787247886,
"grad_norm": 0.7090466618537903,
"learning_rate": 4.228658348792828e-05,
"loss": 0.8672,
"step": 2050
},
{
"epoch": 1.3370201691607027,
"grad_norm": 0.826134204864502,
"learning_rate": 4.224925824244025e-05,
"loss": 0.8552,
"step": 2055
},
{
"epoch": 1.340273259596617,
"grad_norm": 0.8876454830169678,
"learning_rate": 4.2211859466198785e-05,
"loss": 0.8733,
"step": 2060
},
{
"epoch": 1.343526350032531,
"grad_norm": 0.7836646437644958,
"learning_rate": 4.217438731862871e-05,
"loss": 0.8643,
"step": 2065
},
{
"epoch": 1.346779440468445,
"grad_norm": 0.795116662979126,
"learning_rate": 4.213684195946762e-05,
"loss": 0.8759,
"step": 2070
},
{
"epoch": 1.3500325309043593,
"grad_norm": 0.9851782321929932,
"learning_rate": 4.2099223548765224e-05,
"loss": 0.872,
"step": 2075
},
{
"epoch": 1.3532856213402733,
"grad_norm": 0.9454843997955322,
"learning_rate": 4.206153224688264e-05,
"loss": 0.8709,
"step": 2080
},
{
"epoch": 1.3565387117761873,
"grad_norm": 0.7972314953804016,
"learning_rate": 4.202376821449167e-05,
"loss": 0.881,
"step": 2085
},
{
"epoch": 1.3597918022121016,
"grad_norm": 0.7645969390869141,
"learning_rate": 4.1985931612574186e-05,
"loss": 0.8729,
"step": 2090
},
{
"epoch": 1.3630448926480156,
"grad_norm": 1.1820120811462402,
"learning_rate": 4.194802260242141e-05,
"loss": 0.8556,
"step": 2095
},
{
"epoch": 1.3662979830839297,
"grad_norm": 0.9157008528709412,
"learning_rate": 4.191004134563322e-05,
"loss": 0.8721,
"step": 2100
},
{
"epoch": 1.369551073519844,
"grad_norm": 0.8286409974098206,
"learning_rate": 4.187198800411748e-05,
"loss": 0.8756,
"step": 2105
},
{
"epoch": 1.372804163955758,
"grad_norm": 0.8742622137069702,
"learning_rate": 4.183386274008932e-05,
"loss": 0.8592,
"step": 2110
},
{
"epoch": 1.376057254391672,
"grad_norm": 0.8968034386634827,
"learning_rate": 4.1795665716070474e-05,
"loss": 0.8641,
"step": 2115
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.8291420340538025,
"learning_rate": 4.1757397094888594e-05,
"loss": 0.8529,
"step": 2120
},
{
"epoch": 1.3825634352635003,
"grad_norm": 0.919009268283844,
"learning_rate": 4.1719057039676515e-05,
"loss": 0.8636,
"step": 2125
},
{
"epoch": 1.3858165256994144,
"grad_norm": 1.0421229600906372,
"learning_rate": 4.168064571387159e-05,
"loss": 0.8681,
"step": 2130
},
{
"epoch": 1.3890696161353286,
"grad_norm": 0.7388564944267273,
"learning_rate": 4.1642163281214984e-05,
"loss": 0.8513,
"step": 2135
},
{
"epoch": 1.3923227065712427,
"grad_norm": 0.6921651363372803,
"learning_rate": 4.160360990575099e-05,
"loss": 0.8723,
"step": 2140
},
{
"epoch": 1.3955757970071567,
"grad_norm": 0.7668315768241882,
"learning_rate": 4.156498575182633e-05,
"loss": 0.8621,
"step": 2145
},
{
"epoch": 1.398828887443071,
"grad_norm": 0.7497116327285767,
"learning_rate": 4.152629098408939e-05,
"loss": 0.8604,
"step": 2150
},
{
"epoch": 1.402081977878985,
"grad_norm": 0.7256556749343872,
"learning_rate": 4.1487525767489635e-05,
"loss": 0.8638,
"step": 2155
},
{
"epoch": 1.405335068314899,
"grad_norm": 1.1155390739440918,
"learning_rate": 4.144869026727681e-05,
"loss": 0.8547,
"step": 2160
},
{
"epoch": 1.4085881587508133,
"grad_norm": 0.9044195413589478,
"learning_rate": 4.140978464900025e-05,
"loss": 0.8792,
"step": 2165
},
{
"epoch": 1.4118412491867274,
"grad_norm": 0.7881206274032593,
"learning_rate": 4.137080907850823e-05,
"loss": 0.874,
"step": 2170
},
{
"epoch": 1.4150943396226414,
"grad_norm": 0.851743757724762,
"learning_rate": 4.13317637219472e-05,
"loss": 0.8551,
"step": 2175
},
{
"epoch": 1.4183474300585557,
"grad_norm": 0.8619376420974731,
"learning_rate": 4.129264874576111e-05,
"loss": 0.8757,
"step": 2180
},
{
"epoch": 1.4216005204944697,
"grad_norm": 1.2099318504333496,
"learning_rate": 4.125346431669065e-05,
"loss": 0.8567,
"step": 2185
},
{
"epoch": 1.4248536109303838,
"grad_norm": 0.8172369599342346,
"learning_rate": 4.121421060177263e-05,
"loss": 0.8625,
"step": 2190
},
{
"epoch": 1.428106701366298,
"grad_norm": 1.1485086679458618,
"learning_rate": 4.1174887768339164e-05,
"loss": 0.8681,
"step": 2195
},
{
"epoch": 1.431359791802212,
"grad_norm": 0.8006755709648132,
"learning_rate": 4.113549598401704e-05,
"loss": 0.8657,
"step": 2200
},
{
"epoch": 1.434612882238126,
"grad_norm": 0.7858587503433228,
"learning_rate": 4.1096035416726966e-05,
"loss": 0.8681,
"step": 2205
},
{
"epoch": 1.4378659726740404,
"grad_norm": 1.0397981405258179,
"learning_rate": 4.105650623468284e-05,
"loss": 0.871,
"step": 2210
},
{
"epoch": 1.4411190631099544,
"grad_norm": 1.409725546836853,
"learning_rate": 4.101690860639108e-05,
"loss": 0.8525,
"step": 2215
},
{
"epoch": 1.4443721535458685,
"grad_norm": 1.0374292135238647,
"learning_rate": 4.097724270064988e-05,
"loss": 0.8561,
"step": 2220
},
{
"epoch": 1.4476252439817827,
"grad_norm": 1.10367751121521,
"learning_rate": 4.0937508686548455e-05,
"loss": 0.8608,
"step": 2225
},
{
"epoch": 1.4508783344176968,
"grad_norm": 0.9354111552238464,
"learning_rate": 4.089770673346639e-05,
"loss": 0.8556,
"step": 2230
},
{
"epoch": 1.4541314248536108,
"grad_norm": 0.7732600569725037,
"learning_rate": 4.085783701107288e-05,
"loss": 0.8664,
"step": 2235
},
{
"epoch": 1.457384515289525,
"grad_norm": 0.7464646697044373,
"learning_rate": 4.0817899689325975e-05,
"loss": 0.8544,
"step": 2240
},
{
"epoch": 1.460637605725439,
"grad_norm": 0.7917648553848267,
"learning_rate": 4.077789493847194e-05,
"loss": 0.849,
"step": 2245
},
{
"epoch": 1.4638906961613534,
"grad_norm": 0.8593052625656128,
"learning_rate": 4.073782292904445e-05,
"loss": 0.905,
"step": 2250
},
{
"epoch": 1.4671437865972674,
"grad_norm": 0.7432965636253357,
"learning_rate": 4.0697683831863877e-05,
"loss": 0.8606,
"step": 2255
},
{
"epoch": 1.4703968770331814,
"grad_norm": 1.0467164516448975,
"learning_rate": 4.065747781803662e-05,
"loss": 0.8733,
"step": 2260
},
{
"epoch": 1.4736499674690957,
"grad_norm": 0.8533846735954285,
"learning_rate": 4.06172050589543e-05,
"loss": 0.8411,
"step": 2265
},
{
"epoch": 1.4769030579050098,
"grad_norm": 0.7896531224250793,
"learning_rate": 4.057686572629307e-05,
"loss": 0.8732,
"step": 2270
},
{
"epoch": 1.480156148340924,
"grad_norm": 0.7728810906410217,
"learning_rate": 4.053645999201287e-05,
"loss": 0.8822,
"step": 2275
},
{
"epoch": 1.483409238776838,
"grad_norm": 0.791527271270752,
"learning_rate": 4.0495988028356725e-05,
"loss": 0.8692,
"step": 2280
},
{
"epoch": 1.486662329212752,
"grad_norm": 1.7369199991226196,
"learning_rate": 4.0455450007849945e-05,
"loss": 0.878,
"step": 2285
},
{
"epoch": 1.4899154196486664,
"grad_norm": 0.8174150586128235,
"learning_rate": 4.041484610329945e-05,
"loss": 0.8843,
"step": 2290
},
{
"epoch": 1.4931685100845804,
"grad_norm": 0.8122901916503906,
"learning_rate": 4.037417648779304e-05,
"loss": 0.8511,
"step": 2295
},
{
"epoch": 1.4964216005204944,
"grad_norm": 0.856270968914032,
"learning_rate": 4.033344133469857e-05,
"loss": 0.8576,
"step": 2300
},
{
"epoch": 1.4996746909564087,
"grad_norm": 0.7714033126831055,
"learning_rate": 4.029264081766333e-05,
"loss": 0.8563,
"step": 2305
},
{
"epoch": 1.5029277813923227,
"grad_norm": 0.7557379007339478,
"learning_rate": 4.02517751106132e-05,
"loss": 0.8632,
"step": 2310
},
{
"epoch": 1.5061808718282368,
"grad_norm": 0.9310267567634583,
"learning_rate": 4.021084438775199e-05,
"loss": 0.8756,
"step": 2315
},
{
"epoch": 1.509433962264151,
"grad_norm": 1.1613460779190063,
"learning_rate": 4.016984882356063e-05,
"loss": 0.8581,
"step": 2320
},
{
"epoch": 1.512687052700065,
"grad_norm": 0.8737664222717285,
"learning_rate": 4.0128788592796484e-05,
"loss": 0.8463,
"step": 2325
},
{
"epoch": 1.5159401431359791,
"grad_norm": 1.137432336807251,
"learning_rate": 4.008766387049257e-05,
"loss": 0.8668,
"step": 2330
},
{
"epoch": 1.5191932335718934,
"grad_norm": 1.205127239227295,
"learning_rate": 4.004647483195682e-05,
"loss": 0.854,
"step": 2335
},
{
"epoch": 1.5224463240078074,
"grad_norm": 1.2103711366653442,
"learning_rate": 4.0005221652771326e-05,
"loss": 0.8599,
"step": 2340
},
{
"epoch": 1.5256994144437215,
"grad_norm": 0.8847302794456482,
"learning_rate": 3.996390450879163e-05,
"loss": 0.8902,
"step": 2345
},
{
"epoch": 1.5289525048796357,
"grad_norm": 0.9139837622642517,
"learning_rate": 3.992252357614591e-05,
"loss": 0.8537,
"step": 2350
},
{
"epoch": 1.5322055953155498,
"grad_norm": 0.6250112056732178,
"learning_rate": 3.9881079031234295e-05,
"loss": 0.8625,
"step": 2355
},
{
"epoch": 1.5354586857514638,
"grad_norm": 1.3147530555725098,
"learning_rate": 3.983957105072806e-05,
"loss": 0.8594,
"step": 2360
},
{
"epoch": 1.538711776187378,
"grad_norm": 0.8052361607551575,
"learning_rate": 3.9797999811568916e-05,
"loss": 0.8613,
"step": 2365
},
{
"epoch": 1.5419648666232921,
"grad_norm": 0.963198721408844,
"learning_rate": 3.9756365490968216e-05,
"loss": 0.8846,
"step": 2370
},
{
"epoch": 1.5452179570592062,
"grad_norm": 0.7471247911453247,
"learning_rate": 3.971466826640622e-05,
"loss": 0.8559,
"step": 2375
},
{
"epoch": 1.5484710474951204,
"grad_norm": 0.9139803051948547,
"learning_rate": 3.967290831563137e-05,
"loss": 0.8734,
"step": 2380
},
{
"epoch": 1.5517241379310345,
"grad_norm": 0.8502246141433716,
"learning_rate": 3.963108581665945e-05,
"loss": 0.8517,
"step": 2385
},
{
"epoch": 1.5549772283669485,
"grad_norm": 1.010526418685913,
"learning_rate": 3.958920094777292e-05,
"loss": 0.8699,
"step": 2390
},
{
"epoch": 1.5582303188028628,
"grad_norm": 0.9621404409408569,
"learning_rate": 3.954725388752006e-05,
"loss": 0.8715,
"step": 2395
},
{
"epoch": 1.5614834092387768,
"grad_norm": 0.931891679763794,
"learning_rate": 3.950524481471434e-05,
"loss": 0.8639,
"step": 2400
},
{
"epoch": 1.5647364996746909,
"grad_norm": 0.9025523066520691,
"learning_rate": 3.94631739084335e-05,
"loss": 0.8407,
"step": 2405
},
{
"epoch": 1.5679895901106051,
"grad_norm": 0.7679696679115295,
"learning_rate": 3.942104134801892e-05,
"loss": 0.8703,
"step": 2410
},
{
"epoch": 1.5712426805465192,
"grad_norm": 0.7461057901382446,
"learning_rate": 3.937884731307477e-05,
"loss": 0.8508,
"step": 2415
},
{
"epoch": 1.5744957709824332,
"grad_norm": 0.8891671895980835,
"learning_rate": 3.9336591983467296e-05,
"loss": 0.8392,
"step": 2420
},
{
"epoch": 1.5777488614183475,
"grad_norm": 0.7495052218437195,
"learning_rate": 3.929427553932402e-05,
"loss": 0.8617,
"step": 2425
},
{
"epoch": 1.5810019518542615,
"grad_norm": 0.8563068509101868,
"learning_rate": 3.925189816103298e-05,
"loss": 0.8682,
"step": 2430
},
{
"epoch": 1.5842550422901756,
"grad_norm": 0.8730781674385071,
"learning_rate": 3.9209460029242e-05,
"loss": 0.8634,
"step": 2435
},
{
"epoch": 1.5875081327260898,
"grad_norm": 1.0046974420547485,
"learning_rate": 3.916696132485783e-05,
"loss": 0.8423,
"step": 2440
},
{
"epoch": 1.5907612231620039,
"grad_norm": 0.8691470623016357,
"learning_rate": 3.9124402229045495e-05,
"loss": 0.8443,
"step": 2445
},
{
"epoch": 1.594014313597918,
"grad_norm": 0.7887680530548096,
"learning_rate": 3.90817829232274e-05,
"loss": 0.8796,
"step": 2450
},
{
"epoch": 1.5972674040338322,
"grad_norm": 0.8779820203781128,
"learning_rate": 3.903910358908267e-05,
"loss": 0.8808,
"step": 2455
},
{
"epoch": 1.6005204944697464,
"grad_norm": 0.9116110801696777,
"learning_rate": 3.8996364408546284e-05,
"loss": 0.8539,
"step": 2460
},
{
"epoch": 1.6037735849056602,
"grad_norm": 0.8549916744232178,
"learning_rate": 3.895356556380833e-05,
"loss": 0.8714,
"step": 2465
},
{
"epoch": 1.6070266753415745,
"grad_norm": 0.7568048238754272,
"learning_rate": 3.8910707237313274e-05,
"loss": 0.8545,
"step": 2470
},
{
"epoch": 1.6102797657774888,
"grad_norm": 0.873261034488678,
"learning_rate": 3.886778961175909e-05,
"loss": 0.861,
"step": 2475
},
{
"epoch": 1.6135328562134026,
"grad_norm": 0.8435690999031067,
"learning_rate": 3.8824812870096585e-05,
"loss": 0.849,
"step": 2480
},
{
"epoch": 1.6167859466493169,
"grad_norm": 0.7543259263038635,
"learning_rate": 3.878177719552854e-05,
"loss": 0.8389,
"step": 2485
},
{
"epoch": 1.6200390370852311,
"grad_norm": 0.6784664392471313,
"learning_rate": 3.8738682771508975e-05,
"loss": 0.862,
"step": 2490
},
{
"epoch": 1.623292127521145,
"grad_norm": 0.735149085521698,
"learning_rate": 3.869552978174232e-05,
"loss": 0.86,
"step": 2495
},
{
"epoch": 1.6265452179570592,
"grad_norm": 1.1492180824279785,
"learning_rate": 3.8652318410182696e-05,
"loss": 0.8682,
"step": 2500
},
{
"epoch": 1.6297983083929735,
"grad_norm": 1.2123005390167236,
"learning_rate": 3.860904884103307e-05,
"loss": 0.8767,
"step": 2505
},
{
"epoch": 1.6330513988288873,
"grad_norm": 1.0573855638504028,
"learning_rate": 3.85657212587445e-05,
"loss": 0.8784,
"step": 2510
},
{
"epoch": 1.6363044892648015,
"grad_norm": 0.7657274603843689,
"learning_rate": 3.8522335848015354e-05,
"loss": 0.8614,
"step": 2515
},
{
"epoch": 1.6395575797007158,
"grad_norm": 0.7586051225662231,
"learning_rate": 3.847889279379052e-05,
"loss": 0.8522,
"step": 2520
},
{
"epoch": 1.6428106701366298,
"grad_norm": 0.8660874366760254,
"learning_rate": 3.843539228126058e-05,
"loss": 0.8491,
"step": 2525
},
{
"epoch": 1.6460637605725439,
"grad_norm": 0.8181445002555847,
"learning_rate": 3.8391834495861104e-05,
"loss": 0.8774,
"step": 2530
},
{
"epoch": 1.6493168510084582,
"grad_norm": 0.8161119222640991,
"learning_rate": 3.834821962327173e-05,
"loss": 0.8446,
"step": 2535
},
{
"epoch": 1.6525699414443722,
"grad_norm": 0.7471867203712463,
"learning_rate": 3.830454784941552e-05,
"loss": 0.8743,
"step": 2540
},
{
"epoch": 1.6558230318802862,
"grad_norm": 0.8243322372436523,
"learning_rate": 3.8260819360458066e-05,
"loss": 0.8582,
"step": 2545
},
{
"epoch": 1.6590761223162005,
"grad_norm": 0.7759085297584534,
"learning_rate": 3.8217034342806726e-05,
"loss": 0.8634,
"step": 2550
},
{
"epoch": 1.6623292127521145,
"grad_norm": 0.7820890545845032,
"learning_rate": 3.817319298310984e-05,
"loss": 0.849,
"step": 2555
},
{
"epoch": 1.6655823031880286,
"grad_norm": 0.7369856238365173,
"learning_rate": 3.812929546825591e-05,
"loss": 0.851,
"step": 2560
},
{
"epoch": 1.6688353936239428,
"grad_norm": 0.6760427355766296,
"learning_rate": 3.8085341985372847e-05,
"loss": 0.8526,
"step": 2565
},
{
"epoch": 1.6720884840598569,
"grad_norm": 0.7964663505554199,
"learning_rate": 3.804133272182711e-05,
"loss": 0.8369,
"step": 2570
},
{
"epoch": 1.675341574495771,
"grad_norm": 0.7458584308624268,
"learning_rate": 3.7997267865222966e-05,
"loss": 0.858,
"step": 2575
},
{
"epoch": 1.6785946649316852,
"grad_norm": 0.7713748812675476,
"learning_rate": 3.795314760340165e-05,
"loss": 0.8422,
"step": 2580
},
{
"epoch": 1.6818477553675992,
"grad_norm": 1.1121766567230225,
"learning_rate": 3.79089721244406e-05,
"loss": 0.8564,
"step": 2585
},
{
"epoch": 1.6851008458035133,
"grad_norm": 0.7054054141044617,
"learning_rate": 3.786474161665261e-05,
"loss": 0.8503,
"step": 2590
},
{
"epoch": 1.6883539362394275,
"grad_norm": 0.8231985569000244,
"learning_rate": 3.782045626858508e-05,
"loss": 0.8459,
"step": 2595
},
{
"epoch": 1.6916070266753416,
"grad_norm": 0.8120073676109314,
"learning_rate": 3.7776116269019164e-05,
"loss": 0.8579,
"step": 2600
},
{
"epoch": 1.6948601171112556,
"grad_norm": 0.7463471293449402,
"learning_rate": 3.773172180696899e-05,
"loss": 0.8685,
"step": 2605
},
{
"epoch": 1.6981132075471699,
"grad_norm": 0.9310842752456665,
"learning_rate": 3.7687273071680875e-05,
"loss": 0.8657,
"step": 2610
},
{
"epoch": 1.701366297983084,
"grad_norm": 0.7997697591781616,
"learning_rate": 3.7642770252632445e-05,
"loss": 0.8536,
"step": 2615
},
{
"epoch": 1.704619388418998,
"grad_norm": 0.9354361295700073,
"learning_rate": 3.7598213539531924e-05,
"loss": 0.8584,
"step": 2620
},
{
"epoch": 1.7078724788549122,
"grad_norm": 0.8442994356155396,
"learning_rate": 3.755360312231726e-05,
"loss": 0.8509,
"step": 2625
},
{
"epoch": 1.7111255692908263,
"grad_norm": 0.7156201601028442,
"learning_rate": 3.7508939191155315e-05,
"loss": 0.8587,
"step": 2630
},
{
"epoch": 1.7143786597267403,
"grad_norm": 0.8114856481552124,
"learning_rate": 3.7464221936441094e-05,
"loss": 0.8575,
"step": 2635
},
{
"epoch": 1.7176317501626546,
"grad_norm": 0.9958142042160034,
"learning_rate": 3.741945154879691e-05,
"loss": 0.8291,
"step": 2640
},
{
"epoch": 1.7208848405985686,
"grad_norm": 0.8814706206321716,
"learning_rate": 3.7374628219071576e-05,
"loss": 0.8756,
"step": 2645
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.9752816557884216,
"learning_rate": 3.732975213833957e-05,
"loss": 0.8526,
"step": 2650
},
{
"epoch": 1.727391021470397,
"grad_norm": 1.069827914237976,
"learning_rate": 3.728482349790025e-05,
"loss": 0.85,
"step": 2655
},
{
"epoch": 1.730644111906311,
"grad_norm": 0.7829200029373169,
"learning_rate": 3.723984248927704e-05,
"loss": 0.8775,
"step": 2660
},
{
"epoch": 1.733897202342225,
"grad_norm": 0.9264289140701294,
"learning_rate": 3.719480930421657e-05,
"loss": 0.8561,
"step": 2665
},
{
"epoch": 1.7371502927781393,
"grad_norm": 1.0062094926834106,
"learning_rate": 3.7149724134687915e-05,
"loss": 0.8734,
"step": 2670
},
{
"epoch": 1.7404033832140533,
"grad_norm": 1.15998375415802,
"learning_rate": 3.710458717288176e-05,
"loss": 0.8817,
"step": 2675
},
{
"epoch": 1.7436564736499673,
"grad_norm": 0.8632653951644897,
"learning_rate": 3.705939861120952e-05,
"loss": 0.8467,
"step": 2680
},
{
"epoch": 1.7469095640858816,
"grad_norm": 0.9579365849494934,
"learning_rate": 3.7014158642302645e-05,
"loss": 0.8516,
"step": 2685
},
{
"epoch": 1.7501626545217959,
"grad_norm": 0.7893072962760925,
"learning_rate": 3.6968867459011675e-05,
"loss": 0.8533,
"step": 2690
},
{
"epoch": 1.7534157449577097,
"grad_norm": 0.8436265587806702,
"learning_rate": 3.692352525440548e-05,
"loss": 0.8661,
"step": 2695
},
{
"epoch": 1.756668835393624,
"grad_norm": 0.7928500175476074,
"learning_rate": 3.687813222177042e-05,
"loss": 0.8617,
"step": 2700
},
{
"epoch": 1.7599219258295382,
"grad_norm": 1.0979465246200562,
"learning_rate": 3.683268855460955e-05,
"loss": 0.8457,
"step": 2705
},
{
"epoch": 1.763175016265452,
"grad_norm": 0.9280642867088318,
"learning_rate": 3.678719444664174e-05,
"loss": 0.8698,
"step": 2710
},
{
"epoch": 1.7664281067013663,
"grad_norm": 0.7560756206512451,
"learning_rate": 3.674165009180091e-05,
"loss": 0.8476,
"step": 2715
},
{
"epoch": 1.7696811971372806,
"grad_norm": 1.6937271356582642,
"learning_rate": 3.669605568423515e-05,
"loss": 0.8601,
"step": 2720
},
{
"epoch": 1.7729342875731944,
"grad_norm": 0.7721190452575684,
"learning_rate": 3.665041141830594e-05,
"loss": 0.8479,
"step": 2725
},
{
"epoch": 1.7761873780091086,
"grad_norm": 0.691184401512146,
"learning_rate": 3.660471748858728e-05,
"loss": 0.846,
"step": 2730
},
{
"epoch": 1.779440468445023,
"grad_norm": 0.8458099961280823,
"learning_rate": 3.655897408986487e-05,
"loss": 0.8543,
"step": 2735
},
{
"epoch": 1.7826935588809367,
"grad_norm": 0.7717384696006775,
"learning_rate": 3.651318141713532e-05,
"loss": 0.8555,
"step": 2740
},
{
"epoch": 1.785946649316851,
"grad_norm": 0.7364319562911987,
"learning_rate": 3.646733966560527e-05,
"loss": 0.8693,
"step": 2745
},
{
"epoch": 1.7891997397527653,
"grad_norm": 0.7715139389038086,
"learning_rate": 3.642144903069055e-05,
"loss": 0.8575,
"step": 2750
},
{
"epoch": 1.7924528301886793,
"grad_norm": 0.7801803350448608,
"learning_rate": 3.637550970801543e-05,
"loss": 0.8832,
"step": 2755
},
{
"epoch": 1.7957059206245933,
"grad_norm": 0.8797639012336731,
"learning_rate": 3.632952189341166e-05,
"loss": 0.8787,
"step": 2760
},
{
"epoch": 1.7989590110605076,
"grad_norm": 0.8655262589454651,
"learning_rate": 3.628348578291776e-05,
"loss": 0.8527,
"step": 2765
},
{
"epoch": 1.8022121014964216,
"grad_norm": 0.7039540410041809,
"learning_rate": 3.623740157277811e-05,
"loss": 0.8023,
"step": 2770
},
{
"epoch": 1.8054651919323357,
"grad_norm": 0.8364835977554321,
"learning_rate": 3.619126945944209e-05,
"loss": 0.8428,
"step": 2775
},
{
"epoch": 1.80871828236825,
"grad_norm": 0.8477578163146973,
"learning_rate": 3.614508963956335e-05,
"loss": 0.8364,
"step": 2780
},
{
"epoch": 1.811971372804164,
"grad_norm": 0.790069043636322,
"learning_rate": 3.609886230999886e-05,
"loss": 0.8557,
"step": 2785
},
{
"epoch": 1.815224463240078,
"grad_norm": 1.1685853004455566,
"learning_rate": 3.605258766780815e-05,
"loss": 0.8639,
"step": 2790
},
{
"epoch": 1.8184775536759923,
"grad_norm": 0.6820409297943115,
"learning_rate": 3.600626591025239e-05,
"loss": 0.8561,
"step": 2795
},
{
"epoch": 1.8217306441119063,
"grad_norm": 0.6816509366035461,
"learning_rate": 3.595989723479363e-05,
"loss": 0.8595,
"step": 2800
},
{
"epoch": 1.8249837345478204,
"grad_norm": 0.6458393335342407,
"learning_rate": 3.591348183909391e-05,
"loss": 0.852,
"step": 2805
},
{
"epoch": 1.8282368249837346,
"grad_norm": 0.8720667958259583,
"learning_rate": 3.586701992101446e-05,
"loss": 0.8493,
"step": 2810
},
{
"epoch": 1.8314899154196487,
"grad_norm": 0.8076214790344238,
"learning_rate": 3.582051167861477e-05,
"loss": 0.8399,
"step": 2815
},
{
"epoch": 1.8347430058555627,
"grad_norm": 1.1117894649505615,
"learning_rate": 3.577395731015184e-05,
"loss": 0.8462,
"step": 2820
},
{
"epoch": 1.837996096291477,
"grad_norm": 0.8749067783355713,
"learning_rate": 3.57273570140793e-05,
"loss": 0.8484,
"step": 2825
},
{
"epoch": 1.841249186727391,
"grad_norm": 0.9115192890167236,
"learning_rate": 3.5680710989046565e-05,
"loss": 0.8379,
"step": 2830
},
{
"epoch": 1.844502277163305,
"grad_norm": 0.7345873117446899,
"learning_rate": 3.5634019433897964e-05,
"loss": 0.8521,
"step": 2835
},
{
"epoch": 1.8477553675992193,
"grad_norm": 0.8665250539779663,
"learning_rate": 3.558728254767192e-05,
"loss": 0.8591,
"step": 2840
},
{
"epoch": 1.8510084580351334,
"grad_norm": 0.6966584324836731,
"learning_rate": 3.5540500529600096e-05,
"loss": 0.8633,
"step": 2845
},
{
"epoch": 1.8542615484710474,
"grad_norm": 0.9217740893363953,
"learning_rate": 3.5493673579106555e-05,
"loss": 0.8581,
"step": 2850
},
{
"epoch": 1.8575146389069617,
"grad_norm": 1.1653602123260498,
"learning_rate": 3.5446801895806904e-05,
"loss": 0.8429,
"step": 2855
},
{
"epoch": 1.8607677293428757,
"grad_norm": 1.0861412286758423,
"learning_rate": 3.539988567950741e-05,
"loss": 0.8385,
"step": 2860
},
{
"epoch": 1.8640208197787898,
"grad_norm": 0.9099658727645874,
"learning_rate": 3.53529251302042e-05,
"loss": 0.8727,
"step": 2865
},
{
"epoch": 1.867273910214704,
"grad_norm": 0.8507881760597229,
"learning_rate": 3.530592044808237e-05,
"loss": 0.8601,
"step": 2870
},
{
"epoch": 1.870527000650618,
"grad_norm": 0.7487595677375793,
"learning_rate": 3.525887183351517e-05,
"loss": 0.8453,
"step": 2875
},
{
"epoch": 1.873780091086532,
"grad_norm": 0.7527421116828918,
"learning_rate": 3.521177948706311e-05,
"loss": 0.856,
"step": 2880
},
{
"epoch": 1.8770331815224464,
"grad_norm": 1.198721170425415,
"learning_rate": 3.5164643609473114e-05,
"loss": 0.8322,
"step": 2885
},
{
"epoch": 1.8802862719583604,
"grad_norm": 0.7312609553337097,
"learning_rate": 3.51174644016777e-05,
"loss": 0.8571,
"step": 2890
},
{
"epoch": 1.8835393623942744,
"grad_norm": 0.813762903213501,
"learning_rate": 3.507024206479406e-05,
"loss": 0.8485,
"step": 2895
},
{
"epoch": 1.8867924528301887,
"grad_norm": 0.6589996814727783,
"learning_rate": 3.502297680012327e-05,
"loss": 0.8199,
"step": 2900
},
{
"epoch": 1.8900455432661027,
"grad_norm": 0.8973954319953918,
"learning_rate": 3.4975668809149375e-05,
"loss": 0.8595,
"step": 2905
},
{
"epoch": 1.8932986337020168,
"grad_norm": 0.8979359269142151,
"learning_rate": 3.492831829353857e-05,
"loss": 0.8637,
"step": 2910
},
{
"epoch": 1.896551724137931,
"grad_norm": 0.7665019035339355,
"learning_rate": 3.488092545513833e-05,
"loss": 0.8753,
"step": 2915
},
{
"epoch": 1.8998048145738453,
"grad_norm": 1.2857329845428467,
"learning_rate": 3.483349049597653e-05,
"loss": 0.8394,
"step": 2920
},
{
"epoch": 1.9030579050097591,
"grad_norm": 0.7651403546333313,
"learning_rate": 3.4786013618260615e-05,
"loss": 0.846,
"step": 2925
},
{
"epoch": 1.9063109954456734,
"grad_norm": 0.818390429019928,
"learning_rate": 3.47384950243767e-05,
"loss": 0.8919,
"step": 2930
},
{
"epoch": 1.9095640858815877,
"grad_norm": 0.8343967795372009,
"learning_rate": 3.4690934916888754e-05,
"loss": 0.8451,
"step": 2935
},
{
"epoch": 1.9128171763175015,
"grad_norm": 0.8200094699859619,
"learning_rate": 3.464333349853769e-05,
"loss": 0.8468,
"step": 2940
},
{
"epoch": 1.9160702667534157,
"grad_norm": 0.8766981959342957,
"learning_rate": 3.459569097224054e-05,
"loss": 0.8455,
"step": 2945
},
{
"epoch": 1.91932335718933,
"grad_norm": 0.7592107057571411,
"learning_rate": 3.454800754108957e-05,
"loss": 0.8564,
"step": 2950
},
{
"epoch": 1.9225764476252438,
"grad_norm": 0.7694371938705444,
"learning_rate": 3.45002834083514e-05,
"loss": 0.8579,
"step": 2955
},
{
"epoch": 1.925829538061158,
"grad_norm": 0.9310813546180725,
"learning_rate": 3.445251877746616e-05,
"loss": 0.853,
"step": 2960
},
{
"epoch": 1.9290826284970723,
"grad_norm": 0.7357284426689148,
"learning_rate": 3.440471385204664e-05,
"loss": 0.843,
"step": 2965
},
{
"epoch": 1.9323357189329864,
"grad_norm": 1.0630100965499878,
"learning_rate": 3.4356868835877376e-05,
"loss": 0.8656,
"step": 2970
},
{
"epoch": 1.9355888093689004,
"grad_norm": 1.3015029430389404,
"learning_rate": 3.430898393291381e-05,
"loss": 0.8681,
"step": 2975
},
{
"epoch": 1.9388418998048147,
"grad_norm": 0.941599428653717,
"learning_rate": 3.426105934728141e-05,
"loss": 0.8374,
"step": 2980
},
{
"epoch": 1.9420949902407287,
"grad_norm": 0.827949583530426,
"learning_rate": 3.4213095283274807e-05,
"loss": 0.8342,
"step": 2985
},
{
"epoch": 1.9453480806766428,
"grad_norm": 0.7155514359474182,
"learning_rate": 3.416509194535693e-05,
"loss": 0.8604,
"step": 2990
},
{
"epoch": 1.948601171112557,
"grad_norm": 0.6395983099937439,
"learning_rate": 3.411704953815813e-05,
"loss": 0.8545,
"step": 2995
},
{
"epoch": 1.951854261548471,
"grad_norm": 1.0403225421905518,
"learning_rate": 3.406896826647528e-05,
"loss": 0.8317,
"step": 3000
},
{
"epoch": 1.9551073519843851,
"grad_norm": 0.809688925743103,
"learning_rate": 3.4020848335270944e-05,
"loss": 0.8459,
"step": 3005
},
{
"epoch": 1.9583604424202994,
"grad_norm": 0.7284942865371704,
"learning_rate": 3.397268994967248e-05,
"loss": 0.8609,
"step": 3010
},
{
"epoch": 1.9616135328562134,
"grad_norm": 0.8415728807449341,
"learning_rate": 3.392449331497117e-05,
"loss": 0.8421,
"step": 3015
},
{
"epoch": 1.9648666232921275,
"grad_norm": 0.7867475152015686,
"learning_rate": 3.387625863662137e-05,
"loss": 0.8537,
"step": 3020
},
{
"epoch": 1.9681197137280417,
"grad_norm": 0.8730093240737915,
"learning_rate": 3.3827986120239556e-05,
"loss": 0.8453,
"step": 3025
},
{
"epoch": 1.9713728041639558,
"grad_norm": 1.0075076818466187,
"learning_rate": 3.377967597160355e-05,
"loss": 0.8485,
"step": 3030
},
{
"epoch": 1.9746258945998698,
"grad_norm": 0.7558779716491699,
"learning_rate": 3.373132839665159e-05,
"loss": 0.8283,
"step": 3035
},
{
"epoch": 1.977878985035784,
"grad_norm": 0.8635545969009399,
"learning_rate": 3.368294360148141e-05,
"loss": 0.8445,
"step": 3040
},
{
"epoch": 1.9811320754716981,
"grad_norm": 0.7366521954536438,
"learning_rate": 3.363452179234946e-05,
"loss": 0.8377,
"step": 3045
},
{
"epoch": 1.9843851659076122,
"grad_norm": 0.895798921585083,
"learning_rate": 3.3586063175669957e-05,
"loss": 0.8517,
"step": 3050
},
{
"epoch": 1.9876382563435264,
"grad_norm": 0.8703877329826355,
"learning_rate": 3.353756795801402e-05,
"loss": 0.8635,
"step": 3055
},
{
"epoch": 1.9908913467794405,
"grad_norm": 0.8399415612220764,
"learning_rate": 3.348903634610879e-05,
"loss": 0.8469,
"step": 3060
},
{
"epoch": 1.9941444372153545,
"grad_norm": 0.6633405685424805,
"learning_rate": 3.344046854683656e-05,
"loss": 0.8265,
"step": 3065
},
{
"epoch": 1.9973975276512688,
"grad_norm": 0.8422790765762329,
"learning_rate": 3.3391864767233874e-05,
"loss": 0.8356,
"step": 3070
},
{
"epoch": 2.0,
"eval_f1": 0.8011475160594294,
"eval_loss": 0.444091796875,
"eval_precision": 0.8009366991425545,
"eval_recall": 0.8015108608319047,
"eval_runtime": 238.6273,
"eval_samples_per_second": 1648.743,
"eval_steps_per_second": 1.613,
"step": 3074
},
{
"epoch": 2.000650618087183,
"grad_norm": 0.9484532475471497,
"learning_rate": 3.334322521449066e-05,
"loss": 0.8414,
"step": 3075
},
{
"epoch": 2.003903708523097,
"grad_norm": 1.058498740196228,
"learning_rate": 3.3294550095949325e-05,
"loss": 0.7647,
"step": 3080
},
{
"epoch": 2.007156798959011,
"grad_norm": 1.1817635297775269,
"learning_rate": 3.3245839619103916e-05,
"loss": 0.7739,
"step": 3085
},
{
"epoch": 2.0104098893949254,
"grad_norm": 0.9960103034973145,
"learning_rate": 3.319709399159919e-05,
"loss": 0.7627,
"step": 3090
},
{
"epoch": 2.013662979830839,
"grad_norm": 0.7337830066680908,
"learning_rate": 3.314831342122974e-05,
"loss": 0.7736,
"step": 3095
},
{
"epoch": 2.0169160702667535,
"grad_norm": 0.8539023995399475,
"learning_rate": 3.309949811593914e-05,
"loss": 0.7677,
"step": 3100
},
{
"epoch": 2.0201691607026677,
"grad_norm": 0.812573254108429,
"learning_rate": 3.3050648283818985e-05,
"loss": 0.7688,
"step": 3105
},
{
"epoch": 2.0234222511385815,
"grad_norm": 0.8771811127662659,
"learning_rate": 3.30017641331081e-05,
"loss": 0.7873,
"step": 3110
},
{
"epoch": 2.026675341574496,
"grad_norm": 0.8817070126533508,
"learning_rate": 3.295284587219159e-05,
"loss": 0.7516,
"step": 3115
},
{
"epoch": 2.02992843201041,
"grad_norm": 0.8555654287338257,
"learning_rate": 3.290389370959995e-05,
"loss": 0.7245,
"step": 3120
},
{
"epoch": 2.033181522446324,
"grad_norm": 0.9785915017127991,
"learning_rate": 3.285490785400822e-05,
"loss": 0.7591,
"step": 3125
},
{
"epoch": 2.036434612882238,
"grad_norm": 1.1170217990875244,
"learning_rate": 3.280588851423504e-05,
"loss": 0.7545,
"step": 3130
},
{
"epoch": 2.0396877033181524,
"grad_norm": 0.889552652835846,
"learning_rate": 3.275683589924181e-05,
"loss": 0.7509,
"step": 3135
},
{
"epoch": 2.0429407937540662,
"grad_norm": 0.9748543500900269,
"learning_rate": 3.270775021813177e-05,
"loss": 0.7419,
"step": 3140
},
{
"epoch": 2.0461938841899805,
"grad_norm": 0.9157707691192627,
"learning_rate": 3.26586316801491e-05,
"loss": 0.7476,
"step": 3145
},
{
"epoch": 2.0494469746258948,
"grad_norm": 1.3593250513076782,
"learning_rate": 3.2609480494678055e-05,
"loss": 0.778,
"step": 3150
},
{
"epoch": 2.0527000650618086,
"grad_norm": 0.8584513664245605,
"learning_rate": 3.256029687124209e-05,
"loss": 0.7634,
"step": 3155
},
{
"epoch": 2.055953155497723,
"grad_norm": 1.1206103563308716,
"learning_rate": 3.2511081019502875e-05,
"loss": 0.7612,
"step": 3160
},
{
"epoch": 2.059206245933637,
"grad_norm": 1.1010791063308716,
"learning_rate": 3.2461833149259516e-05,
"loss": 0.7631,
"step": 3165
},
{
"epoch": 2.062459336369551,
"grad_norm": 1.0924779176712036,
"learning_rate": 3.241255347044759e-05,
"loss": 0.7592,
"step": 3170
},
{
"epoch": 2.065712426805465,
"grad_norm": 0.9586931467056274,
"learning_rate": 3.236324219313826e-05,
"loss": 0.7591,
"step": 3175
},
{
"epoch": 2.0689655172413794,
"grad_norm": 1.0838814973831177,
"learning_rate": 3.231389952753742e-05,
"loss": 0.7724,
"step": 3180
},
{
"epoch": 2.0722186076772933,
"grad_norm": 0.9030594229698181,
"learning_rate": 3.226452568398471e-05,
"loss": 0.7627,
"step": 3185
},
{
"epoch": 2.0754716981132075,
"grad_norm": 1.0417284965515137,
"learning_rate": 3.221512087295275e-05,
"loss": 0.765,
"step": 3190
},
{
"epoch": 2.078724788549122,
"grad_norm": 1.3411697149276733,
"learning_rate": 3.216568530504611e-05,
"loss": 0.7718,
"step": 3195
},
{
"epoch": 2.0819778789850356,
"grad_norm": 1.1210920810699463,
"learning_rate": 3.21162191910005e-05,
"loss": 0.7578,
"step": 3200
},
{
"epoch": 2.08523096942095,
"grad_norm": 1.0522574186325073,
"learning_rate": 3.2066722741681845e-05,
"loss": 0.7645,
"step": 3205
},
{
"epoch": 2.088484059856864,
"grad_norm": 0.9024161100387573,
"learning_rate": 3.2017196168085345e-05,
"loss": 0.7542,
"step": 3210
},
{
"epoch": 2.091737150292778,
"grad_norm": 0.93799889087677,
"learning_rate": 3.196763968133466e-05,
"loss": 0.7675,
"step": 3215
},
{
"epoch": 2.094990240728692,
"grad_norm": 0.9059098362922668,
"learning_rate": 3.191805349268097e-05,
"loss": 0.774,
"step": 3220
},
{
"epoch": 2.0982433311646065,
"grad_norm": 0.954647958278656,
"learning_rate": 3.1868437813502026e-05,
"loss": 0.7591,
"step": 3225
},
{
"epoch": 2.1014964216005203,
"grad_norm": 0.956679105758667,
"learning_rate": 3.1818792855301316e-05,
"loss": 0.7585,
"step": 3230
},
{
"epoch": 2.1047495120364346,
"grad_norm": 0.8911952376365662,
"learning_rate": 3.1769118829707156e-05,
"loss": 0.7736,
"step": 3235
},
{
"epoch": 2.108002602472349,
"grad_norm": 1.1105453968048096,
"learning_rate": 3.171941594847173e-05,
"loss": 0.746,
"step": 3240
},
{
"epoch": 2.1112556929082626,
"grad_norm": 1.0151236057281494,
"learning_rate": 3.1669684423470275e-05,
"loss": 0.7628,
"step": 3245
},
{
"epoch": 2.114508783344177,
"grad_norm": 1.0137097835540771,
"learning_rate": 3.16199244667001e-05,
"loss": 0.7611,
"step": 3250
},
{
"epoch": 2.117761873780091,
"grad_norm": 0.9404064416885376,
"learning_rate": 3.157013629027972e-05,
"loss": 0.7601,
"step": 3255
},
{
"epoch": 2.121014964216005,
"grad_norm": 1.3806120157241821,
"learning_rate": 3.152032010644796e-05,
"loss": 0.7647,
"step": 3260
},
{
"epoch": 2.1242680546519193,
"grad_norm": 0.9700812697410583,
"learning_rate": 3.147047612756302e-05,
"loss": 0.766,
"step": 3265
},
{
"epoch": 2.1275211450878335,
"grad_norm": 1.1779789924621582,
"learning_rate": 3.142060456610159e-05,
"loss": 0.7571,
"step": 3270
},
{
"epoch": 2.130774235523748,
"grad_norm": 1.1766470670700073,
"learning_rate": 3.137070563465796e-05,
"loss": 0.7587,
"step": 3275
},
{
"epoch": 2.1340273259596616,
"grad_norm": 1.1181317567825317,
"learning_rate": 3.1320779545943034e-05,
"loss": 0.7514,
"step": 3280
},
{
"epoch": 2.137280416395576,
"grad_norm": 1.520752191543579,
"learning_rate": 3.127082651278357e-05,
"loss": 0.7383,
"step": 3285
},
{
"epoch": 2.14053350683149,
"grad_norm": 1.1578936576843262,
"learning_rate": 3.1220846748121105e-05,
"loss": 0.7736,
"step": 3290
},
{
"epoch": 2.143786597267404,
"grad_norm": 1.3091363906860352,
"learning_rate": 3.117084046501119e-05,
"loss": 0.7615,
"step": 3295
},
{
"epoch": 2.147039687703318,
"grad_norm": 0.9620407223701477,
"learning_rate": 3.112080787662237e-05,
"loss": 0.7924,
"step": 3300
},
{
"epoch": 2.1502927781392325,
"grad_norm": 0.9089716672897339,
"learning_rate": 3.107074919623536e-05,
"loss": 0.7455,
"step": 3305
},
{
"epoch": 2.1535458685751463,
"grad_norm": 1.1510998010635376,
"learning_rate": 3.102066463724209e-05,
"loss": 0.765,
"step": 3310
},
{
"epoch": 2.1567989590110606,
"grad_norm": 1.8722169399261475,
"learning_rate": 3.0970554413144805e-05,
"loss": 0.7627,
"step": 3315
},
{
"epoch": 2.160052049446975,
"grad_norm": 1.0691964626312256,
"learning_rate": 3.0920418737555144e-05,
"loss": 0.7753,
"step": 3320
},
{
"epoch": 2.1633051398828886,
"grad_norm": 0.9641361832618713,
"learning_rate": 3.0870257824193263e-05,
"loss": 0.7516,
"step": 3325
},
{
"epoch": 2.166558230318803,
"grad_norm": 1.0590273141860962,
"learning_rate": 3.08200718868869e-05,
"loss": 0.7859,
"step": 3330
},
{
"epoch": 2.169811320754717,
"grad_norm": 1.2373055219650269,
"learning_rate": 3.076986113957044e-05,
"loss": 0.772,
"step": 3335
},
{
"epoch": 2.173064411190631,
"grad_norm": 1.160982608795166,
"learning_rate": 3.071962579628408e-05,
"loss": 0.7673,
"step": 3340
},
{
"epoch": 2.1763175016265452,
"grad_norm": 0.8511375188827515,
"learning_rate": 3.066936607117279e-05,
"loss": 0.7558,
"step": 3345
},
{
"epoch": 2.1795705920624595,
"grad_norm": 0.9551635384559631,
"learning_rate": 3.061908217848556e-05,
"loss": 0.7641,
"step": 3350
},
{
"epoch": 2.1828236824983733,
"grad_norm": 0.9262502789497375,
"learning_rate": 3.056877433257434e-05,
"loss": 0.7667,
"step": 3355
},
{
"epoch": 2.1860767729342876,
"grad_norm": 1.2747892141342163,
"learning_rate": 3.051844274789321e-05,
"loss": 0.7497,
"step": 3360
},
{
"epoch": 2.189329863370202,
"grad_norm": 1.2817254066467285,
"learning_rate": 3.046808763899745e-05,
"loss": 0.7743,
"step": 3365
},
{
"epoch": 2.1925829538061157,
"grad_norm": 1.3123672008514404,
"learning_rate": 3.041770922054262e-05,
"loss": 0.7681,
"step": 3370
},
{
"epoch": 2.19583604424203,
"grad_norm": 1.0206502676010132,
"learning_rate": 3.0367307707283626e-05,
"loss": 0.7833,
"step": 3375
},
{
"epoch": 2.199089134677944,
"grad_norm": 1.0204437971115112,
"learning_rate": 3.0326970012795626e-05,
"loss": 0.7575,
"step": 3380
},
{
"epoch": 2.202342225113858,
"grad_norm": 1.0020246505737305,
"learning_rate": 3.027652747038522e-05,
"loss": 0.7702,
"step": 3385
},
{
"epoch": 2.2055953155497723,
"grad_norm": 1.045996904373169,
"learning_rate": 3.022606243500526e-05,
"loss": 0.7609,
"step": 3390
},
{
"epoch": 2.2088484059856865,
"grad_norm": 0.9325571060180664,
"learning_rate": 3.0175575121779886e-05,
"loss": 0.7363,
"step": 3395
},
{
"epoch": 2.2121014964216004,
"grad_norm": 1.2504099607467651,
"learning_rate": 3.012506574592825e-05,
"loss": 0.7742,
"step": 3400
},
{
"epoch": 2.2153545868575146,
"grad_norm": 1.0567350387573242,
"learning_rate": 3.007453452276349e-05,
"loss": 0.7544,
"step": 3405
},
{
"epoch": 2.218607677293429,
"grad_norm": 0.9951023459434509,
"learning_rate": 3.0023981667691926e-05,
"loss": 0.7432,
"step": 3410
},
{
"epoch": 2.2218607677293427,
"grad_norm": 1.0222620964050293,
"learning_rate": 2.997340739621206e-05,
"loss": 0.794,
"step": 3415
},
{
"epoch": 2.225113858165257,
"grad_norm": 0.8401185870170593,
"learning_rate": 2.9922811923913714e-05,
"loss": 0.751,
"step": 3420
},
{
"epoch": 2.2283669486011712,
"grad_norm": 1.1666043996810913,
"learning_rate": 2.9872195466477054e-05,
"loss": 0.7592,
"step": 3425
},
{
"epoch": 2.231620039037085,
"grad_norm": 0.95232754945755,
"learning_rate": 2.9821558239671744e-05,
"loss": 0.7639,
"step": 3430
},
{
"epoch": 2.2348731294729993,
"grad_norm": 0.8971825242042542,
"learning_rate": 2.977090045935594e-05,
"loss": 0.7553,
"step": 3435
},
{
"epoch": 2.2381262199089136,
"grad_norm": 1.0237399339675903,
"learning_rate": 2.9720222341475445e-05,
"loss": 0.7504,
"step": 3440
},
{
"epoch": 2.2413793103448274,
"grad_norm": 1.1775766611099243,
"learning_rate": 2.966952410206275e-05,
"loss": 0.7449,
"step": 3445
},
{
"epoch": 2.2446324007807417,
"grad_norm": 0.885957658290863,
"learning_rate": 2.9618805957236113e-05,
"loss": 0.7631,
"step": 3450
},
{
"epoch": 2.247885491216656,
"grad_norm": 1.3709341287612915,
"learning_rate": 2.956806812319865e-05,
"loss": 0.7589,
"step": 3455
},
{
"epoch": 2.2511385816525697,
"grad_norm": 1.204150676727295,
"learning_rate": 2.951731081623742e-05,
"loss": 0.7662,
"step": 3460
},
{
"epoch": 2.254391672088484,
"grad_norm": 1.6271796226501465,
"learning_rate": 2.946653425272247e-05,
"loss": 0.7821,
"step": 3465
},
{
"epoch": 2.2576447625243983,
"grad_norm": 1.0852000713348389,
"learning_rate": 2.9415738649105963e-05,
"loss": 0.7408,
"step": 3470
},
{
"epoch": 2.260897852960312,
"grad_norm": 1.0353608131408691,
"learning_rate": 2.9364924221921185e-05,
"loss": 0.7478,
"step": 3475
},
{
"epoch": 2.2641509433962264,
"grad_norm": 1.881262183189392,
"learning_rate": 2.9314091187781715e-05,
"loss": 0.7584,
"step": 3480
},
{
"epoch": 2.2674040338321406,
"grad_norm": 1.2990703582763672,
"learning_rate": 2.9263239763380412e-05,
"loss": 0.7566,
"step": 3485
},
{
"epoch": 2.2706571242680544,
"grad_norm": 0.9985173940658569,
"learning_rate": 2.921237016548854e-05,
"loss": 0.7676,
"step": 3490
},
{
"epoch": 2.2739102147039687,
"grad_norm": 0.9522629976272583,
"learning_rate": 2.9161482610954842e-05,
"loss": 0.7475,
"step": 3495
},
{
"epoch": 2.277163305139883,
"grad_norm": 0.9219643473625183,
"learning_rate": 2.9110577316704602e-05,
"loss": 0.7613,
"step": 3500
},
{
"epoch": 2.280416395575797,
"grad_norm": 0.9594421982765198,
"learning_rate": 2.905965449973871e-05,
"loss": 0.768,
"step": 3505
},
{
"epoch": 2.283669486011711,
"grad_norm": 1.0452098846435547,
"learning_rate": 2.900871437713279e-05,
"loss": 0.7699,
"step": 3510
},
{
"epoch": 2.2869225764476253,
"grad_norm": 0.9670342803001404,
"learning_rate": 2.8957757166036193e-05,
"loss": 0.7573,
"step": 3515
},
{
"epoch": 2.290175666883539,
"grad_norm": 1.147403597831726,
"learning_rate": 2.890678308367115e-05,
"loss": 0.7688,
"step": 3520
},
{
"epoch": 2.2934287573194534,
"grad_norm": 1.086470603942871,
"learning_rate": 2.8855792347331793e-05,
"loss": 0.7671,
"step": 3525
},
{
"epoch": 2.2966818477553677,
"grad_norm": 1.6733858585357666,
"learning_rate": 2.8804785174383248e-05,
"loss": 0.7753,
"step": 3530
},
{
"epoch": 2.2999349381912815,
"grad_norm": 1.0693230628967285,
"learning_rate": 2.8753761782260723e-05,
"loss": 0.7457,
"step": 3535
},
{
"epoch": 2.3031880286271957,
"grad_norm": 1.079010009765625,
"learning_rate": 2.8702722388468546e-05,
"loss": 0.7701,
"step": 3540
},
{
"epoch": 2.30644111906311,
"grad_norm": 0.9620556235313416,
"learning_rate": 2.8651667210579257e-05,
"loss": 0.759,
"step": 3545
},
{
"epoch": 2.3096942094990243,
"grad_norm": 1.1349847316741943,
"learning_rate": 2.8600596466232715e-05,
"loss": 0.7776,
"step": 3550
},
{
"epoch": 2.312947299934938,
"grad_norm": 1.4847538471221924,
"learning_rate": 2.8549510373135092e-05,
"loss": 0.7566,
"step": 3555
},
{
"epoch": 2.3162003903708523,
"grad_norm": 1.657256007194519,
"learning_rate": 2.8498409149058008e-05,
"loss": 0.762,
"step": 3560
},
{
"epoch": 2.3194534808067666,
"grad_norm": 1.0619240999221802,
"learning_rate": 2.8447293011837596e-05,
"loss": 0.771,
"step": 3565
},
{
"epoch": 2.3227065712426804,
"grad_norm": 0.8844910264015198,
"learning_rate": 2.8396162179373535e-05,
"loss": 0.7573,
"step": 3570
},
{
"epoch": 2.3259596616785947,
"grad_norm": 1.3543357849121094,
"learning_rate": 2.8345016869628175e-05,
"loss": 0.7736,
"step": 3575
},
{
"epoch": 2.329212752114509,
"grad_norm": 0.9610804319381714,
"learning_rate": 2.8293857300625555e-05,
"loss": 0.7536,
"step": 3580
},
{
"epoch": 2.3324658425504228,
"grad_norm": 1.2407771348953247,
"learning_rate": 2.8242683690450518e-05,
"loss": 0.7584,
"step": 3585
},
{
"epoch": 2.335718932986337,
"grad_norm": 1.388168215751648,
"learning_rate": 2.8191496257247764e-05,
"loss": 0.7426,
"step": 3590
},
{
"epoch": 2.3389720234222513,
"grad_norm": 1.1140729188919067,
"learning_rate": 2.814029521922088e-05,
"loss": 0.7418,
"step": 3595
},
{
"epoch": 2.342225113858165,
"grad_norm": 1.0877522230148315,
"learning_rate": 2.8089080794631512e-05,
"loss": 0.7531,
"step": 3600
},
{
"epoch": 2.3454782042940794,
"grad_norm": 1.0917423963546753,
"learning_rate": 2.803785320179832e-05,
"loss": 0.7435,
"step": 3605
},
{
"epoch": 2.3487312947299936,
"grad_norm": 1.3571592569351196,
"learning_rate": 2.7986612659096113e-05,
"loss": 0.7594,
"step": 3610
},
{
"epoch": 2.3519843851659075,
"grad_norm": 1.0520139932632446,
"learning_rate": 2.7935359384954914e-05,
"loss": 0.758,
"step": 3615
},
{
"epoch": 2.3552374756018217,
"grad_norm": 1.271592617034912,
"learning_rate": 2.7884093597858996e-05,
"loss": 0.7457,
"step": 3620
},
{
"epoch": 2.358490566037736,
"grad_norm": 0.9961024522781372,
"learning_rate": 2.783281551634599e-05,
"loss": 0.7626,
"step": 3625
},
{
"epoch": 2.36174365647365,
"grad_norm": 1.3508564233779907,
"learning_rate": 2.7781525359005943e-05,
"loss": 0.734,
"step": 3630
},
{
"epoch": 2.364996746909564,
"grad_norm": 1.0961614847183228,
"learning_rate": 2.7730223344480348e-05,
"loss": 0.7553,
"step": 3635
},
{
"epoch": 2.3682498373454783,
"grad_norm": 1.032395839691162,
"learning_rate": 2.7678909691461274e-05,
"loss": 0.7915,
"step": 3640
},
{
"epoch": 2.371502927781392,
"grad_norm": 1.1500605344772339,
"learning_rate": 2.7627584618690394e-05,
"loss": 0.7539,
"step": 3645
},
{
"epoch": 2.3747560182173064,
"grad_norm": 1.0203113555908203,
"learning_rate": 2.7576248344958054e-05,
"loss": 0.7771,
"step": 3650
},
{
"epoch": 2.3780091086532207,
"grad_norm": 2.247779607772827,
"learning_rate": 2.7524901089102358e-05,
"loss": 0.764,
"step": 3655
},
{
"epoch": 2.3812621990891345,
"grad_norm": 1.131200909614563,
"learning_rate": 2.7473543070008213e-05,
"loss": 0.742,
"step": 3660
},
{
"epoch": 2.3845152895250488,
"grad_norm": 1.2509359121322632,
"learning_rate": 2.7422174506606413e-05,
"loss": 0.7461,
"step": 3665
},
{
"epoch": 2.387768379960963,
"grad_norm": 0.864366352558136,
"learning_rate": 2.737079561787272e-05,
"loss": 0.7405,
"step": 3670
},
{
"epoch": 2.391021470396877,
"grad_norm": 0.9416084885597229,
"learning_rate": 2.7319406622826878e-05,
"loss": 0.7439,
"step": 3675
},
{
"epoch": 2.394274560832791,
"grad_norm": 1.7094473838806152,
"learning_rate": 2.726800774053173e-05,
"loss": 0.7698,
"step": 3680
},
{
"epoch": 2.3975276512687054,
"grad_norm": 0.9964091777801514,
"learning_rate": 2.7216599190092273e-05,
"loss": 0.7536,
"step": 3685
},
{
"epoch": 2.4007807417046196,
"grad_norm": 1.1519944667816162,
"learning_rate": 2.7165181190654702e-05,
"loss": 0.7459,
"step": 3690
},
{
"epoch": 2.4040338321405335,
"grad_norm": 1.2240533828735352,
"learning_rate": 2.7113753961405515e-05,
"loss": 0.7434,
"step": 3695
},
{
"epoch": 2.4072869225764477,
"grad_norm": 1.122253656387329,
"learning_rate": 2.7062317721570512e-05,
"loss": 0.7471,
"step": 3700
},
{
"epoch": 2.410540013012362,
"grad_norm": 1.0433543920516968,
"learning_rate": 2.7010872690413956e-05,
"loss": 0.7429,
"step": 3705
},
{
"epoch": 2.413793103448276,
"grad_norm": 1.092159628868103,
"learning_rate": 2.6959419087237553e-05,
"loss": 0.7506,
"step": 3710
},
{
"epoch": 2.41704619388419,
"grad_norm": 0.9082927107810974,
"learning_rate": 2.6907957131379553e-05,
"loss": 0.7666,
"step": 3715
},
{
"epoch": 2.4202992843201043,
"grad_norm": 0.8798219561576843,
"learning_rate": 2.6856487042213822e-05,
"loss": 0.7637,
"step": 3720
},
{
"epoch": 2.423552374756018,
"grad_norm": 0.8654388189315796,
"learning_rate": 2.6805009039148897e-05,
"loss": 0.7541,
"step": 3725
},
{
"epoch": 2.4268054651919324,
"grad_norm": 1.0439229011535645,
"learning_rate": 2.675352334162704e-05,
"loss": 0.7618,
"step": 3730
},
{
"epoch": 2.4300585556278467,
"grad_norm": 0.9634140729904175,
"learning_rate": 2.6702030169123316e-05,
"loss": 0.737,
"step": 3735
},
{
"epoch": 2.4333116460637605,
"grad_norm": 0.8647895455360413,
"learning_rate": 2.6650529741144665e-05,
"loss": 0.7485,
"step": 3740
},
{
"epoch": 2.4365647364996748,
"grad_norm": 1.984215259552002,
"learning_rate": 2.6599022277228948e-05,
"loss": 0.7541,
"step": 3745
},
{
"epoch": 2.439817826935589,
"grad_norm": 1.074607014656067,
"learning_rate": 2.6547507996944022e-05,
"loss": 0.7595,
"step": 3750
},
{
"epoch": 2.443070917371503,
"grad_norm": 0.9121082425117493,
"learning_rate": 2.649598711988679e-05,
"loss": 0.7741,
"step": 3755
},
{
"epoch": 2.446324007807417,
"grad_norm": 1.6042678356170654,
"learning_rate": 2.6444459865682297e-05,
"loss": 0.7699,
"step": 3760
},
{
"epoch": 2.4495770982433314,
"grad_norm": 0.9366397857666016,
"learning_rate": 2.6392926453982748e-05,
"loss": 0.7525,
"step": 3765
},
{
"epoch": 2.452830188679245,
"grad_norm": 1.0728055238723755,
"learning_rate": 2.6341387104466612e-05,
"loss": 0.749,
"step": 3770
},
{
"epoch": 2.4560832791151594,
"grad_norm": 0.988258957862854,
"learning_rate": 2.6289842036837675e-05,
"loss": 0.7563,
"step": 3775
},
{
"epoch": 2.4593363695510737,
"grad_norm": 1.2626458406448364,
"learning_rate": 2.6238291470824085e-05,
"loss": 0.7367,
"step": 3780
},
{
"epoch": 2.4625894599869875,
"grad_norm": 0.8835701942443848,
"learning_rate": 2.6186735626177428e-05,
"loss": 0.7534,
"step": 3785
},
{
"epoch": 2.465842550422902,
"grad_norm": 0.8948650360107422,
"learning_rate": 2.6135174722671813e-05,
"loss": 0.7975,
"step": 3790
},
{
"epoch": 2.469095640858816,
"grad_norm": 1.0557647943496704,
"learning_rate": 2.608360898010288e-05,
"loss": 0.7542,
"step": 3795
},
{
"epoch": 2.47234873129473,
"grad_norm": 1.1379538774490356,
"learning_rate": 2.603203861828693e-05,
"loss": 0.7569,
"step": 3800
},
{
"epoch": 2.475601821730644,
"grad_norm": 1.1298165321350098,
"learning_rate": 2.598046385705994e-05,
"loss": 0.7662,
"step": 3805
},
{
"epoch": 2.4788549121665584,
"grad_norm": 0.9936167001724243,
"learning_rate": 2.5928884916276635e-05,
"loss": 0.7427,
"step": 3810
},
{
"epoch": 2.482108002602472,
"grad_norm": 1.055421233177185,
"learning_rate": 2.5877302015809574e-05,
"loss": 0.741,
"step": 3815
},
{
"epoch": 2.4853610930383865,
"grad_norm": 1.0035120248794556,
"learning_rate": 2.5825715375548175e-05,
"loss": 0.7495,
"step": 3820
},
{
"epoch": 2.4886141834743007,
"grad_norm": 1.5768109560012817,
"learning_rate": 2.5774125215397815e-05,
"loss": 0.7677,
"step": 3825
},
{
"epoch": 2.4918672739102146,
"grad_norm": 1.1085072755813599,
"learning_rate": 2.5722531755278874e-05,
"loss": 0.7693,
"step": 3830
},
{
"epoch": 2.495120364346129,
"grad_norm": 0.9290764927864075,
"learning_rate": 2.567093521512578e-05,
"loss": 0.7734,
"step": 3835
},
{
"epoch": 2.498373454782043,
"grad_norm": 1.2003841400146484,
"learning_rate": 2.561933581488612e-05,
"loss": 0.7529,
"step": 3840
},
{
"epoch": 2.501626545217957,
"grad_norm": 0.9982072114944458,
"learning_rate": 2.556773377451965e-05,
"loss": 0.7555,
"step": 3845
},
{
"epoch": 2.504879635653871,
"grad_norm": 0.9454076886177063,
"learning_rate": 2.5516129313997388e-05,
"loss": 0.7726,
"step": 3850
},
{
"epoch": 2.5081327260897854,
"grad_norm": 0.9885278940200806,
"learning_rate": 2.5464522653300676e-05,
"loss": 0.7585,
"step": 3855
},
{
"epoch": 2.5113858165256993,
"grad_norm": 1.0617841482162476,
"learning_rate": 2.541291401242022e-05,
"loss": 0.7613,
"step": 3860
},
{
"epoch": 2.5146389069616135,
"grad_norm": 0.9445372223854065,
"learning_rate": 2.536130361135518e-05,
"loss": 0.7867,
"step": 3865
},
{
"epoch": 2.517891997397528,
"grad_norm": 1.2932319641113281,
"learning_rate": 2.5309691670112218e-05,
"loss": 0.7509,
"step": 3870
},
{
"epoch": 2.5211450878334416,
"grad_norm": 1.1702325344085693,
"learning_rate": 2.525807840870455e-05,
"loss": 0.7772,
"step": 3875
},
{
"epoch": 2.524398178269356,
"grad_norm": 1.0334542989730835,
"learning_rate": 2.5206464047151046e-05,
"loss": 0.7478,
"step": 3880
},
{
"epoch": 2.52765126870527,
"grad_norm": 2.0176279544830322,
"learning_rate": 2.5154848805475224e-05,
"loss": 0.759,
"step": 3885
},
{
"epoch": 2.530904359141184,
"grad_norm": 1.1288046836853027,
"learning_rate": 2.5103232903704393e-05,
"loss": 0.7529,
"step": 3890
},
{
"epoch": 2.534157449577098,
"grad_norm": 1.0248112678527832,
"learning_rate": 2.5051616561868663e-05,
"loss": 0.7748,
"step": 3895
},
{
"epoch": 2.5374105400130125,
"grad_norm": 0.8906844258308411,
"learning_rate": 2.5e-05,
"loss": 0.7369,
"step": 3900
},
{
"epoch": 2.5406636304489263,
"grad_norm": 1.1588047742843628,
"learning_rate": 2.4948383438131346e-05,
"loss": 0.7465,
"step": 3905
},
{
"epoch": 2.5439167208848406,
"grad_norm": 1.0166900157928467,
"learning_rate": 2.4896767096295613e-05,
"loss": 0.7576,
"step": 3910
},
{
"epoch": 2.547169811320755,
"grad_norm": 1.0682686567306519,
"learning_rate": 2.484515119452478e-05,
"loss": 0.7884,
"step": 3915
},
{
"epoch": 2.5504229017566686,
"grad_norm": 0.9026442766189575,
"learning_rate": 2.4793535952848963e-05,
"loss": 0.7311,
"step": 3920
},
{
"epoch": 2.553675992192583,
"grad_norm": 0.8642654418945312,
"learning_rate": 2.4741921591295454e-05,
"loss": 0.7547,
"step": 3925
},
{
"epoch": 2.556929082628497,
"grad_norm": 1.1124982833862305,
"learning_rate": 2.4690308329887788e-05,
"loss": 0.7523,
"step": 3930
},
{
"epoch": 2.560182173064411,
"grad_norm": 1.664115309715271,
"learning_rate": 2.463869638864483e-05,
"loss": 0.7249,
"step": 3935
},
{
"epoch": 2.5634352635003252,
"grad_norm": 0.9926962852478027,
"learning_rate": 2.458708598757979e-05,
"loss": 0.7318,
"step": 3940
},
{
"epoch": 2.5666883539362395,
"grad_norm": 1.076627254486084,
"learning_rate": 2.4535477346699333e-05,
"loss": 0.7586,
"step": 3945
},
{
"epoch": 2.5699414443721533,
"grad_norm": 1.7046575546264648,
"learning_rate": 2.4483870686002625e-05,
"loss": 0.7482,
"step": 3950
},
{
"epoch": 2.5731945348080676,
"grad_norm": 1.0066241025924683,
"learning_rate": 2.443226622548036e-05,
"loss": 0.7636,
"step": 3955
},
{
"epoch": 2.576447625243982,
"grad_norm": 2.010552406311035,
"learning_rate": 2.4380664185113887e-05,
"loss": 0.7661,
"step": 3960
},
{
"epoch": 2.5797007156798957,
"grad_norm": 1.1133430004119873,
"learning_rate": 2.432906478487423e-05,
"loss": 0.7597,
"step": 3965
},
{
"epoch": 2.58295380611581,
"grad_norm": 1.1634178161621094,
"learning_rate": 2.427746824472113e-05,
"loss": 0.76,
"step": 3970
},
{
"epoch": 2.586206896551724,
"grad_norm": 0.9780275821685791,
"learning_rate": 2.4225874784602184e-05,
"loss": 0.7688,
"step": 3975
},
{
"epoch": 2.589459986987638,
"grad_norm": 1.2186133861541748,
"learning_rate": 2.4174284624451824e-05,
"loss": 0.7309,
"step": 3980
},
{
"epoch": 2.5927130774235523,
"grad_norm": 0.9547963738441467,
"learning_rate": 2.4122697984190428e-05,
"loss": 0.7593,
"step": 3985
},
{
"epoch": 2.5959661678594665,
"grad_norm": 0.943261444568634,
"learning_rate": 2.4071115083723364e-05,
"loss": 0.7562,
"step": 3990
},
{
"epoch": 2.5992192582953804,
"grad_norm": 0.9355084896087646,
"learning_rate": 2.401953614294006e-05,
"loss": 0.7294,
"step": 3995
},
{
"epoch": 2.6024723487312946,
"grad_norm": 1.0167070627212524,
"learning_rate": 2.396796138171307e-05,
"loss": 0.7578,
"step": 4000
},
{
"epoch": 2.605725439167209,
"grad_norm": 0.9536129832267761,
"learning_rate": 2.391639101989712e-05,
"loss": 0.7363,
"step": 4005
},
{
"epoch": 2.6089785296031227,
"grad_norm": 0.9292064309120178,
"learning_rate": 2.3864825277328193e-05,
"loss": 0.7517,
"step": 4010
},
{
"epoch": 2.612231620039037,
"grad_norm": 1.1821918487548828,
"learning_rate": 2.3813264373822578e-05,
"loss": 0.7627,
"step": 4015
},
{
"epoch": 2.6154847104749512,
"grad_norm": 0.9278668165206909,
"learning_rate": 2.376170852917592e-05,
"loss": 0.7673,
"step": 4020
},
{
"epoch": 2.618737800910865,
"grad_norm": 0.9061160683631897,
"learning_rate": 2.3710157963162328e-05,
"loss": 0.774,
"step": 4025
},
{
"epoch": 2.6219908913467793,
"grad_norm": 1.2330580949783325,
"learning_rate": 2.3658612895533393e-05,
"loss": 0.7514,
"step": 4030
},
{
"epoch": 2.6252439817826936,
"grad_norm": 0.9609399437904358,
"learning_rate": 2.3607073546017258e-05,
"loss": 0.7373,
"step": 4035
},
{
"epoch": 2.6284970722186074,
"grad_norm": 1.5064210891723633,
"learning_rate": 2.3555540134317712e-05,
"loss": 0.7487,
"step": 4040
},
{
"epoch": 2.6317501626545217,
"grad_norm": 1.0178202390670776,
"learning_rate": 2.3504012880113216e-05,
"loss": 0.7789,
"step": 4045
},
{
"epoch": 2.635003253090436,
"grad_norm": 0.8506657481193542,
"learning_rate": 2.3452492003055984e-05,
"loss": 0.7316,
"step": 4050
},
{
"epoch": 2.63825634352635,
"grad_norm": 0.9458078145980835,
"learning_rate": 2.3400977722771058e-05,
"loss": 0.7703,
"step": 4055
},
{
"epoch": 2.641509433962264,
"grad_norm": 1.1263021230697632,
"learning_rate": 2.3349470258855337e-05,
"loss": 0.7579,
"step": 4060
},
{
"epoch": 2.6447625243981783,
"grad_norm": 0.8372018933296204,
"learning_rate": 2.3297969830876686e-05,
"loss": 0.76,
"step": 4065
},
{
"epoch": 2.6480156148340925,
"grad_norm": 0.8701651692390442,
"learning_rate": 2.3246476658372973e-05,
"loss": 0.7476,
"step": 4070
},
{
"epoch": 2.6512687052700064,
"grad_norm": 1.3167948722839355,
"learning_rate": 2.3194990960851112e-05,
"loss": 0.7628,
"step": 4075
},
{
"epoch": 2.6545217957059206,
"grad_norm": 1.0400781631469727,
"learning_rate": 2.3143512957786184e-05,
"loss": 0.7773,
"step": 4080
},
{
"epoch": 2.657774886141835,
"grad_norm": 0.9622422456741333,
"learning_rate": 2.309204286862046e-05,
"loss": 0.7469,
"step": 4085
},
{
"epoch": 2.6610279765777487,
"grad_norm": 0.929834246635437,
"learning_rate": 2.3040580912762456e-05,
"loss": 0.7544,
"step": 4090
},
{
"epoch": 2.664281067013663,
"grad_norm": 1.018149495124817,
"learning_rate": 2.298912730958605e-05,
"loss": 0.7746,
"step": 4095
},
{
"epoch": 2.6675341574495772,
"grad_norm": 1.0057318210601807,
"learning_rate": 2.2937682278429494e-05,
"loss": 0.7352,
"step": 4100
},
{
"epoch": 2.6707872478854915,
"grad_norm": 0.9973504543304443,
"learning_rate": 2.288624603859449e-05,
"loss": 0.721,
"step": 4105
},
{
"epoch": 2.6740403383214053,
"grad_norm": 1.0883572101593018,
"learning_rate": 2.2834818809345297e-05,
"loss": 0.7474,
"step": 4110
},
{
"epoch": 2.6772934287573196,
"grad_norm": 1.337254524230957,
"learning_rate": 2.2783400809907726e-05,
"loss": 0.7701,
"step": 4115
},
{
"epoch": 2.680546519193234,
"grad_norm": 1.1612261533737183,
"learning_rate": 2.2731992259468272e-05,
"loss": 0.7547,
"step": 4120
},
{
"epoch": 2.6837996096291477,
"grad_norm": 1.0043455362319946,
"learning_rate": 2.2680593377173124e-05,
"loss": 0.7576,
"step": 4125
},
{
"epoch": 2.687052700065062,
"grad_norm": 1.180498719215393,
"learning_rate": 2.2629204382127284e-05,
"loss": 0.7533,
"step": 4130
},
{
"epoch": 2.690305790500976,
"grad_norm": 1.0349406003952026,
"learning_rate": 2.257782549339359e-05,
"loss": 0.7636,
"step": 4135
},
{
"epoch": 2.69355888093689,
"grad_norm": 1.073776125907898,
"learning_rate": 2.2526456929991793e-05,
"loss": 0.7718,
"step": 4140
},
{
"epoch": 2.6968119713728043,
"grad_norm": 1.114530324935913,
"learning_rate": 2.2475098910897645e-05,
"loss": 0.7445,
"step": 4145
},
{
"epoch": 2.7000650618087185,
"grad_norm": 0.9346311092376709,
"learning_rate": 2.2423751655041952e-05,
"loss": 0.7294,
"step": 4150
},
{
"epoch": 2.7033181522446323,
"grad_norm": 1.086501955986023,
"learning_rate": 2.237241538130961e-05,
"loss": 0.7507,
"step": 4155
},
{
"epoch": 2.7065712426805466,
"grad_norm": 0.9763929843902588,
"learning_rate": 2.2321090308538732e-05,
"loss": 0.743,
"step": 4160
},
{
"epoch": 2.709824333116461,
"grad_norm": 0.8880870938301086,
"learning_rate": 2.2269776655519658e-05,
"loss": 0.7418,
"step": 4165
},
{
"epoch": 2.7130774235523747,
"grad_norm": 0.9564589858055115,
"learning_rate": 2.2218474640994063e-05,
"loss": 0.765,
"step": 4170
},
{
"epoch": 2.716330513988289,
"grad_norm": 1.169952630996704,
"learning_rate": 2.2167184483654013e-05,
"loss": 0.7531,
"step": 4175
},
{
"epoch": 2.719583604424203,
"grad_norm": 0.9627036452293396,
"learning_rate": 2.211590640214101e-05,
"loss": 0.7623,
"step": 4180
},
{
"epoch": 2.722836694860117,
"grad_norm": 0.9291010499000549,
"learning_rate": 2.2064640615045092e-05,
"loss": 0.7641,
"step": 4185
},
{
"epoch": 2.7260897852960313,
"grad_norm": 1.0236008167266846,
"learning_rate": 2.2013387340903893e-05,
"loss": 0.7703,
"step": 4190
},
{
"epoch": 2.7293428757319456,
"grad_norm": 1.2711366415023804,
"learning_rate": 2.1962146798201684e-05,
"loss": 0.7454,
"step": 4195
},
{
"epoch": 2.7325959661678594,
"grad_norm": 1.1424434185028076,
"learning_rate": 2.191091920536849e-05,
"loss": 0.7559,
"step": 4200
},
{
"epoch": 2.7358490566037736,
"grad_norm": 1.4138892889022827,
"learning_rate": 2.1859704780779126e-05,
"loss": 0.7569,
"step": 4205
},
{
"epoch": 2.739102147039688,
"grad_norm": 0.967829704284668,
"learning_rate": 2.1808503742752252e-05,
"loss": 0.7432,
"step": 4210
},
{
"epoch": 2.7423552374756017,
"grad_norm": 0.8999619483947754,
"learning_rate": 2.175731630954949e-05,
"loss": 0.7457,
"step": 4215
},
{
"epoch": 2.745608327911516,
"grad_norm": 1.0657751560211182,
"learning_rate": 2.1706142699374454e-05,
"loss": 0.786,
"step": 4220
},
{
"epoch": 2.7488614183474303,
"grad_norm": 1.5017127990722656,
"learning_rate": 2.1654983130371837e-05,
"loss": 0.7516,
"step": 4225
},
{
"epoch": 2.752114508783344,
"grad_norm": 1.0914252996444702,
"learning_rate": 2.1603837820626478e-05,
"loss": 0.7616,
"step": 4230
},
{
"epoch": 2.7553675992192583,
"grad_norm": 1.1397154331207275,
"learning_rate": 2.1552706988162417e-05,
"loss": 0.761,
"step": 4235
},
{
"epoch": 2.7586206896551726,
"grad_norm": 1.162166714668274,
"learning_rate": 2.1501590850941994e-05,
"loss": 0.7353,
"step": 4240
},
{
"epoch": 2.7618737800910864,
"grad_norm": 1.0100218057632446,
"learning_rate": 2.1450489626864907e-05,
"loss": 0.7446,
"step": 4245
},
{
"epoch": 2.7651268705270007,
"grad_norm": 0.9108495116233826,
"learning_rate": 2.139940353376728e-05,
"loss": 0.7644,
"step": 4250
},
{
"epoch": 2.768379960962915,
"grad_norm": 0.9544759392738342,
"learning_rate": 2.134833278942074e-05,
"loss": 0.7693,
"step": 4255
},
{
"epoch": 2.7716330513988288,
"grad_norm": 1.6715203523635864,
"learning_rate": 2.1297277611531456e-05,
"loss": 0.764,
"step": 4260
},
{
"epoch": 2.774886141834743,
"grad_norm": 1.0044587850570679,
"learning_rate": 2.1246238217739283e-05,
"loss": 0.7593,
"step": 4265
},
{
"epoch": 2.7781392322706573,
"grad_norm": 0.9041277766227722,
"learning_rate": 2.119521482561675e-05,
"loss": 0.7427,
"step": 4270
},
{
"epoch": 2.781392322706571,
"grad_norm": 0.8890901803970337,
"learning_rate": 2.114420765266821e-05,
"loss": 0.7462,
"step": 4275
},
{
"epoch": 2.7846454131424854,
"grad_norm": 0.9522978663444519,
"learning_rate": 2.1093216916328855e-05,
"loss": 0.7398,
"step": 4280
},
{
"epoch": 2.7878985035783996,
"grad_norm": 1.2829575538635254,
"learning_rate": 2.104224283396381e-05,
"loss": 0.7632,
"step": 4285
},
{
"epoch": 2.7911515940143135,
"grad_norm": 0.9626341462135315,
"learning_rate": 2.0991285622867215e-05,
"loss": 0.7681,
"step": 4290
},
{
"epoch": 2.7944046844502277,
"grad_norm": 0.952867865562439,
"learning_rate": 2.0940345500261294e-05,
"loss": 0.7518,
"step": 4295
},
{
"epoch": 2.797657774886142,
"grad_norm": 1.0598902702331543,
"learning_rate": 2.0889422683295407e-05,
"loss": 0.7884,
"step": 4300
},
{
"epoch": 2.800910865322056,
"grad_norm": 1.0540211200714111,
"learning_rate": 2.083851738904516e-05,
"loss": 0.7518,
"step": 4305
},
{
"epoch": 2.80416395575797,
"grad_norm": 0.9470973014831543,
"learning_rate": 2.0787629834511466e-05,
"loss": 0.764,
"step": 4310
},
{
"epoch": 2.8074170461938843,
"grad_norm": 1.127659559249878,
"learning_rate": 2.0736760236619594e-05,
"loss": 0.7332,
"step": 4315
},
{
"epoch": 2.810670136629798,
"grad_norm": 1.0755411386489868,
"learning_rate": 2.0685908812218287e-05,
"loss": 0.7622,
"step": 4320
},
{
"epoch": 2.8139232270657124,
"grad_norm": 1.1209520101547241,
"learning_rate": 2.0635075778078817e-05,
"loss": 0.7416,
"step": 4325
},
{
"epoch": 2.8171763175016267,
"grad_norm": 1.0491728782653809,
"learning_rate": 2.0584261350894046e-05,
"loss": 0.7802,
"step": 4330
},
{
"epoch": 2.8204294079375405,
"grad_norm": 1.025694727897644,
"learning_rate": 2.0533465747277535e-05,
"loss": 0.7487,
"step": 4335
},
{
"epoch": 2.8236824983734548,
"grad_norm": 0.9486551880836487,
"learning_rate": 2.0482689183762588e-05,
"loss": 0.7594,
"step": 4340
},
{
"epoch": 2.826935588809369,
"grad_norm": 0.9839990139007568,
"learning_rate": 2.0431931876801352e-05,
"loss": 0.7431,
"step": 4345
},
{
"epoch": 2.830188679245283,
"grad_norm": 1.0050575733184814,
"learning_rate": 2.03811940427639e-05,
"loss": 0.7527,
"step": 4350
},
{
"epoch": 2.833441769681197,
"grad_norm": 0.9743004441261292,
"learning_rate": 2.033047589793726e-05,
"loss": 0.7307,
"step": 4355
},
{
"epoch": 2.8366948601171114,
"grad_norm": 1.0488122701644897,
"learning_rate": 2.027977765852456e-05,
"loss": 0.7598,
"step": 4360
},
{
"epoch": 2.839947950553025,
"grad_norm": 1.074271321296692,
"learning_rate": 2.022909954064407e-05,
"loss": 0.7571,
"step": 4365
},
{
"epoch": 2.8432010409889394,
"grad_norm": 0.9306830167770386,
"learning_rate": 2.0178441760328268e-05,
"loss": 0.735,
"step": 4370
},
{
"epoch": 2.8464541314248537,
"grad_norm": 0.8995447754859924,
"learning_rate": 2.0127804533522948e-05,
"loss": 0.7519,
"step": 4375
},
{
"epoch": 2.8497072218607675,
"grad_norm": 0.9495101571083069,
"learning_rate": 2.0077188076086288e-05,
"loss": 0.7544,
"step": 4380
},
{
"epoch": 2.852960312296682,
"grad_norm": 1.3610079288482666,
"learning_rate": 2.002659260378794e-05,
"loss": 0.7573,
"step": 4385
},
{
"epoch": 2.856213402732596,
"grad_norm": 0.9668116569519043,
"learning_rate": 1.9976018332308077e-05,
"loss": 0.7332,
"step": 4390
},
{
"epoch": 2.85946649316851,
"grad_norm": 1.128670334815979,
"learning_rate": 1.992546547723651e-05,
"loss": 0.7512,
"step": 4395
},
{
"epoch": 2.862719583604424,
"grad_norm": 1.276426911354065,
"learning_rate": 1.987493425407176e-05,
"loss": 0.7449,
"step": 4400
},
{
"epoch": 2.8659726740403384,
"grad_norm": 0.9716594815254211,
"learning_rate": 1.982442487822011e-05,
"loss": 0.7432,
"step": 4405
},
{
"epoch": 2.869225764476252,
"grad_norm": 0.9533106088638306,
"learning_rate": 1.9773937564994745e-05,
"loss": 0.7423,
"step": 4410
},
{
"epoch": 2.8724788549121665,
"grad_norm": 1.0256469249725342,
"learning_rate": 1.972347252961479e-05,
"loss": 0.7614,
"step": 4415
},
{
"epoch": 2.8757319453480807,
"grad_norm": 1.1626900434494019,
"learning_rate": 1.967302998720438e-05,
"loss": 0.7392,
"step": 4420
},
{
"epoch": 2.8789850357839946,
"grad_norm": 0.9739611744880676,
"learning_rate": 1.9622610152791792e-05,
"loss": 0.7622,
"step": 4425
},
{
"epoch": 2.882238126219909,
"grad_norm": 1.0657685995101929,
"learning_rate": 1.9572213241308507e-05,
"loss": 0.7507,
"step": 4430
},
{
"epoch": 2.885491216655823,
"grad_norm": 1.029432773590088,
"learning_rate": 1.952183946758826e-05,
"loss": 0.7723,
"step": 4435
},
{
"epoch": 2.888744307091737,
"grad_norm": 1.1281373500823975,
"learning_rate": 1.9471489046366185e-05,
"loss": 0.7479,
"step": 4440
},
{
"epoch": 2.891997397527651,
"grad_norm": 1.1470041275024414,
"learning_rate": 1.942116219227784e-05,
"loss": 0.7341,
"step": 4445
},
{
"epoch": 2.8952504879635654,
"grad_norm": 1.0326032638549805,
"learning_rate": 1.937085911985834e-05,
"loss": 0.7571,
"step": 4450
},
{
"epoch": 2.8985035783994793,
"grad_norm": 0.9806135296821594,
"learning_rate": 1.9320580043541425e-05,
"loss": 0.734,
"step": 4455
},
{
"epoch": 2.9017566688353935,
"grad_norm": 1.063024878501892,
"learning_rate": 1.9270325177658523e-05,
"loss": 0.7521,
"step": 4460
},
{
"epoch": 2.905009759271308,
"grad_norm": 4.5842156410217285,
"learning_rate": 1.922009473643787e-05,
"loss": 0.7563,
"step": 4465
},
{
"epoch": 2.9082628497072216,
"grad_norm": 1.3341448307037354,
"learning_rate": 1.9169888934003598e-05,
"loss": 0.7528,
"step": 4470
},
{
"epoch": 2.911515940143136,
"grad_norm": 1.3391072750091553,
"learning_rate": 1.9119707984374774e-05,
"loss": 0.737,
"step": 4475
},
{
"epoch": 2.91476903057905,
"grad_norm": 0.985970139503479,
"learning_rate": 1.9069552101464552e-05,
"loss": 0.7657,
"step": 4480
},
{
"epoch": 2.918022121014964,
"grad_norm": 1.069992184638977,
"learning_rate": 1.901942149907922e-05,
"loss": 0.7526,
"step": 4485
},
{
"epoch": 2.921275211450878,
"grad_norm": 0.8812434077262878,
"learning_rate": 1.8969316390917288e-05,
"loss": 0.7664,
"step": 4490
},
{
"epoch": 2.9245283018867925,
"grad_norm": 1.2932692766189575,
"learning_rate": 1.891923699056861e-05,
"loss": 0.7553,
"step": 4495
},
{
"epoch": 2.9277813923227067,
"grad_norm": 0.935070276260376,
"learning_rate": 1.886918351151343e-05,
"loss": 0.7583,
"step": 4500
},
{
"epoch": 2.9310344827586206,
"grad_norm": 0.9840937852859497,
"learning_rate": 1.881915616712151e-05,
"loss": 0.748,
"step": 4505
},
{
"epoch": 2.934287573194535,
"grad_norm": 1.0583505630493164,
"learning_rate": 1.8769155170651203e-05,
"loss": 0.7482,
"step": 4510
},
{
"epoch": 2.937540663630449,
"grad_norm": 1.0253130197525024,
"learning_rate": 1.8719180735248522e-05,
"loss": 0.751,
"step": 4515
},
{
"epoch": 2.940793754066363,
"grad_norm": 1.0491794347763062,
"learning_rate": 1.8669233073946303e-05,
"loss": 0.7533,
"step": 4520
},
{
"epoch": 2.944046844502277,
"grad_norm": 1.1201449632644653,
"learning_rate": 1.86193123996632e-05,
"loss": 0.7486,
"step": 4525
},
{
"epoch": 2.9472999349381914,
"grad_norm": 1.3683768510818481,
"learning_rate": 1.856941892520284e-05,
"loss": 0.7584,
"step": 4530
},
{
"epoch": 2.9505530253741052,
"grad_norm": 1.0555903911590576,
"learning_rate": 1.851955286325292e-05,
"loss": 0.7554,
"step": 4535
},
{
"epoch": 2.9538061158100195,
"grad_norm": 1.5055445432662964,
"learning_rate": 1.846971442638426e-05,
"loss": 0.7418,
"step": 4540
},
{
"epoch": 2.9570592062459338,
"grad_norm": 1.222474455833435,
"learning_rate": 1.841990382704993e-05,
"loss": 0.7455,
"step": 4545
},
{
"epoch": 2.960312296681848,
"grad_norm": 1.0359810590744019,
"learning_rate": 1.8370121277584325e-05,
"loss": 0.7404,
"step": 4550
},
{
"epoch": 2.963565387117762,
"grad_norm": 1.2511727809906006,
"learning_rate": 1.8320366990202276e-05,
"loss": 0.7228,
"step": 4555
},
{
"epoch": 2.966818477553676,
"grad_norm": 0.8730882406234741,
"learning_rate": 1.827064117699814e-05,
"loss": 0.7586,
"step": 4560
},
{
"epoch": 2.9700715679895904,
"grad_norm": 1.5805312395095825,
"learning_rate": 1.822094404994487e-05,
"loss": 0.7499,
"step": 4565
},
{
"epoch": 2.973324658425504,
"grad_norm": 1.1607098579406738,
"learning_rate": 1.817127582089317e-05,
"loss": 0.7637,
"step": 4570
},
{
"epoch": 2.9765777488614185,
"grad_norm": 0.9193926453590393,
"learning_rate": 1.8121636701570537e-05,
"loss": 0.7532,
"step": 4575
},
{
"epoch": 2.9798308392973327,
"grad_norm": 1.0218764543533325,
"learning_rate": 1.807202690358037e-05,
"loss": 0.7503,
"step": 4580
},
{
"epoch": 2.9830839297332465,
"grad_norm": 1.0876221656799316,
"learning_rate": 1.802244663840109e-05,
"loss": 0.7707,
"step": 4585
},
{
"epoch": 2.986337020169161,
"grad_norm": 1.0459486246109009,
"learning_rate": 1.797289611738523e-05,
"loss": 0.7397,
"step": 4590
},
{
"epoch": 2.989590110605075,
"grad_norm": 1.0498055219650269,
"learning_rate": 1.7923375551758505e-05,
"loss": 0.7691,
"step": 4595
},
{
"epoch": 2.992843201040989,
"grad_norm": 0.9780749082565308,
"learning_rate": 1.7873885152618956e-05,
"loss": 0.7525,
"step": 4600
},
{
"epoch": 2.996096291476903,
"grad_norm": 1.0338603258132935,
"learning_rate": 1.7824425130936023e-05,
"loss": 0.7459,
"step": 4605
},
{
"epoch": 2.9993493819128174,
"grad_norm": 0.9098593592643738,
"learning_rate": 1.7774995697549645e-05,
"loss": 0.7488,
"step": 4610
},
{
"epoch": 3.0,
"eval_f1": 0.8012369099843738,
"eval_loss": 0.45166015625,
"eval_precision": 0.8020338050069477,
"eval_recall": 0.8006626052475169,
"eval_runtime": 238.3932,
"eval_samples_per_second": 1650.361,
"eval_steps_per_second": 1.615,
"step": 4611
},
{
"epoch": 3.0026024723487312,
"grad_norm": 1.3282872438430786,
"learning_rate": 1.7725597063169386e-05,
"loss": 0.6622,
"step": 4615
},
{
"epoch": 3.0058555627846455,
"grad_norm": 1.3152724504470825,
"learning_rate": 1.767622943837349e-05,
"loss": 0.6352,
"step": 4620
},
{
"epoch": 3.0091086532205593,
"grad_norm": 1.105705976486206,
"learning_rate": 1.7626893033608038e-05,
"loss": 0.6291,
"step": 4625
},
{
"epoch": 3.0123617436564736,
"grad_norm": 1.0462555885314941,
"learning_rate": 1.7577588059186027e-05,
"loss": 0.6476,
"step": 4630
},
{
"epoch": 3.015614834092388,
"grad_norm": 1.0921547412872314,
"learning_rate": 1.7528314725286443e-05,
"loss": 0.6358,
"step": 4635
},
{
"epoch": 3.018867924528302,
"grad_norm": 1.1877232789993286,
"learning_rate": 1.747907324195342e-05,
"loss": 0.6434,
"step": 4640
},
{
"epoch": 3.022121014964216,
"grad_norm": 1.1791988611221313,
"learning_rate": 1.7429863819095313e-05,
"loss": 0.6372,
"step": 4645
},
{
"epoch": 3.02537410540013,
"grad_norm": 1.23057222366333,
"learning_rate": 1.738068666648379e-05,
"loss": 0.6521,
"step": 4650
},
{
"epoch": 3.0286271958360445,
"grad_norm": 1.0966289043426514,
"learning_rate": 1.7331541993752993e-05,
"loss": 0.6337,
"step": 4655
},
{
"epoch": 3.0318802862719583,
"grad_norm": 1.108396291732788,
"learning_rate": 1.7282430010398577e-05,
"loss": 0.6394,
"step": 4660
},
{
"epoch": 3.0351333767078725,
"grad_norm": 1.2432180643081665,
"learning_rate": 1.723335092577686e-05,
"loss": 0.6319,
"step": 4665
},
{
"epoch": 3.038386467143787,
"grad_norm": 1.5450379848480225,
"learning_rate": 1.718430494910391e-05,
"loss": 0.632,
"step": 4670
},
{
"epoch": 3.0416395575797006,
"grad_norm": 1.3607127666473389,
"learning_rate": 1.713529228945466e-05,
"loss": 0.6608,
"step": 4675
},
{
"epoch": 3.044892648015615,
"grad_norm": 1.0697190761566162,
"learning_rate": 1.7086313155762046e-05,
"loss": 0.6263,
"step": 4680
},
{
"epoch": 3.048145738451529,
"grad_norm": 1.3838845491409302,
"learning_rate": 1.703736775681604e-05,
"loss": 0.6367,
"step": 4685
},
{
"epoch": 3.051398828887443,
"grad_norm": 1.324628233909607,
"learning_rate": 1.6988456301262854e-05,
"loss": 0.6435,
"step": 4690
},
{
"epoch": 3.0546519193233572,
"grad_norm": 1.2009634971618652,
"learning_rate": 1.6939578997603983e-05,
"loss": 0.6467,
"step": 4695
},
{
"epoch": 3.0579050097592715,
"grad_norm": 1.2275351285934448,
"learning_rate": 1.689073605419533e-05,
"loss": 0.6403,
"step": 4700
},
{
"epoch": 3.0611581001951853,
"grad_norm": 1.9216879606246948,
"learning_rate": 1.6841927679246345e-05,
"loss": 0.6186,
"step": 4705
},
{
"epoch": 3.0644111906310996,
"grad_norm": 2.3563551902770996,
"learning_rate": 1.679315408081911e-05,
"loss": 0.6202,
"step": 4710
},
{
"epoch": 3.067664281067014,
"grad_norm": 1.435333490371704,
"learning_rate": 1.6744415466827463e-05,
"loss": 0.6273,
"step": 4715
},
{
"epoch": 3.0709173715029277,
"grad_norm": 1.315987229347229,
"learning_rate": 1.6695712045036104e-05,
"loss": 0.6318,
"step": 4720
},
{
"epoch": 3.074170461938842,
"grad_norm": 1.5982025861740112,
"learning_rate": 1.6647044023059712e-05,
"loss": 0.6384,
"step": 4725
},
{
"epoch": 3.077423552374756,
"grad_norm": 1.998374104499817,
"learning_rate": 1.659841160836207e-05,
"loss": 0.6286,
"step": 4730
},
{
"epoch": 3.08067664281067,
"grad_norm": 1.3811148405075073,
"learning_rate": 1.6549815008255176e-05,
"loss": 0.6482,
"step": 4735
},
{
"epoch": 3.0839297332465843,
"grad_norm": 1.2464516162872314,
"learning_rate": 1.6501254429898343e-05,
"loss": 0.6433,
"step": 4740
},
{
"epoch": 3.0871828236824985,
"grad_norm": 1.2944623231887817,
"learning_rate": 1.6452730080297342e-05,
"loss": 0.6328,
"step": 4745
},
{
"epoch": 3.0904359141184123,
"grad_norm": 1.1027922630310059,
"learning_rate": 1.6404242166303507e-05,
"loss": 0.6357,
"step": 4750
},
{
"epoch": 3.0936890045543266,
"grad_norm": 3.5568132400512695,
"learning_rate": 1.6355790894612834e-05,
"loss": 0.6081,
"step": 4755
},
{
"epoch": 3.096942094990241,
"grad_norm": 1.588714838027954,
"learning_rate": 1.630737647176514e-05,
"loss": 0.6601,
"step": 4760
},
{
"epoch": 3.1001951854261547,
"grad_norm": 1.1922274827957153,
"learning_rate": 1.6258999104143157e-05,
"loss": 0.6145,
"step": 4765
},
{
"epoch": 3.103448275862069,
"grad_norm": 1.3667454719543457,
"learning_rate": 1.621065899797165e-05,
"loss": 0.6372,
"step": 4770
},
{
"epoch": 3.106701366297983,
"grad_norm": 1.8918445110321045,
"learning_rate": 1.616235635931655e-05,
"loss": 0.6152,
"step": 4775
},
{
"epoch": 3.109954456733897,
"grad_norm": 1.293562650680542,
"learning_rate": 1.611409139408406e-05,
"loss": 0.6211,
"step": 4780
},
{
"epoch": 3.1132075471698113,
"grad_norm": 1.446754813194275,
"learning_rate": 1.6065864308019807e-05,
"loss": 0.6453,
"step": 4785
},
{
"epoch": 3.1164606376057256,
"grad_norm": 1.1851979494094849,
"learning_rate": 1.6017675306707926e-05,
"loss": 0.631,
"step": 4790
},
{
"epoch": 3.1197137280416394,
"grad_norm": 1.3031965494155884,
"learning_rate": 1.5969524595570216e-05,
"loss": 0.6184,
"step": 4795
},
{
"epoch": 3.1229668184775536,
"grad_norm": 2.6355156898498535,
"learning_rate": 1.5921412379865257e-05,
"loss": 0.6451,
"step": 4800
},
{
"epoch": 3.126219908913468,
"grad_norm": 1.4367573261260986,
"learning_rate": 1.58733388646875e-05,
"loss": 0.6466,
"step": 4805
},
{
"epoch": 3.1294729993493817,
"grad_norm": 1.4838011264801025,
"learning_rate": 1.5825304254966445e-05,
"loss": 0.6181,
"step": 4810
},
{
"epoch": 3.132726089785296,
"grad_norm": 1.2338780164718628,
"learning_rate": 1.577730875546575e-05,
"loss": 0.6179,
"step": 4815
},
{
"epoch": 3.1359791802212102,
"grad_norm": 1.4179608821868896,
"learning_rate": 1.5729352570782324e-05,
"loss": 0.6362,
"step": 4820
},
{
"epoch": 3.139232270657124,
"grad_norm": 1.2671458721160889,
"learning_rate": 1.5681435905345522e-05,
"loss": 0.6365,
"step": 4825
},
{
"epoch": 3.1424853610930383,
"grad_norm": 1.368369221687317,
"learning_rate": 1.5643131164122626e-05,
"loss": 0.6102,
"step": 4830
},
{
"epoch": 3.1457384515289526,
"grad_norm": 1.341280460357666,
"learning_rate": 1.5595286147953364e-05,
"loss": 0.637,
"step": 4835
},
{
"epoch": 3.1489915419648664,
"grad_norm": 1.5806121826171875,
"learning_rate": 1.5547481222533846e-05,
"loss": 0.6296,
"step": 4840
},
{
"epoch": 3.1522446324007807,
"grad_norm": 1.505342721939087,
"learning_rate": 1.549971659164861e-05,
"loss": 0.6284,
"step": 4845
},
{
"epoch": 3.155497722836695,
"grad_norm": 1.2677946090698242,
"learning_rate": 1.5451992458910442e-05,
"loss": 0.6134,
"step": 4850
},
{
"epoch": 3.1587508132726088,
"grad_norm": 1.2727744579315186,
"learning_rate": 1.540430902775946e-05,
"loss": 0.626,
"step": 4855
},
{
"epoch": 3.162003903708523,
"grad_norm": 1.258187174797058,
"learning_rate": 1.5356666501462314e-05,
"loss": 0.6085,
"step": 4860
},
{
"epoch": 3.1652569941444373,
"grad_norm": 1.589736819267273,
"learning_rate": 1.5309065083111255e-05,
"loss": 0.6247,
"step": 4865
},
{
"epoch": 3.168510084580351,
"grad_norm": 1.2900131940841675,
"learning_rate": 1.5261504975623306e-05,
"loss": 0.624,
"step": 4870
},
{
"epoch": 3.1717631750162654,
"grad_norm": 2.3252532482147217,
"learning_rate": 1.5213986381739393e-05,
"loss": 0.6295,
"step": 4875
},
{
"epoch": 3.1750162654521796,
"grad_norm": 1.3652303218841553,
"learning_rate": 1.5166509504023473e-05,
"loss": 0.6274,
"step": 4880
},
{
"epoch": 3.178269355888094,
"grad_norm": 1.8075648546218872,
"learning_rate": 1.5119074544861678e-05,
"loss": 0.6375,
"step": 4885
},
{
"epoch": 3.1815224463240077,
"grad_norm": 1.2221382856369019,
"learning_rate": 1.5071681706461438e-05,
"loss": 0.6273,
"step": 4890
},
{
"epoch": 3.184775536759922,
"grad_norm": 1.5147900581359863,
"learning_rate": 1.5024331190850637e-05,
"loss": 0.6381,
"step": 4895
},
{
"epoch": 3.1880286271958362,
"grad_norm": 2.4453020095825195,
"learning_rate": 1.4977023199876743e-05,
"loss": 0.6552,
"step": 4900
},
{
"epoch": 3.19128171763175,
"grad_norm": 2.3050053119659424,
"learning_rate": 1.4929757935205951e-05,
"loss": 0.6176,
"step": 4905
},
{
"epoch": 3.1945348080676643,
"grad_norm": 1.289581060409546,
"learning_rate": 1.4882535598322311e-05,
"loss": 0.6253,
"step": 4910
},
{
"epoch": 3.1977878985035786,
"grad_norm": 1.5076651573181152,
"learning_rate": 1.4835356390526888e-05,
"loss": 0.6194,
"step": 4915
},
{
"epoch": 3.2010409889394924,
"grad_norm": 1.4202001094818115,
"learning_rate": 1.478822051293689e-05,
"loss": 0.6081,
"step": 4920
},
{
"epoch": 3.2042940793754067,
"grad_norm": 1.287611961364746,
"learning_rate": 1.4741128166484824e-05,
"loss": 0.6429,
"step": 4925
},
{
"epoch": 3.207547169811321,
"grad_norm": 1.2236043214797974,
"learning_rate": 1.4694079551917629e-05,
"loss": 0.6176,
"step": 4930
},
{
"epoch": 3.2108002602472347,
"grad_norm": 1.3410075902938843,
"learning_rate": 1.4656472282003922e-05,
"loss": 0.6209,
"step": 4935
},
{
"epoch": 3.214053350683149,
"grad_norm": 1.419541835784912,
"learning_rate": 1.4609502890116145e-05,
"loss": 0.6436,
"step": 4940
},
{
"epoch": 3.2173064411190633,
"grad_norm": 1.7478810548782349,
"learning_rate": 1.4562577791210158e-05,
"loss": 0.6023,
"step": 4945
},
{
"epoch": 3.220559531554977,
"grad_norm": 1.8083374500274658,
"learning_rate": 1.4515697185319946e-05,
"loss": 0.6166,
"step": 4950
},
{
"epoch": 3.2238126219908914,
"grad_norm": 2.203806161880493,
"learning_rate": 1.4468861272289818e-05,
"loss": 0.636,
"step": 4955
},
{
"epoch": 3.2270657124268056,
"grad_norm": 1.3574259281158447,
"learning_rate": 1.4422070251773594e-05,
"loss": 0.6012,
"step": 4960
},
{
"epoch": 3.2303188028627194,
"grad_norm": 1.4441782236099243,
"learning_rate": 1.4375324323233697e-05,
"loss": 0.6197,
"step": 4965
},
{
"epoch": 3.2335718932986337,
"grad_norm": 1.7502111196517944,
"learning_rate": 1.4328623685940335e-05,
"loss": 0.6354,
"step": 4970
},
{
"epoch": 3.236824983734548,
"grad_norm": 1.5651460886001587,
"learning_rate": 1.4281968538970646e-05,
"loss": 0.6257,
"step": 4975
},
{
"epoch": 3.240078074170462,
"grad_norm": 1.3271369934082031,
"learning_rate": 1.4235359081207871e-05,
"loss": 0.6378,
"step": 4980
},
{
"epoch": 3.243331164606376,
"grad_norm": 1.354906678199768,
"learning_rate": 1.4188795511340461e-05,
"loss": 0.6324,
"step": 4985
},
{
"epoch": 3.2465842550422903,
"grad_norm": 1.295578956604004,
"learning_rate": 1.4142278027861253e-05,
"loss": 0.6176,
"step": 4990
},
{
"epoch": 3.249837345478204,
"grad_norm": 1.4495329856872559,
"learning_rate": 1.4095806829066655e-05,
"loss": 0.6387,
"step": 4995
},
{
"epoch": 3.2530904359141184,
"grad_norm": 1.3459370136260986,
"learning_rate": 1.404938211305574e-05,
"loss": 0.6343,
"step": 5000
},
{
"epoch": 3.2563435263500327,
"grad_norm": 1.299459457397461,
"learning_rate": 1.4003004077729438e-05,
"loss": 0.6394,
"step": 5005
},
{
"epoch": 3.2595966167859465,
"grad_norm": 1.3181241750717163,
"learning_rate": 1.3956672920789705e-05,
"loss": 0.6135,
"step": 5010
},
{
"epoch": 3.2628497072218607,
"grad_norm": 1.5811583995819092,
"learning_rate": 1.3910388839738647e-05,
"loss": 0.6377,
"step": 5015
},
{
"epoch": 3.266102797657775,
"grad_norm": 1.3512473106384277,
"learning_rate": 1.386415203187768e-05,
"loss": 0.6293,
"step": 5020
},
{
"epoch": 3.269355888093689,
"grad_norm": 1.8290486335754395,
"learning_rate": 1.3817962694306747e-05,
"loss": 0.635,
"step": 5025
},
{
"epoch": 3.272608978529603,
"grad_norm": 1.5076416730880737,
"learning_rate": 1.3771821023923383e-05,
"loss": 0.6027,
"step": 5030
},
{
"epoch": 3.2758620689655173,
"grad_norm": 1.5753469467163086,
"learning_rate": 1.3725727217421947e-05,
"loss": 0.6165,
"step": 5035
},
{
"epoch": 3.279115159401431,
"grad_norm": 1.5028088092803955,
"learning_rate": 1.3679681471292776e-05,
"loss": 0.621,
"step": 5040
},
{
"epoch": 3.2823682498373454,
"grad_norm": 1.4654455184936523,
"learning_rate": 1.363368398182131e-05,
"loss": 0.6266,
"step": 5045
},
{
"epoch": 3.2856213402732597,
"grad_norm": 1.7276520729064941,
"learning_rate": 1.3587734945087277e-05,
"loss": 0.6258,
"step": 5050
},
{
"epoch": 3.288874430709174,
"grad_norm": 1.710095763206482,
"learning_rate": 1.3541834556963895e-05,
"loss": 0.6388,
"step": 5055
},
{
"epoch": 3.2921275211450878,
"grad_norm": 1.6146140098571777,
"learning_rate": 1.3495983013116953e-05,
"loss": 0.6466,
"step": 5060
},
{
"epoch": 3.295380611581002,
"grad_norm": 1.3169276714324951,
"learning_rate": 1.3450180509004066e-05,
"loss": 0.6389,
"step": 5065
},
{
"epoch": 3.2986337020169163,
"grad_norm": 2.564819574356079,
"learning_rate": 1.3404427239873763e-05,
"loss": 0.6158,
"step": 5070
},
{
"epoch": 3.30188679245283,
"grad_norm": 1.6384319067001343,
"learning_rate": 1.335872340076474e-05,
"loss": 0.6241,
"step": 5075
},
{
"epoch": 3.3051398828887444,
"grad_norm": 1.4620628356933594,
"learning_rate": 1.3313069186504929e-05,
"loss": 0.6203,
"step": 5080
},
{
"epoch": 3.3083929733246586,
"grad_norm": 1.7426296472549438,
"learning_rate": 1.3267464791710747e-05,
"loss": 0.6238,
"step": 5085
},
{
"epoch": 3.3116460637605725,
"grad_norm": 2.093579053878784,
"learning_rate": 1.3221910410786248e-05,
"loss": 0.6144,
"step": 5090
},
{
"epoch": 3.3148991541964867,
"grad_norm": 1.4141899347305298,
"learning_rate": 1.3176406237922262e-05,
"loss": 0.6145,
"step": 5095
},
{
"epoch": 3.318152244632401,
"grad_norm": 1.2416197061538696,
"learning_rate": 1.3130952467095593e-05,
"loss": 0.6134,
"step": 5100
},
{
"epoch": 3.321405335068315,
"grad_norm": 1.6651731729507446,
"learning_rate": 1.3085549292068213e-05,
"loss": 0.6366,
"step": 5105
},
{
"epoch": 3.324658425504229,
"grad_norm": 1.4123419523239136,
"learning_rate": 1.3040196906386392e-05,
"loss": 0.6363,
"step": 5110
},
{
"epoch": 3.3279115159401433,
"grad_norm": 1.5788094997406006,
"learning_rate": 1.2994895503379886e-05,
"loss": 0.6463,
"step": 5115
},
{
"epoch": 3.331164606376057,
"grad_norm": 1.9464671611785889,
"learning_rate": 1.2949645276161149e-05,
"loss": 0.6193,
"step": 5120
},
{
"epoch": 3.3344176968119714,
"grad_norm": 1.3868358135223389,
"learning_rate": 1.2904446417624457e-05,
"loss": 0.6182,
"step": 5125
},
{
"epoch": 3.3376707872478857,
"grad_norm": 7.827129364013672,
"learning_rate": 1.2859299120445107e-05,
"loss": 0.615,
"step": 5130
},
{
"epoch": 3.3409238776837995,
"grad_norm": 1.3248870372772217,
"learning_rate": 1.2814203577078626e-05,
"loss": 0.6286,
"step": 5135
},
{
"epoch": 3.3441769681197138,
"grad_norm": 1.3587925434112549,
"learning_rate": 1.2769159979759899e-05,
"loss": 0.6285,
"step": 5140
},
{
"epoch": 3.347430058555628,
"grad_norm": 1.518294095993042,
"learning_rate": 1.2724168520502371e-05,
"loss": 0.6304,
"step": 5145
},
{
"epoch": 3.350683148991542,
"grad_norm": 1.2859338521957397,
"learning_rate": 1.2679229391097241e-05,
"loss": 0.6299,
"step": 5150
},
{
"epoch": 3.353936239427456,
"grad_norm": 1.3024553060531616,
"learning_rate": 1.2634342783112646e-05,
"loss": 0.6177,
"step": 5155
},
{
"epoch": 3.3571893298633704,
"grad_norm": 3.6768040657043457,
"learning_rate": 1.258950888789281e-05,
"loss": 0.6385,
"step": 5160
},
{
"epoch": 3.360442420299284,
"grad_norm": 1.476014256477356,
"learning_rate": 1.2544727896557257e-05,
"loss": 0.6313,
"step": 5165
},
{
"epoch": 3.3636955107351985,
"grad_norm": 2.193185806274414,
"learning_rate": 1.2500000000000006e-05,
"loss": 0.6386,
"step": 5170
},
{
"epoch": 3.3669486011711127,
"grad_norm": 1.4634368419647217,
"learning_rate": 1.2455325388888726e-05,
"loss": 0.617,
"step": 5175
},
{
"epoch": 3.3702016916070265,
"grad_norm": 1.770553708076477,
"learning_rate": 1.2410704253663932e-05,
"loss": 0.637,
"step": 5180
},
{
"epoch": 3.373454782042941,
"grad_norm": 1.7664306163787842,
"learning_rate": 1.236613678453821e-05,
"loss": 0.6203,
"step": 5185
},
{
"epoch": 3.376707872478855,
"grad_norm": 1.4499051570892334,
"learning_rate": 1.232162317149535e-05,
"loss": 0.6417,
"step": 5190
},
{
"epoch": 3.379960962914769,
"grad_norm": 2.710038661956787,
"learning_rate": 1.2277163604289558e-05,
"loss": 0.6246,
"step": 5195
},
{
"epoch": 3.383214053350683,
"grad_norm": 1.9992517232894897,
"learning_rate": 1.2232758272444672e-05,
"loss": 0.6188,
"step": 5200
},
{
"epoch": 3.3864671437865974,
"grad_norm": 1.1757420301437378,
"learning_rate": 1.2188407365253337e-05,
"loss": 0.6232,
"step": 5205
},
{
"epoch": 3.3897202342225112,
"grad_norm": 1.3049498796463013,
"learning_rate": 1.2144111071776174e-05,
"loss": 0.6314,
"step": 5210
},
{
"epoch": 3.3929733246584255,
"grad_norm": 1.2970354557037354,
"learning_rate": 1.209986958084099e-05,
"loss": 0.6361,
"step": 5215
},
{
"epoch": 3.3962264150943398,
"grad_norm": 1.4407247304916382,
"learning_rate": 1.205568308104201e-05,
"loss": 0.6246,
"step": 5220
},
{
"epoch": 3.3994795055302536,
"grad_norm": 1.673065185546875,
"learning_rate": 1.2011551760739014e-05,
"loss": 0.6318,
"step": 5225
},
{
"epoch": 3.402732595966168,
"grad_norm": 1.4697465896606445,
"learning_rate": 1.196747580805656e-05,
"loss": 0.6417,
"step": 5230
},
{
"epoch": 3.405985686402082,
"grad_norm": 1.6552962064743042,
"learning_rate": 1.1923455410883212e-05,
"loss": 0.6343,
"step": 5235
},
{
"epoch": 3.409238776837996,
"grad_norm": 1.5813676118850708,
"learning_rate": 1.1879490756870674e-05,
"loss": 0.6352,
"step": 5240
},
{
"epoch": 3.41249186727391,
"grad_norm": 3.213158130645752,
"learning_rate": 1.1835582033433037e-05,
"loss": 0.6352,
"step": 5245
},
{
"epoch": 3.4157449577098244,
"grad_norm": 1.2842360734939575,
"learning_rate": 1.1791729427745992e-05,
"loss": 0.6416,
"step": 5250
},
{
"epoch": 3.4189980481457383,
"grad_norm": 1.6811124086380005,
"learning_rate": 1.1747933126745983e-05,
"loss": 0.651,
"step": 5255
},
{
"epoch": 3.4222511385816525,
"grad_norm": 1.2236487865447998,
"learning_rate": 1.170419331712943e-05,
"loss": 0.641,
"step": 5260
},
{
"epoch": 3.425504229017567,
"grad_norm": 1.3968175649642944,
"learning_rate": 1.1660510185351978e-05,
"loss": 0.6271,
"step": 5265
},
{
"epoch": 3.4287573194534806,
"grad_norm": 2.152369976043701,
"learning_rate": 1.161688391762763e-05,
"loss": 0.633,
"step": 5270
},
{
"epoch": 3.432010409889395,
"grad_norm": 1.5563530921936035,
"learning_rate": 1.1573314699927985e-05,
"loss": 0.6429,
"step": 5275
},
{
"epoch": 3.435263500325309,
"grad_norm": 1.4173344373703003,
"learning_rate": 1.1529802717981475e-05,
"loss": 0.6344,
"step": 5280
},
{
"epoch": 3.438516590761223,
"grad_norm": 1.8149155378341675,
"learning_rate": 1.1486348157272526e-05,
"loss": 0.6278,
"step": 5285
},
{
"epoch": 3.441769681197137,
"grad_norm": 1.4700722694396973,
"learning_rate": 1.1442951203040775e-05,
"loss": 0.607,
"step": 5290
},
{
"epoch": 3.4450227716330515,
"grad_norm": 1.4950767755508423,
"learning_rate": 1.139961204028033e-05,
"loss": 0.6298,
"step": 5295
},
{
"epoch": 3.4482758620689653,
"grad_norm": 1.702974796295166,
"learning_rate": 1.1356330853738906e-05,
"loss": 0.6599,
"step": 5300
},
{
"epoch": 3.4515289525048796,
"grad_norm": 1.7694127559661865,
"learning_rate": 1.1313107827917083e-05,
"loss": 0.6235,
"step": 5305
},
{
"epoch": 3.454782042940794,
"grad_norm": 1.2292397022247314,
"learning_rate": 1.1269943147067535e-05,
"loss": 0.6264,
"step": 5310
},
{
"epoch": 3.4580351333767076,
"grad_norm": 1.3355427980422974,
"learning_rate": 1.1226836995194196e-05,
"loss": 0.6274,
"step": 5315
},
{
"epoch": 3.461288223812622,
"grad_norm": 1.313506841659546,
"learning_rate": 1.1183789556051508e-05,
"loss": 0.6075,
"step": 5320
},
{
"epoch": 3.464541314248536,
"grad_norm": 1.3950237035751343,
"learning_rate": 1.1140801013143618e-05,
"loss": 0.606,
"step": 5325
},
{
"epoch": 3.46779440468445,
"grad_norm": 1.4222460985183716,
"learning_rate": 1.1097871549723629e-05,
"loss": 0.6238,
"step": 5330
},
{
"epoch": 3.4710474951203643,
"grad_norm": 1.701815128326416,
"learning_rate": 1.1055001348792807e-05,
"loss": 0.6227,
"step": 5335
},
{
"epoch": 3.4743005855562785,
"grad_norm": 1.5569487810134888,
"learning_rate": 1.1012190593099744e-05,
"loss": 0.643,
"step": 5340
},
{
"epoch": 3.4775536759921923,
"grad_norm": 1.3712338209152222,
"learning_rate": 1.0969439465139687e-05,
"loss": 0.6167,
"step": 5345
},
{
"epoch": 3.4808067664281066,
"grad_norm": 1.3950178623199463,
"learning_rate": 1.0926748147153648e-05,
"loss": 0.6318,
"step": 5350
},
{
"epoch": 3.484059856864021,
"grad_norm": 1.347066044807434,
"learning_rate": 1.088411682112771e-05,
"loss": 0.6225,
"step": 5355
},
{
"epoch": 3.487312947299935,
"grad_norm": 1.347697138786316,
"learning_rate": 1.08415456687922e-05,
"loss": 0.6225,
"step": 5360
},
{
"epoch": 3.490566037735849,
"grad_norm": 1.5315964221954346,
"learning_rate": 1.0799034871620958e-05,
"loss": 0.6067,
"step": 5365
},
{
"epoch": 3.493819128171763,
"grad_norm": 1.3384947776794434,
"learning_rate": 1.0756584610830523e-05,
"loss": 0.6235,
"step": 5370
},
{
"epoch": 3.4970722186076775,
"grad_norm": 1.3656494617462158,
"learning_rate": 1.071419506737937e-05,
"loss": 0.6347,
"step": 5375
},
{
"epoch": 3.5003253090435913,
"grad_norm": 1.3071860074996948,
"learning_rate": 1.0671866421967175e-05,
"loss": 0.6108,
"step": 5380
},
{
"epoch": 3.5035783994795056,
"grad_norm": 1.3579492568969727,
"learning_rate": 1.062959885503399e-05,
"loss": 0.6354,
"step": 5385
},
{
"epoch": 3.5068314899154194,
"grad_norm": 1.52472722530365,
"learning_rate": 1.0587392546759498e-05,
"loss": 0.6177,
"step": 5390
},
{
"epoch": 3.5100845803513336,
"grad_norm": 1.7216352224349976,
"learning_rate": 1.0545247677062273e-05,
"loss": 0.6225,
"step": 5395
},
{
"epoch": 3.513337670787248,
"grad_norm": 1.3169187307357788,
"learning_rate": 1.050316442559896e-05,
"loss": 0.6196,
"step": 5400
},
{
"epoch": 3.516590761223162,
"grad_norm": 1.7447690963745117,
"learning_rate": 1.0461142971763535e-05,
"loss": 0.6338,
"step": 5405
},
{
"epoch": 3.519843851659076,
"grad_norm": 1.4032801389694214,
"learning_rate": 1.0419183494686574e-05,
"loss": 0.6261,
"step": 5410
},
{
"epoch": 3.5230969420949902,
"grad_norm": 1.6217771768569946,
"learning_rate": 1.0377286173234416e-05,
"loss": 0.6306,
"step": 5415
},
{
"epoch": 3.5263500325309045,
"grad_norm": 1.2982110977172852,
"learning_rate": 1.0335451186008454e-05,
"loss": 0.6242,
"step": 5420
},
{
"epoch": 3.5296031229668183,
"grad_norm": 1.2958654165267944,
"learning_rate": 1.0293678711344382e-05,
"loss": 0.6292,
"step": 5425
},
{
"epoch": 3.5328562134027326,
"grad_norm": 1.7522900104522705,
"learning_rate": 1.0251968927311384e-05,
"loss": 0.6541,
"step": 5430
},
{
"epoch": 3.536109303838647,
"grad_norm": 1.435259222984314,
"learning_rate": 1.0210322011711408e-05,
"loss": 0.6064,
"step": 5435
},
{
"epoch": 3.5393623942745607,
"grad_norm": 1.3290374279022217,
"learning_rate": 1.0168738142078429e-05,
"loss": 0.6255,
"step": 5440
},
{
"epoch": 3.542615484710475,
"grad_norm": 1.3328436613082886,
"learning_rate": 1.012721749567764e-05,
"loss": 0.6006,
"step": 5445
},
{
"epoch": 3.545868575146389,
"grad_norm": 1.3372770547866821,
"learning_rate": 1.0085760249504728e-05,
"loss": 0.6194,
"step": 5450
},
{
"epoch": 3.5491216655823035,
"grad_norm": 1.7760313749313354,
"learning_rate": 1.0044366580285137e-05,
"loss": 0.6067,
"step": 5455
},
{
"epoch": 3.5523747560182173,
"grad_norm": 1.7420598268508911,
"learning_rate": 1.0003036664473267e-05,
"loss": 0.6071,
"step": 5460
},
{
"epoch": 3.5556278464541315,
"grad_norm": 1.498193621635437,
"learning_rate": 9.96177067825175e-06,
"loss": 0.6146,
"step": 5465
},
{
"epoch": 3.558880936890046,
"grad_norm": 1.8063032627105713,
"learning_rate": 9.920568797530716e-06,
"loss": 0.626,
"step": 5470
},
{
"epoch": 3.5621340273259596,
"grad_norm": 1.2613329887390137,
"learning_rate": 9.879431197947014e-06,
"loss": 0.6049,
"step": 5475
},
{
"epoch": 3.565387117761874,
"grad_norm": 1.34530770778656,
"learning_rate": 9.83835805486347e-06,
"loss": 0.6197,
"step": 5480
},
{
"epoch": 3.568640208197788,
"grad_norm": 1.9523491859436035,
"learning_rate": 9.797349543368128e-06,
"loss": 0.6342,
"step": 5485
},
{
"epoch": 3.571893298633702,
"grad_norm": 1.8784916400909424,
"learning_rate": 9.756405838273558e-06,
"loss": 0.64,
"step": 5490
},
{
"epoch": 3.5751463890696162,
"grad_norm": 1.5533080101013184,
"learning_rate": 9.715527114116035e-06,
"loss": 0.6243,
"step": 5495
},
{
"epoch": 3.5783994795055305,
"grad_norm": 1.385695219039917,
"learning_rate": 9.674713545154831e-06,
"loss": 0.6264,
"step": 5500
},
{
"epoch": 3.5816525699414443,
"grad_norm": 1.3538482189178467,
"learning_rate": 9.633965305371506e-06,
"loss": 0.621,
"step": 5505
},
{
"epoch": 3.5849056603773586,
"grad_norm": 1.6445493698120117,
"learning_rate": 9.5932825684691e-06,
"loss": 0.6239,
"step": 5510
},
{
"epoch": 3.588158750813273,
"grad_norm": 1.803451657295227,
"learning_rate": 9.552665507871428e-06,
"loss": 0.6311,
"step": 5515
},
{
"epoch": 3.5914118412491867,
"grad_norm": 1.3346718549728394,
"learning_rate": 9.51211429672236e-06,
"loss": 0.6396,
"step": 5520
},
{
"epoch": 3.594664931685101,
"grad_norm": 2.1071603298187256,
"learning_rate": 9.471629107885038e-06,
"loss": 0.6238,
"step": 5525
},
{
"epoch": 3.597918022121015,
"grad_norm": 1.4250411987304688,
"learning_rate": 9.431210113941169e-06,
"loss": 0.6063,
"step": 5530
},
{
"epoch": 3.601171112556929,
"grad_norm": 1.3815439939498901,
"learning_rate": 9.390857487190274e-06,
"loss": 0.5978,
"step": 5535
},
{
"epoch": 3.6044242029928433,
"grad_norm": 1.6549842357635498,
"learning_rate": 9.350571399648988e-06,
"loss": 0.6094,
"step": 5540
},
{
"epoch": 3.6076772934287575,
"grad_norm": 1.4034509658813477,
"learning_rate": 9.310352023050272e-06,
"loss": 0.6187,
"step": 5545
},
{
"epoch": 3.6109303838646714,
"grad_norm": 1.6350473165512085,
"learning_rate": 9.270199528842715e-06,
"loss": 0.6076,
"step": 5550
},
{
"epoch": 3.6141834743005856,
"grad_norm": 1.4474992752075195,
"learning_rate": 9.230114088189814e-06,
"loss": 0.6507,
"step": 5555
},
{
"epoch": 3.6174365647365,
"grad_norm": 1.4828194379806519,
"learning_rate": 9.19009587196921e-06,
"loss": 0.6264,
"step": 5560
},
{
"epoch": 3.6206896551724137,
"grad_norm": 1.7121607065200806,
"learning_rate": 9.150145050771972e-06,
"loss": 0.6383,
"step": 5565
},
{
"epoch": 3.623942745608328,
"grad_norm": 1.8459277153015137,
"learning_rate": 9.110261794901903e-06,
"loss": 0.6436,
"step": 5570
},
{
"epoch": 3.6271958360442422,
"grad_norm": 1.4332444667816162,
"learning_rate": 9.070446274374766e-06,
"loss": 0.6313,
"step": 5575
},
{
"epoch": 3.630448926480156,
"grad_norm": 1.2665612697601318,
"learning_rate": 9.030698658917566e-06,
"loss": 0.6003,
"step": 5580
},
{
"epoch": 3.6337020169160703,
"grad_norm": 1.5076160430908203,
"learning_rate": 8.99101911796788e-06,
"loss": 0.6203,
"step": 5585
},
{
"epoch": 3.6369551073519846,
"grad_norm": 1.567221999168396,
"learning_rate": 8.951407820673058e-06,
"loss": 0.6252,
"step": 5590
},
{
"epoch": 3.6402081977878984,
"grad_norm": 1.504109263420105,
"learning_rate": 8.911864935889544e-06,
"loss": 0.6332,
"step": 5595
},
{
"epoch": 3.6434612882238127,
"grad_norm": 1.6598913669586182,
"learning_rate": 8.872390632182175e-06,
"loss": 0.6258,
"step": 5600
},
{
"epoch": 3.646714378659727,
"grad_norm": 1.3711302280426025,
"learning_rate": 8.832985077823406e-06,
"loss": 0.6273,
"step": 5605
},
{
"epoch": 3.6499674690956407,
"grad_norm": 1.293453574180603,
"learning_rate": 8.793648440792654e-06,
"loss": 0.6041,
"step": 5610
},
{
"epoch": 3.653220559531555,
"grad_norm": 1.6621414422988892,
"learning_rate": 8.754380888775523e-06,
"loss": 0.6177,
"step": 5615
},
{
"epoch": 3.6564736499674693,
"grad_norm": 1.2931593656539917,
"learning_rate": 8.715182589163153e-06,
"loss": 0.6084,
"step": 5620
},
{
"epoch": 3.659726740403383,
"grad_norm": 1.4701381921768188,
"learning_rate": 8.676053709051446e-06,
"loss": 0.6235,
"step": 5625
},
{
"epoch": 3.6629798308392973,
"grad_norm": 2.272709369659424,
"learning_rate": 8.636994415240376e-06,
"loss": 0.6326,
"step": 5630
},
{
"epoch": 3.6662329212752116,
"grad_norm": 1.3057537078857422,
"learning_rate": 8.598004874233315e-06,
"loss": 0.616,
"step": 5635
},
{
"epoch": 3.6694860117111254,
"grad_norm": 1.6016069650650024,
"learning_rate": 8.559085252236259e-06,
"loss": 0.6126,
"step": 5640
},
{
"epoch": 3.6727391021470397,
"grad_norm": 1.38706636428833,
"learning_rate": 8.520235715157152e-06,
"loss": 0.6424,
"step": 5645
},
{
"epoch": 3.675992192582954,
"grad_norm": 1.403805136680603,
"learning_rate": 8.481456428605205e-06,
"loss": 0.6328,
"step": 5650
},
{
"epoch": 3.6792452830188678,
"grad_norm": 2.8022546768188477,
"learning_rate": 8.442747557890138e-06,
"loss": 0.6225,
"step": 5655
},
{
"epoch": 3.682498373454782,
"grad_norm": 1.2923667430877686,
"learning_rate": 8.404109268021493e-06,
"loss": 0.6068,
"step": 5660
},
{
"epoch": 3.6857514638906963,
"grad_norm": 1.327010154724121,
"learning_rate": 8.365541723707971e-06,
"loss": 0.6032,
"step": 5665
},
{
"epoch": 3.68900455432661,
"grad_norm": 3.022547960281372,
"learning_rate": 8.327045089356663e-06,
"loss": 0.6202,
"step": 5670
},
{
"epoch": 3.6922576447625244,
"grad_norm": 1.7190786600112915,
"learning_rate": 8.288619529072394e-06,
"loss": 0.6136,
"step": 5675
},
{
"epoch": 3.6955107351984386,
"grad_norm": 1.8883839845657349,
"learning_rate": 8.250265206657025e-06,
"loss": 0.626,
"step": 5680
},
{
"epoch": 3.6987638256343525,
"grad_norm": 1.216133952140808,
"learning_rate": 8.211982285608721e-06,
"loss": 0.6084,
"step": 5685
},
{
"epoch": 3.7020169160702667,
"grad_norm": 1.4318759441375732,
"learning_rate": 8.17377092912128e-06,
"loss": 0.6252,
"step": 5690
},
{
"epoch": 3.705270006506181,
"grad_norm": 1.3429824113845825,
"learning_rate": 8.135631300083448e-06,
"loss": 0.6421,
"step": 5695
},
{
"epoch": 3.708523096942095,
"grad_norm": 1.563573956489563,
"learning_rate": 8.097563561078193e-06,
"loss": 0.6426,
"step": 5700
},
{
"epoch": 3.711776187378009,
"grad_norm": 1.3186182975769043,
"learning_rate": 8.059567874382023e-06,
"loss": 0.6148,
"step": 5705
},
{
"epoch": 3.7150292778139233,
"grad_norm": 1.4381370544433594,
"learning_rate": 8.021644401964305e-06,
"loss": 0.6206,
"step": 5710
},
{
"epoch": 3.718282368249837,
"grad_norm": 1.6375632286071777,
"learning_rate": 7.983793305486583e-06,
"loss": 0.6169,
"step": 5715
},
{
"epoch": 3.7215354586857514,
"grad_norm": 1.426100730895996,
"learning_rate": 7.946014746301858e-06,
"loss": 0.6299,
"step": 5720
},
{
"epoch": 3.7247885491216657,
"grad_norm": 1.6016979217529297,
"learning_rate": 7.908308885453908e-06,
"loss": 0.6039,
"step": 5725
},
{
"epoch": 3.7280416395575795,
"grad_norm": 1.8250033855438232,
"learning_rate": 7.87067588367664e-06,
"loss": 0.6375,
"step": 5730
},
{
"epoch": 3.7312947299934938,
"grad_norm": 1.6048786640167236,
"learning_rate": 7.833115901393347e-06,
"loss": 0.6469,
"step": 5735
},
{
"epoch": 3.734547820429408,
"grad_norm": 1.473156213760376,
"learning_rate": 7.795629098716045e-06,
"loss": 0.6291,
"step": 5740
},
{
"epoch": 3.737800910865322,
"grad_norm": 1.4616464376449585,
"learning_rate": 7.758215635444848e-06,
"loss": 0.6418,
"step": 5745
},
{
"epoch": 3.741054001301236,
"grad_norm": 1.3316526412963867,
"learning_rate": 7.720875671067188e-06,
"loss": 0.6052,
"step": 5750
},
{
"epoch": 3.7443070917371504,
"grad_norm": 2.7276248931884766,
"learning_rate": 7.683609364757192e-06,
"loss": 0.6311,
"step": 5755
},
{
"epoch": 3.747560182173064,
"grad_norm": 1.4057763814926147,
"learning_rate": 7.646416875374992e-06,
"loss": 0.6262,
"step": 5760
},
{
"epoch": 3.7508132726089785,
"grad_norm": 1.7808401584625244,
"learning_rate": 7.609298361466083e-06,
"loss": 0.6372,
"step": 5765
},
{
"epoch": 3.7540663630448927,
"grad_norm": 1.5597418546676636,
"learning_rate": 7.572253981260571e-06,
"loss": 0.6181,
"step": 5770
},
{
"epoch": 3.7573194534808065,
"grad_norm": 1.6378741264343262,
"learning_rate": 7.535283892672562e-06,
"loss": 0.6247,
"step": 5775
},
{
"epoch": 3.760572543916721,
"grad_norm": 2.498858690261841,
"learning_rate": 7.498388253299482e-06,
"loss": 0.643,
"step": 5780
},
{
"epoch": 3.763825634352635,
"grad_norm": 1.9484217166900635,
"learning_rate": 7.46156722042137e-06,
"loss": 0.6223,
"step": 5785
},
{
"epoch": 3.767078724788549,
"grad_norm": 1.3782168626785278,
"learning_rate": 7.424820951000233e-06,
"loss": 0.6148,
"step": 5790
},
{
"epoch": 3.770331815224463,
"grad_norm": 1.3748527765274048,
"learning_rate": 7.388149601679392e-06,
"loss": 0.6242,
"step": 5795
},
{
"epoch": 3.7735849056603774,
"grad_norm": 1.4963568449020386,
"learning_rate": 7.351553328782779e-06,
"loss": 0.6014,
"step": 5800
},
{
"epoch": 3.7768379960962912,
"grad_norm": 1.708061695098877,
"learning_rate": 7.31503228831428e-06,
"loss": 0.6154,
"step": 5805
},
{
"epoch": 3.7800910865322055,
"grad_norm": 1.8436424732208252,
"learning_rate": 7.278586635957107e-06,
"loss": 0.6263,
"step": 5810
},
{
"epoch": 3.7833441769681198,
"grad_norm": 1.9801384210586548,
"learning_rate": 7.242216527073079e-06,
"loss": 0.5955,
"step": 5815
},
{
"epoch": 3.7865972674040336,
"grad_norm": 1.4177374839782715,
"learning_rate": 7.205922116701985e-06,
"loss": 0.6255,
"step": 5820
},
{
"epoch": 3.789850357839948,
"grad_norm": 1.4929031133651733,
"learning_rate": 7.169703559560953e-06,
"loss": 0.6046,
"step": 5825
},
{
"epoch": 3.793103448275862,
"grad_norm": 2.4425814151763916,
"learning_rate": 7.133561010043724e-06,
"loss": 0.6072,
"step": 5830
},
{
"epoch": 3.796356538711776,
"grad_norm": 1.5860954523086548,
"learning_rate": 7.097494622220049e-06,
"loss": 0.6173,
"step": 5835
},
{
"epoch": 3.79960962914769,
"grad_norm": 1.4166280031204224,
"learning_rate": 7.0615045498350215e-06,
"loss": 0.5985,
"step": 5840
},
{
"epoch": 3.8028627195836044,
"grad_norm": 1.7926712036132812,
"learning_rate": 7.025590946308402e-06,
"loss": 0.6077,
"step": 5845
},
{
"epoch": 3.8061158100195187,
"grad_norm": 1.411357045173645,
"learning_rate": 6.9897539647339725e-06,
"loss": 0.6126,
"step": 5850
},
{
"epoch": 3.8093689004554325,
"grad_norm": 1.4378728866577148,
"learning_rate": 6.95399375787891e-06,
"loss": 0.6217,
"step": 5855
},
{
"epoch": 3.812621990891347,
"grad_norm": 1.630339503288269,
"learning_rate": 6.918310478183093e-06,
"loss": 0.6081,
"step": 5860
},
{
"epoch": 3.815875081327261,
"grad_norm": 1.4536669254302979,
"learning_rate": 6.882704277758475e-06,
"loss": 0.631,
"step": 5865
},
{
"epoch": 3.819128171763175,
"grad_norm": 1.369432806968689,
"learning_rate": 6.847175308388451e-06,
"loss": 0.6023,
"step": 5870
},
{
"epoch": 3.822381262199089,
"grad_norm": 1.8251979351043701,
"learning_rate": 6.811723721527161e-06,
"loss": 0.6088,
"step": 5875
},
{
"epoch": 3.8256343526350034,
"grad_norm": 1.4121100902557373,
"learning_rate": 6.776349668298912e-06,
"loss": 0.6393,
"step": 5880
},
{
"epoch": 3.828887443070917,
"grad_norm": 1.4803780317306519,
"learning_rate": 6.741053299497468e-06,
"loss": 0.601,
"step": 5885
},
{
"epoch": 3.8321405335068315,
"grad_norm": 1.5110501050949097,
"learning_rate": 6.705834765585459e-06,
"loss": 0.6299,
"step": 5890
},
{
"epoch": 3.8353936239427457,
"grad_norm": 1.8608803749084473,
"learning_rate": 6.670694216693701e-06,
"loss": 0.6394,
"step": 5895
},
{
"epoch": 3.8386467143786596,
"grad_norm": 1.4101976156234741,
"learning_rate": 6.635631802620576e-06,
"loss": 0.6149,
"step": 5900
},
{
"epoch": 3.841899804814574,
"grad_norm": 1.5235905647277832,
"learning_rate": 6.600647672831406e-06,
"loss": 0.6377,
"step": 5905
},
{
"epoch": 3.845152895250488,
"grad_norm": 2.4760963916778564,
"learning_rate": 6.565741976457782e-06,
"loss": 0.6315,
"step": 5910
},
{
"epoch": 3.8484059856864024,
"grad_norm": 1.4764820337295532,
"learning_rate": 6.530914862296947e-06,
"loss": 0.6148,
"step": 5915
},
{
"epoch": 3.851659076122316,
"grad_norm": 1.408517599105835,
"learning_rate": 6.496166478811164e-06,
"loss": 0.629,
"step": 5920
},
{
"epoch": 3.8549121665582304,
"grad_norm": 2.276674509048462,
"learning_rate": 6.461496974127093e-06,
"loss": 0.613,
"step": 5925
},
{
"epoch": 3.8581652569941447,
"grad_norm": 1.5643647909164429,
"learning_rate": 6.426906496035129e-06,
"loss": 0.6063,
"step": 5930
},
{
"epoch": 3.8614183474300585,
"grad_norm": 1.3531688451766968,
"learning_rate": 6.39239519198879e-06,
"loss": 0.6135,
"step": 5935
},
{
"epoch": 3.864671437865973,
"grad_norm": 1.4261928796768188,
"learning_rate": 6.357963209104106e-06,
"loss": 0.6206,
"step": 5940
},
{
"epoch": 3.867924528301887,
"grad_norm": 1.3013157844543457,
"learning_rate": 6.32361069415896e-06,
"loss": 0.6153,
"step": 5945
},
{
"epoch": 3.871177618737801,
"grad_norm": 1.520578145980835,
"learning_rate": 6.289337793592468e-06,
"loss": 0.629,
"step": 5950
},
{
"epoch": 3.874430709173715,
"grad_norm": 1.5987921953201294,
"learning_rate": 6.255144653504382e-06,
"loss": 0.645,
"step": 5955
},
{
"epoch": 3.8776837996096294,
"grad_norm": 2.1227879524230957,
"learning_rate": 6.221031419654444e-06,
"loss": 0.6333,
"step": 5960
},
{
"epoch": 3.880936890045543,
"grad_norm": 1.5177706480026245,
"learning_rate": 6.1869982374617495e-06,
"loss": 0.629,
"step": 5965
},
{
"epoch": 3.8841899804814575,
"grad_norm": 1.3354036808013916,
"learning_rate": 6.153045252004177e-06,
"loss": 0.6055,
"step": 5970
},
{
"epoch": 3.8874430709173717,
"grad_norm": 1.8337645530700684,
"learning_rate": 6.119172608017718e-06,
"loss": 0.623,
"step": 5975
},
{
"epoch": 3.8906961613532856,
"grad_norm": 1.2876662015914917,
"learning_rate": 6.08538044989588e-06,
"loss": 0.6064,
"step": 5980
},
{
"epoch": 3.8939492517892,
"grad_norm": 1.3676327466964722,
"learning_rate": 6.051668921689094e-06,
"loss": 0.6219,
"step": 5985
},
{
"epoch": 3.897202342225114,
"grad_norm": 1.5804736614227295,
"learning_rate": 6.0180381671040596e-06,
"loss": 0.6135,
"step": 5990
},
{
"epoch": 3.900455432661028,
"grad_norm": 2.2858810424804688,
"learning_rate": 5.9844883295031515e-06,
"loss": 0.6393,
"step": 5995
},
{
"epoch": 3.903708523096942,
"grad_norm": 1.8066788911819458,
"learning_rate": 5.9510195519038245e-06,
"loss": 0.6056,
"step": 6000
},
{
"epoch": 3.9069616135328564,
"grad_norm": 1.3947362899780273,
"learning_rate": 5.917631976977975e-06,
"loss": 0.6138,
"step": 6005
},
{
"epoch": 3.9102147039687702,
"grad_norm": 1.551949381828308,
"learning_rate": 5.884325747051336e-06,
"loss": 0.614,
"step": 6010
},
{
"epoch": 3.9134677944046845,
"grad_norm": 1.3901867866516113,
"learning_rate": 5.851101004102907e-06,
"loss": 0.6375,
"step": 6015
},
{
"epoch": 3.9167208848405988,
"grad_norm": 1.4056464433670044,
"learning_rate": 5.817957889764308e-06,
"loss": 0.6141,
"step": 6020
},
{
"epoch": 3.9199739752765126,
"grad_norm": 1.499922752380371,
"learning_rate": 5.784896545319187e-06,
"loss": 0.6074,
"step": 6025
},
{
"epoch": 3.923227065712427,
"grad_norm": 1.2578163146972656,
"learning_rate": 5.751917111702612e-06,
"loss": 0.6143,
"step": 6030
},
{
"epoch": 3.926480156148341,
"grad_norm": 1.2877789735794067,
"learning_rate": 5.719019729500508e-06,
"loss": 0.5956,
"step": 6035
},
{
"epoch": 3.929733246584255,
"grad_norm": 1.576788067817688,
"learning_rate": 5.686204538948997e-06,
"loss": 0.6141,
"step": 6040
},
{
"epoch": 3.932986337020169,
"grad_norm": 1.8292930126190186,
"learning_rate": 5.653471679933839e-06,
"loss": 0.5909,
"step": 6045
},
{
"epoch": 3.9362394274560835,
"grad_norm": 1.5432319641113281,
"learning_rate": 5.62082129198985e-06,
"loss": 0.6199,
"step": 6050
},
{
"epoch": 3.9394925178919973,
"grad_norm": 1.739689826965332,
"learning_rate": 5.58825351430026e-06,
"loss": 0.6035,
"step": 6055
},
{
"epoch": 3.9427456083279115,
"grad_norm": 1.3205852508544922,
"learning_rate": 5.555768485696144e-06,
"loss": 0.6169,
"step": 6060
},
{
"epoch": 3.945998698763826,
"grad_norm": 1.6433742046356201,
"learning_rate": 5.523366344655856e-06,
"loss": 0.6404,
"step": 6065
},
{
"epoch": 3.9492517891997396,
"grad_norm": 1.6137924194335938,
"learning_rate": 5.491047229304397e-06,
"loss": 0.6219,
"step": 6070
},
{
"epoch": 3.952504879635654,
"grad_norm": 1.5387951135635376,
"learning_rate": 5.4588112774128314e-06,
"loss": 0.5937,
"step": 6075
},
{
"epoch": 3.955757970071568,
"grad_norm": 1.4663158655166626,
"learning_rate": 5.42665862639774e-06,
"loss": 0.6066,
"step": 6080
},
{
"epoch": 3.959011060507482,
"grad_norm": 4.082248210906982,
"learning_rate": 5.394589413320589e-06,
"loss": 0.6311,
"step": 6085
},
{
"epoch": 3.9622641509433962,
"grad_norm": 1.4563738107681274,
"learning_rate": 5.3626037748871565e-06,
"loss": 0.6142,
"step": 6090
},
{
"epoch": 3.9655172413793105,
"grad_norm": 1.569101095199585,
"learning_rate": 5.330701847446962e-06,
"loss": 0.6014,
"step": 6095
},
{
"epoch": 3.9687703318152243,
"grad_norm": 1.567270278930664,
"learning_rate": 5.29888376699269e-06,
"loss": 0.6155,
"step": 6100
},
{
"epoch": 3.9720234222511386,
"grad_norm": 1.668445110321045,
"learning_rate": 5.267149669159588e-06,
"loss": 0.6171,
"step": 6105
},
{
"epoch": 3.975276512687053,
"grad_norm": 1.7854609489440918,
"learning_rate": 5.235499689224885e-06,
"loss": 0.6135,
"step": 6110
},
{
"epoch": 3.9785296031229667,
"grad_norm": 1.8517600297927856,
"learning_rate": 5.203933962107266e-06,
"loss": 0.6207,
"step": 6115
},
{
"epoch": 3.981782693558881,
"grad_norm": 1.5116204023361206,
"learning_rate": 5.172452622366228e-06,
"loss": 0.614,
"step": 6120
},
{
"epoch": 3.985035783994795,
"grad_norm": 1.4917980432510376,
"learning_rate": 5.141055804201541e-06,
"loss": 0.6118,
"step": 6125
},
{
"epoch": 3.988288874430709,
"grad_norm": 1.527981162071228,
"learning_rate": 5.109743641452699e-06,
"loss": 0.6083,
"step": 6130
},
{
"epoch": 3.9915419648666233,
"grad_norm": 1.3188831806182861,
"learning_rate": 5.078516267598299e-06,
"loss": 0.6141,
"step": 6135
},
{
"epoch": 3.9947950553025375,
"grad_norm": 1.4134242534637451,
"learning_rate": 5.047373815755496e-06,
"loss": 0.6234,
"step": 6140
},
{
"epoch": 3.9980481457384514,
"grad_norm": 1.5778809785842896,
"learning_rate": 5.016316418679454e-06,
"loss": 0.6177,
"step": 6145
},
{
"epoch": 4.0,
"eval_f1": 0.7989837428748611,
"eval_loss": 0.491455078125,
"eval_precision": 0.7989192926261178,
"eval_recall": 0.7990541428374994,
"eval_runtime": 238.1189,
"eval_samples_per_second": 1652.263,
"eval_steps_per_second": 1.617,
"step": 6148
},
{
"epoch": 4.001301236174366,
"grad_norm": 1.376760721206665,
"learning_rate": 4.985344208762757e-06,
"loss": 0.5954,
"step": 6150
},
{
"epoch": 4.00455432661028,
"grad_norm": 1.2846732139587402,
"learning_rate": 4.954457318034841e-06,
"loss": 0.533,
"step": 6155
},
{
"epoch": 4.007807417046194,
"grad_norm": 1.16463303565979,
"learning_rate": 4.92365587816144e-06,
"loss": 0.533,
"step": 6160
},
{
"epoch": 4.011060507482108,
"grad_norm": 1.4882513284683228,
"learning_rate": 4.892940020444043e-06,
"loss": 0.5236,
"step": 6165
},
{
"epoch": 4.014313597918022,
"grad_norm": 3.275876998901367,
"learning_rate": 4.862309875819299e-06,
"loss": 0.5213,
"step": 6170
},
{
"epoch": 4.017566688353936,
"grad_norm": 1.5742096900939941,
"learning_rate": 4.837867561302392e-06,
"loss": 0.5295,
"step": 6175
},
{
"epoch": 4.020819778789851,
"grad_norm": 5.1677422523498535,
"learning_rate": 4.807392029038138e-06,
"loss": 0.5301,
"step": 6180
},
{
"epoch": 4.024072869225765,
"grad_norm": 1.7716647386550903,
"learning_rate": 4.77700257454356e-06,
"loss": 0.5366,
"step": 6185
},
{
"epoch": 4.027325959661678,
"grad_norm": 1.8003216981887817,
"learning_rate": 4.746699327363918e-06,
"loss": 0.5209,
"step": 6190
},
{
"epoch": 4.030579050097593,
"grad_norm": 1.7417036294937134,
"learning_rate": 4.7164824166769735e-06,
"loss": 0.5335,
"step": 6195
},
{
"epoch": 4.033832140533507,
"grad_norm": 1.7009021043777466,
"learning_rate": 4.686351971292443e-06,
"loss": 0.5222,
"step": 6200
},
{
"epoch": 4.037085230969421,
"grad_norm": 2.0051186084747314,
"learning_rate": 4.6563081196514786e-06,
"loss": 0.5516,
"step": 6205
},
{
"epoch": 4.040338321405335,
"grad_norm": 1.5723603963851929,
"learning_rate": 4.626350989826075e-06,
"loss": 0.5263,
"step": 6210
},
{
"epoch": 4.043591411841249,
"grad_norm": 1.8875335454940796,
"learning_rate": 4.596480709518547e-06,
"loss": 0.5346,
"step": 6215
},
{
"epoch": 4.046844502277163,
"grad_norm": 1.5543326139450073,
"learning_rate": 4.566697406061005e-06,
"loss": 0.5344,
"step": 6220
},
{
"epoch": 4.050097592713078,
"grad_norm": 1.6131196022033691,
"learning_rate": 4.53700120641477e-06,
"loss": 0.5318,
"step": 6225
},
{
"epoch": 4.053350683148992,
"grad_norm": 1.3502036333084106,
"learning_rate": 4.5073922371698554e-06,
"loss": 0.5234,
"step": 6230
},
{
"epoch": 4.056603773584905,
"grad_norm": 2.2002179622650146,
"learning_rate": 4.4778706245444475e-06,
"loss": 0.5422,
"step": 6235
},
{
"epoch": 4.05985686402082,
"grad_norm": 1.62948477268219,
"learning_rate": 4.44843649438432e-06,
"loss": 0.5136,
"step": 6240
},
{
"epoch": 4.063109954456734,
"grad_norm": 1.563274621963501,
"learning_rate": 4.419089972162327e-06,
"loss": 0.5087,
"step": 6245
},
{
"epoch": 4.066363044892648,
"grad_norm": 1.5413563251495361,
"learning_rate": 4.389831182977882e-06,
"loss": 0.535,
"step": 6250
},
{
"epoch": 4.0696161353285625,
"grad_norm": 1.6265994310379028,
"learning_rate": 4.360660251556395e-06,
"loss": 0.5291,
"step": 6255
},
{
"epoch": 4.072869225764476,
"grad_norm": 1.6212644577026367,
"learning_rate": 4.331577302248746e-06,
"loss": 0.5165,
"step": 6260
},
{
"epoch": 4.07612231620039,
"grad_norm": 1.5618913173675537,
"learning_rate": 4.302582459030769e-06,
"loss": 0.5301,
"step": 6265
},
{
"epoch": 4.079375406636305,
"grad_norm": 1.7876514196395874,
"learning_rate": 4.273675845502722e-06,
"loss": 0.5282,
"step": 6270
},
{
"epoch": 4.082628497072219,
"grad_norm": 1.6155240535736084,
"learning_rate": 4.244857584888748e-06,
"loss": 0.5219,
"step": 6275
},
{
"epoch": 4.0858815875081325,
"grad_norm": 1.826150894165039,
"learning_rate": 4.2161278000363456e-06,
"loss": 0.5254,
"step": 6280
},
{
"epoch": 4.089134677944047,
"grad_norm": 1.569254755973816,
"learning_rate": 4.187486613415878e-06,
"loss": 0.5563,
"step": 6285
},
{
"epoch": 4.092387768379961,
"grad_norm": 1.651341438293457,
"learning_rate": 4.158934147120019e-06,
"loss": 0.5196,
"step": 6290
},
{
"epoch": 4.095640858815875,
"grad_norm": 1.960835337638855,
"learning_rate": 4.130470522863231e-06,
"loss": 0.5233,
"step": 6295
},
{
"epoch": 4.0988939492517895,
"grad_norm": 1.762459397315979,
"learning_rate": 4.102095861981275e-06,
"loss": 0.5101,
"step": 6300
},
{
"epoch": 4.102147039687703,
"grad_norm": 1.7269344329833984,
"learning_rate": 4.073810285430668e-06,
"loss": 0.5283,
"step": 6305
},
{
"epoch": 4.105400130123617,
"grad_norm": 2.420794725418091,
"learning_rate": 4.045613913788171e-06,
"loss": 0.5168,
"step": 6310
},
{
"epoch": 4.108653220559532,
"grad_norm": 1.5948150157928467,
"learning_rate": 4.0175068672502784e-06,
"loss": 0.535,
"step": 6315
},
{
"epoch": 4.111906310995446,
"grad_norm": 2.1127867698669434,
"learning_rate": 3.9894892656327235e-06,
"loss": 0.5181,
"step": 6320
},
{
"epoch": 4.1151594014313595,
"grad_norm": 2.1554746627807617,
"learning_rate": 3.961561228369928e-06,
"loss": 0.5314,
"step": 6325
},
{
"epoch": 4.118412491867274,
"grad_norm": 1.7790179252624512,
"learning_rate": 3.933722874514526e-06,
"loss": 0.5327,
"step": 6330
},
{
"epoch": 4.121665582303188,
"grad_norm": 1.5885546207427979,
"learning_rate": 3.905974322736849e-06,
"loss": 0.5221,
"step": 6335
},
{
"epoch": 4.124918672739102,
"grad_norm": 1.4991848468780518,
"learning_rate": 3.878315691324416e-06,
"loss": 0.5134,
"step": 6340
},
{
"epoch": 4.1281717631750166,
"grad_norm": 1.57703697681427,
"learning_rate": 3.850747098181421e-06,
"loss": 0.5239,
"step": 6345
},
{
"epoch": 4.13142485361093,
"grad_norm": 3.0852479934692383,
"learning_rate": 3.82326866082825e-06,
"loss": 0.5216,
"step": 6350
},
{
"epoch": 4.134677944046844,
"grad_norm": 1.6248340606689453,
"learning_rate": 3.7958804964009692e-06,
"loss": 0.5195,
"step": 6355
},
{
"epoch": 4.137931034482759,
"grad_norm": 1.69948410987854,
"learning_rate": 3.7685827216508124e-06,
"loss": 0.507,
"step": 6360
},
{
"epoch": 4.141184124918673,
"grad_norm": 1.6397584676742554,
"learning_rate": 3.741375452943724e-06,
"loss": 0.5353,
"step": 6365
},
{
"epoch": 4.1444372153545865,
"grad_norm": 1.4918780326843262,
"learning_rate": 3.714258806259807e-06,
"loss": 0.5013,
"step": 6370
},
{
"epoch": 4.147690305790501,
"grad_norm": 2.1283321380615234,
"learning_rate": 3.6872328971928718e-06,
"loss": 0.5289,
"step": 6375
},
{
"epoch": 4.150943396226415,
"grad_norm": 2.7849512100219727,
"learning_rate": 3.660297840949933e-06,
"loss": 0.5289,
"step": 6380
},
{
"epoch": 4.154196486662329,
"grad_norm": 1.7255409955978394,
"learning_rate": 3.633453752350707e-06,
"loss": 0.5174,
"step": 6385
},
{
"epoch": 4.157449577098244,
"grad_norm": 1.7871309518814087,
"learning_rate": 3.606700745827127e-06,
"loss": 0.5231,
"step": 6390
},
{
"epoch": 4.160702667534157,
"grad_norm": 1.5307867527008057,
"learning_rate": 3.5800389354228748e-06,
"loss": 0.524,
"step": 6395
},
{
"epoch": 4.163955757970071,
"grad_norm": 1.9164159297943115,
"learning_rate": 3.553468434792859e-06,
"loss": 0.5321,
"step": 6400
},
{
"epoch": 4.167208848405986,
"grad_norm": 1.539781093597412,
"learning_rate": 3.526989357202756e-06,
"loss": 0.5223,
"step": 6405
},
{
"epoch": 4.1704619388419,
"grad_norm": 1.5751947164535522,
"learning_rate": 3.5006018155285286e-06,
"loss": 0.5302,
"step": 6410
},
{
"epoch": 4.173715029277814,
"grad_norm": 1.7798151969909668,
"learning_rate": 3.4743059222559298e-06,
"loss": 0.5295,
"step": 6415
},
{
"epoch": 4.176968119713728,
"grad_norm": 2.035566568374634,
"learning_rate": 3.448101789480024e-06,
"loss": 0.5249,
"step": 6420
},
{
"epoch": 4.180221210149642,
"grad_norm": 1.6014204025268555,
"learning_rate": 3.4219895289047317e-06,
"loss": 0.5236,
"step": 6425
},
{
"epoch": 4.183474300585556,
"grad_norm": 1.9151594638824463,
"learning_rate": 3.395969251842329e-06,
"loss": 0.5146,
"step": 6430
},
{
"epoch": 4.186727391021471,
"grad_norm": 1.543568730354309,
"learning_rate": 3.3700410692129815e-06,
"loss": 0.518,
"step": 6435
},
{
"epoch": 4.189980481457384,
"grad_norm": 1.5983526706695557,
"learning_rate": 3.3442050915442615e-06,
"loss": 0.5047,
"step": 6440
},
{
"epoch": 4.193233571893298,
"grad_norm": 1.5908766984939575,
"learning_rate": 3.318461428970707e-06,
"loss": 0.5273,
"step": 6445
},
{
"epoch": 4.196486662329213,
"grad_norm": 1.7272975444793701,
"learning_rate": 3.2928101912333197e-06,
"loss": 0.5143,
"step": 6450
},
{
"epoch": 4.199739752765127,
"grad_norm": 1.6854057312011719,
"learning_rate": 3.2672514876791044e-06,
"loss": 0.5412,
"step": 6455
},
{
"epoch": 4.202992843201041,
"grad_norm": 1.7159767150878906,
"learning_rate": 3.2417854272606212e-06,
"loss": 0.5328,
"step": 6460
},
{
"epoch": 4.206245933636955,
"grad_norm": 2.0293431282043457,
"learning_rate": 3.2164121185355026e-06,
"loss": 0.5207,
"step": 6465
},
{
"epoch": 4.209499024072869,
"grad_norm": 1.4942529201507568,
"learning_rate": 3.1911316696659837e-06,
"loss": 0.5098,
"step": 6470
},
{
"epoch": 4.212752114508783,
"grad_norm": 1.5757249593734741,
"learning_rate": 3.165944188418474e-06,
"loss": 0.5075,
"step": 6475
},
{
"epoch": 4.216005204944698,
"grad_norm": 1.6114063262939453,
"learning_rate": 3.140849782163066e-06,
"loss": 0.5283,
"step": 6480
},
{
"epoch": 4.2192582953806115,
"grad_norm": 1.791574478149414,
"learning_rate": 3.1158485578730883e-06,
"loss": 0.5116,
"step": 6485
},
{
"epoch": 4.222511385816525,
"grad_norm": 1.4832271337509155,
"learning_rate": 3.090940622124644e-06,
"loss": 0.5187,
"step": 6490
},
{
"epoch": 4.22576447625244,
"grad_norm": 1.5384358167648315,
"learning_rate": 3.066126081096185e-06,
"loss": 0.5158,
"step": 6495
},
{
"epoch": 4.229017566688354,
"grad_norm": 1.766423225402832,
"learning_rate": 3.0414050405680155e-06,
"loss": 0.5196,
"step": 6500
},
{
"epoch": 4.232270657124268,
"grad_norm": 2.07438325881958,
"learning_rate": 3.016777605921861e-06,
"loss": 0.5062,
"step": 6505
},
{
"epoch": 4.235523747560182,
"grad_norm": 4.485304355621338,
"learning_rate": 2.9922438821404415e-06,
"loss": 0.4975,
"step": 6510
},
{
"epoch": 4.238776837996096,
"grad_norm": 1.6027443408966064,
"learning_rate": 2.9678039738069845e-06,
"loss": 0.5211,
"step": 6515
},
{
"epoch": 4.24202992843201,
"grad_norm": 2.2789571285247803,
"learning_rate": 2.9434579851047973e-06,
"loss": 0.5084,
"step": 6520
},
{
"epoch": 4.245283018867925,
"grad_norm": 1.481426477432251,
"learning_rate": 2.919206019816842e-06,
"loss": 0.5417,
"step": 6525
},
{
"epoch": 4.2485361093038385,
"grad_norm": 1.6203233003616333,
"learning_rate": 2.895048181325252e-06,
"loss": 0.5114,
"step": 6530
},
{
"epoch": 4.251789199739752,
"grad_norm": 1.5848479270935059,
"learning_rate": 2.8709845726109243e-06,
"loss": 0.5028,
"step": 6535
},
{
"epoch": 4.255042290175667,
"grad_norm": 1.80342435836792,
"learning_rate": 2.8470152962530723e-06,
"loss": 0.5122,
"step": 6540
},
{
"epoch": 4.258295380611581,
"grad_norm": 2.087617874145508,
"learning_rate": 2.8231404544287796e-06,
"loss": 0.506,
"step": 6545
},
{
"epoch": 4.261548471047496,
"grad_norm": 1.7649626731872559,
"learning_rate": 2.7993601489125693e-06,
"loss": 0.5166,
"step": 6550
},
{
"epoch": 4.264801561483409,
"grad_norm": 3.1642332077026367,
"learning_rate": 2.7756744810759823e-06,
"loss": 0.5107,
"step": 6555
},
{
"epoch": 4.268054651919323,
"grad_norm": 1.9564752578735352,
"learning_rate": 2.7520835518871302e-06,
"loss": 0.5112,
"step": 6560
},
{
"epoch": 4.271307742355237,
"grad_norm": 1.6043564081192017,
"learning_rate": 2.7285874619102675e-06,
"loss": 0.5084,
"step": 6565
},
{
"epoch": 4.274560832791152,
"grad_norm": 1.9543806314468384,
"learning_rate": 2.705186311305355e-06,
"loss": 0.5135,
"step": 6570
},
{
"epoch": 4.2778139232270656,
"grad_norm": 1.6966253519058228,
"learning_rate": 2.6818801998276634e-06,
"loss": 0.525,
"step": 6575
},
{
"epoch": 4.28106701366298,
"grad_norm": 2.0935935974121094,
"learning_rate": 2.658669226827315e-06,
"loss": 0.5216,
"step": 6580
},
{
"epoch": 4.284320104098894,
"grad_norm": 1.7863517999649048,
"learning_rate": 2.6355534912488627e-06,
"loss": 0.5271,
"step": 6585
},
{
"epoch": 4.287573194534808,
"grad_norm": 1.611092448234558,
"learning_rate": 2.612533091630903e-06,
"loss": 0.5142,
"step": 6590
},
{
"epoch": 4.290826284970722,
"grad_norm": 1.709322452545166,
"learning_rate": 2.5896081261056138e-06,
"loss": 0.5292,
"step": 6595
},
{
"epoch": 4.294079375406636,
"grad_norm": 1.7398649454116821,
"learning_rate": 2.5667786923983443e-06,
"loss": 0.5253,
"step": 6600
},
{
"epoch": 4.29733246584255,
"grad_norm": 1.5445489883422852,
"learning_rate": 2.544044887827235e-06,
"loss": 0.5443,
"step": 6605
},
{
"epoch": 4.300585556278465,
"grad_norm": 1.763914704322815,
"learning_rate": 2.5214068093027484e-06,
"loss": 0.5301,
"step": 6610
},
{
"epoch": 4.303838646714379,
"grad_norm": 2.1207916736602783,
"learning_rate": 2.498864553327296e-06,
"loss": 0.5351,
"step": 6615
},
{
"epoch": 4.307091737150293,
"grad_norm": 1.8002142906188965,
"learning_rate": 2.4764182159948133e-06,
"loss": 0.5043,
"step": 6620
},
{
"epoch": 4.310344827586207,
"grad_norm": 1.4603972434997559,
"learning_rate": 2.454067892990347e-06,
"loss": 0.5032,
"step": 6625
},
{
"epoch": 4.313597918022121,
"grad_norm": 1.6874291896820068,
"learning_rate": 2.431813679589645e-06,
"loss": 0.5232,
"step": 6630
},
{
"epoch": 4.316851008458035,
"grad_norm": 1.7689220905303955,
"learning_rate": 2.4096556706587726e-06,
"loss": 0.5218,
"step": 6635
},
{
"epoch": 4.32010409889395,
"grad_norm": 1.5644956827163696,
"learning_rate": 2.387593960653675e-06,
"loss": 0.5164,
"step": 6640
},
{
"epoch": 4.3233571893298635,
"grad_norm": 2.199660301208496,
"learning_rate": 2.3656286436197965e-06,
"loss": 0.538,
"step": 6645
},
{
"epoch": 4.326610279765777,
"grad_norm": 2.4460320472717285,
"learning_rate": 2.343759813191676e-06,
"loss": 0.5197,
"step": 6650
},
{
"epoch": 4.329863370201692,
"grad_norm": 1.8965719938278198,
"learning_rate": 2.3219875625925452e-06,
"loss": 0.5399,
"step": 6655
},
{
"epoch": 4.333116460637606,
"grad_norm": 1.7241499423980713,
"learning_rate": 2.3003119846339293e-06,
"loss": 0.514,
"step": 6660
},
{
"epoch": 4.33636955107352,
"grad_norm": 1.776291847229004,
"learning_rate": 2.27873317171525e-06,
"loss": 0.5217,
"step": 6665
},
{
"epoch": 4.339622641509434,
"grad_norm": 1.6230307817459106,
"learning_rate": 2.25725121582345e-06,
"loss": 0.5208,
"step": 6670
},
{
"epoch": 4.342875731945348,
"grad_norm": 1.5767405033111572,
"learning_rate": 2.2358662085325723e-06,
"loss": 0.5064,
"step": 6675
},
{
"epoch": 4.346128822381262,
"grad_norm": 1.785072922706604,
"learning_rate": 2.2145782410033844e-06,
"loss": 0.5195,
"step": 6680
},
{
"epoch": 4.349381912817177,
"grad_norm": 2.802659034729004,
"learning_rate": 2.1933874039830078e-06,
"loss": 0.5178,
"step": 6685
},
{
"epoch": 4.3526350032530905,
"grad_norm": 1.8929702043533325,
"learning_rate": 2.172293787804483e-06,
"loss": 0.5281,
"step": 6690
},
{
"epoch": 4.355888093689004,
"grad_norm": 2.050996780395508,
"learning_rate": 2.1512974823864414e-06,
"loss": 0.5432,
"step": 6695
},
{
"epoch": 4.359141184124919,
"grad_norm": 1.6718263626098633,
"learning_rate": 2.130398577232673e-06,
"loss": 0.5267,
"step": 6700
},
{
"epoch": 4.362394274560833,
"grad_norm": 1.8539758920669556,
"learning_rate": 2.109597161431784e-06,
"loss": 0.5334,
"step": 6705
},
{
"epoch": 4.365647364996747,
"grad_norm": 1.541066288948059,
"learning_rate": 2.088893323656793e-06,
"loss": 0.5235,
"step": 6710
},
{
"epoch": 4.368900455432661,
"grad_norm": 1.5558756589889526,
"learning_rate": 2.068287152164747e-06,
"loss": 0.5157,
"step": 6715
},
{
"epoch": 4.372153545868575,
"grad_norm": 1.825431227684021,
"learning_rate": 2.0477787347963823e-06,
"loss": 0.521,
"step": 6720
},
{
"epoch": 4.375406636304489,
"grad_norm": 1.558396816253662,
"learning_rate": 2.0273681589757063e-06,
"loss": 0.5082,
"step": 6725
},
{
"epoch": 4.378659726740404,
"grad_norm": 1.8559561967849731,
"learning_rate": 2.007055511709646e-06,
"loss": 0.526,
"step": 6730
},
{
"epoch": 4.3819128171763175,
"grad_norm": 1.8222005367279053,
"learning_rate": 1.986840879587687e-06,
"loss": 0.522,
"step": 6735
},
{
"epoch": 4.385165907612231,
"grad_norm": 4.778210639953613,
"learning_rate": 1.966724348781479e-06,
"loss": 0.5089,
"step": 6740
},
{
"epoch": 4.388418998048146,
"grad_norm": 1.7374241352081299,
"learning_rate": 1.9467060050444824e-06,
"loss": 0.5166,
"step": 6745
},
{
"epoch": 4.39167208848406,
"grad_norm": 1.846447467803955,
"learning_rate": 1.9267859337116195e-06,
"loss": 0.5255,
"step": 6750
},
{
"epoch": 4.394925178919974,
"grad_norm": 1.6373209953308105,
"learning_rate": 1.9069642196988757e-06,
"loss": 0.5103,
"step": 6755
},
{
"epoch": 4.398178269355888,
"grad_norm": 2.6573219299316406,
"learning_rate": 1.8872409475029524e-06,
"loss": 0.5192,
"step": 6760
},
{
"epoch": 4.401431359791802,
"grad_norm": 3.289806365966797,
"learning_rate": 1.8676162012009307e-06,
"loss": 0.5195,
"step": 6765
},
{
"epoch": 4.404684450227716,
"grad_norm": 2.3919076919555664,
"learning_rate": 1.8480900644498756e-06,
"loss": 0.5139,
"step": 6770
},
{
"epoch": 4.407937540663631,
"grad_norm": 2.7541277408599854,
"learning_rate": 1.8286626204864903e-06,
"loss": 0.5285,
"step": 6775
},
{
"epoch": 4.411190631099545,
"grad_norm": 2.060319423675537,
"learning_rate": 1.8093339521267876e-06,
"loss": 0.5211,
"step": 6780
},
{
"epoch": 4.414443721535458,
"grad_norm": 1.9002997875213623,
"learning_rate": 1.7901041417657027e-06,
"loss": 0.5189,
"step": 6785
},
{
"epoch": 4.417696811971373,
"grad_norm": 2.1053810119628906,
"learning_rate": 1.7709732713767497e-06,
"loss": 0.5107,
"step": 6790
},
{
"epoch": 4.420949902407287,
"grad_norm": 1.6905279159545898,
"learning_rate": 1.7519414225116937e-06,
"loss": 0.5147,
"step": 6795
},
{
"epoch": 4.424202992843201,
"grad_norm": 2.2751264572143555,
"learning_rate": 1.733008676300177e-06,
"loss": 0.5065,
"step": 6800
},
{
"epoch": 4.427456083279115,
"grad_norm": 1.9138133525848389,
"learning_rate": 1.7141751134493815e-06,
"loss": 0.5144,
"step": 6805
},
{
"epoch": 4.430709173715029,
"grad_norm": 1.75284743309021,
"learning_rate": 1.6954408142436955e-06,
"loss": 0.5164,
"step": 6810
},
{
"epoch": 4.433962264150943,
"grad_norm": 1.6290788650512695,
"learning_rate": 1.6768058585443585e-06,
"loss": 0.5197,
"step": 6815
},
{
"epoch": 4.437215354586858,
"grad_norm": 2.135432243347168,
"learning_rate": 1.6582703257891214e-06,
"loss": 0.5252,
"step": 6820
},
{
"epoch": 4.440468445022772,
"grad_norm": 1.6389341354370117,
"learning_rate": 1.63983429499191e-06,
"loss": 0.5217,
"step": 6825
},
{
"epoch": 4.443721535458685,
"grad_norm": 1.6227918863296509,
"learning_rate": 1.6214978447425062e-06,
"loss": 0.5178,
"step": 6830
},
{
"epoch": 4.4469746258946,
"grad_norm": 1.907899022102356,
"learning_rate": 1.603261053206176e-06,
"loss": 0.5235,
"step": 6835
},
{
"epoch": 4.450227716330514,
"grad_norm": 2.548617362976074,
"learning_rate": 1.5851239981233639e-06,
"loss": 0.5238,
"step": 6840
},
{
"epoch": 4.453480806766428,
"grad_norm": 1.8666588068008423,
"learning_rate": 1.5670867568093633e-06,
"loss": 0.5378,
"step": 6845
},
{
"epoch": 4.4567338972023425,
"grad_norm": 1.6732510328292847,
"learning_rate": 1.5491494061539658e-06,
"loss": 0.5101,
"step": 6850
},
{
"epoch": 4.459986987638256,
"grad_norm": 1.560084342956543,
"learning_rate": 1.5313120226211452e-06,
"loss": 0.5318,
"step": 6855
},
{
"epoch": 4.46324007807417,
"grad_norm": 4.673284530639648,
"learning_rate": 1.5135746822487419e-06,
"loss": 0.5279,
"step": 6860
},
{
"epoch": 4.466493168510085,
"grad_norm": 1.600279450416565,
"learning_rate": 1.4959374606481251e-06,
"loss": 0.4943,
"step": 6865
},
{
"epoch": 4.469746258945999,
"grad_norm": 2.073321580886841,
"learning_rate": 1.4784004330038653e-06,
"loss": 0.5204,
"step": 6870
},
{
"epoch": 4.4729993493819125,
"grad_norm": 3.2433438301086426,
"learning_rate": 1.4609636740734316e-06,
"loss": 0.5174,
"step": 6875
},
{
"epoch": 4.476252439817827,
"grad_norm": 2.53226637840271,
"learning_rate": 1.4436272581868665e-06,
"loss": 0.54,
"step": 6880
},
{
"epoch": 4.479505530253741,
"grad_norm": 1.7645595073699951,
"learning_rate": 1.4263912592464597e-06,
"loss": 0.5271,
"step": 6885
},
{
"epoch": 4.482758620689655,
"grad_norm": 1.7925113439559937,
"learning_rate": 1.4092557507264375e-06,
"loss": 0.5169,
"step": 6890
},
{
"epoch": 4.4860117111255695,
"grad_norm": 2.9148597717285156,
"learning_rate": 1.3922208056726644e-06,
"loss": 0.525,
"step": 6895
},
{
"epoch": 4.489264801561483,
"grad_norm": 3.2308194637298584,
"learning_rate": 1.3752864967023105e-06,
"loss": 0.5341,
"step": 6900
},
{
"epoch": 4.492517891997397,
"grad_norm": 1.633375644683838,
"learning_rate": 1.358452896003548e-06,
"loss": 0.5249,
"step": 6905
},
{
"epoch": 4.495770982433312,
"grad_norm": 1.7651923894882202,
"learning_rate": 1.3417200753352538e-06,
"loss": 0.5211,
"step": 6910
},
{
"epoch": 4.499024072869226,
"grad_norm": 1.584030032157898,
"learning_rate": 1.3250881060266952e-06,
"loss": 0.5164,
"step": 6915
},
{
"epoch": 4.5022771633051395,
"grad_norm": 2.4326541423797607,
"learning_rate": 1.3085570589772168e-06,
"loss": 0.5306,
"step": 6920
},
{
"epoch": 4.505530253741054,
"grad_norm": 1.5874032974243164,
"learning_rate": 1.2921270046559658e-06,
"loss": 0.5374,
"step": 6925
},
{
"epoch": 4.508783344176968,
"grad_norm": 2.053276300430298,
"learning_rate": 1.2757980131015563e-06,
"loss": 0.5294,
"step": 6930
},
{
"epoch": 4.512036434612883,
"grad_norm": 1.5977790355682373,
"learning_rate": 1.2595701539217963e-06,
"loss": 0.515,
"step": 6935
},
{
"epoch": 4.5152895250487965,
"grad_norm": 1.5569490194320679,
"learning_rate": 1.2434434962933866e-06,
"loss": 0.5178,
"step": 6940
},
{
"epoch": 4.51854261548471,
"grad_norm": 1.8135985136032104,
"learning_rate": 1.2274181089616172e-06,
"loss": 0.5268,
"step": 6945
},
{
"epoch": 4.521795705920624,
"grad_norm": 1.5852515697479248,
"learning_rate": 1.2114940602400788e-06,
"loss": 0.5192,
"step": 6950
},
{
"epoch": 4.525048796356539,
"grad_norm": 2.1236679553985596,
"learning_rate": 1.19567141801038e-06,
"loss": 0.527,
"step": 6955
},
{
"epoch": 4.528301886792453,
"grad_norm": 2.562978744506836,
"learning_rate": 1.1799502497218368e-06,
"loss": 0.5379,
"step": 6960
},
{
"epoch": 4.531554977228367,
"grad_norm": 1.632822871208191,
"learning_rate": 1.164330622391213e-06,
"loss": 0.5162,
"step": 6965
},
{
"epoch": 4.534808067664281,
"grad_norm": 2.966524124145508,
"learning_rate": 1.1488126026024087e-06,
"loss": 0.5399,
"step": 6970
},
{
"epoch": 4.538061158100195,
"grad_norm": 1.8411732912063599,
"learning_rate": 1.1333962565061973e-06,
"loss": 0.5232,
"step": 6975
},
{
"epoch": 4.541314248536109,
"grad_norm": 1.5464459657669067,
"learning_rate": 1.118081649819927e-06,
"loss": 0.5168,
"step": 6980
},
{
"epoch": 4.544567338972024,
"grad_norm": 1.7210750579833984,
"learning_rate": 1.1028688478272459e-06,
"loss": 0.5327,
"step": 6985
},
{
"epoch": 4.547820429407937,
"grad_norm": 2.1294288635253906,
"learning_rate": 1.0877579153778323e-06,
"loss": 0.4963,
"step": 6990
},
{
"epoch": 4.551073519843852,
"grad_norm": 1.5896108150482178,
"learning_rate": 1.0727489168871092e-06,
"loss": 0.537,
"step": 6995
},
{
"epoch": 4.554326610279766,
"grad_norm": 1.6593022346496582,
"learning_rate": 1.0578419163359666e-06,
"loss": 0.5164,
"step": 7000
},
{
"epoch": 4.55757970071568,
"grad_norm": 1.6132862567901611,
"learning_rate": 1.0430369772705034e-06,
"loss": 0.5246,
"step": 7005
},
{
"epoch": 4.560832791151594,
"grad_norm": 1.6968963146209717,
"learning_rate": 1.028334162801739e-06,
"loss": 0.5169,
"step": 7010
},
{
"epoch": 4.564085881587508,
"grad_norm": 3.422121524810791,
"learning_rate": 1.0137335356053545e-06,
"loss": 0.5306,
"step": 7015
},
{
"epoch": 4.567338972023422,
"grad_norm": 2.2838146686553955,
"learning_rate": 9.99235157921427e-07,
"loss": 0.536,
"step": 7020
},
{
"epoch": 4.570592062459337,
"grad_norm": 1.923091173171997,
"learning_rate": 9.8483909155416e-07,
"loss": 0.5165,
"step": 7025
},
{
"epoch": 4.573845152895251,
"grad_norm": 1.5500158071517944,
"learning_rate": 9.705453978716112e-07,
"loss": 0.5086,
"step": 7030
},
{
"epoch": 4.577098243331164,
"grad_norm": 1.948114037513733,
"learning_rate": 9.56354137805457e-07,
"loss": 0.5262,
"step": 7035
},
{
"epoch": 4.580351333767078,
"grad_norm": 2.5097603797912598,
"learning_rate": 9.422653718507007e-07,
"loss": 0.5353,
"step": 7040
},
{
"epoch": 4.583604424202993,
"grad_norm": 1.757633090019226,
"learning_rate": 9.282791600654428e-07,
"loss": 0.5167,
"step": 7045
},
{
"epoch": 4.586857514638907,
"grad_norm": 2.2960455417633057,
"learning_rate": 9.14395562070594e-07,
"loss": 0.5264,
"step": 7050
},
{
"epoch": 4.5901106050748215,
"grad_norm": 1.556706428527832,
"learning_rate": 9.006146370496654e-07,
"loss": 0.5177,
"step": 7055
},
{
"epoch": 4.593363695510735,
"grad_norm": 1.7054029703140259,
"learning_rate": 8.869364437484678e-07,
"loss": 0.4893,
"step": 7060
},
{
"epoch": 4.596616785946649,
"grad_norm": 1.746472716331482,
"learning_rate": 8.733610404748904e-07,
"loss": 0.5093,
"step": 7065
},
{
"epoch": 4.599869876382563,
"grad_norm": 2.1942458152770996,
"learning_rate": 8.598884850986533e-07,
"loss": 0.5299,
"step": 7070
},
{
"epoch": 4.603122966818478,
"grad_norm": 2.43866229057312,
"learning_rate": 8.465188350510411e-07,
"loss": 0.5282,
"step": 7075
},
{
"epoch": 4.6063760572543915,
"grad_norm": 1.625575304031372,
"learning_rate": 8.332521473246758e-07,
"loss": 0.5189,
"step": 7080
},
{
"epoch": 4.609629147690306,
"grad_norm": 2.3699636459350586,
"learning_rate": 8.200884784732688e-07,
"loss": 0.5249,
"step": 7085
},
{
"epoch": 4.61288223812622,
"grad_norm": 1.750931739807129,
"learning_rate": 8.070278846113749e-07,
"loss": 0.5165,
"step": 7090
},
{
"epoch": 4.616135328562134,
"grad_norm": 1.8055213689804077,
"learning_rate": 7.940704214141614e-07,
"loss": 0.5315,
"step": 7095
},
{
"epoch": 4.6193884189980485,
"grad_norm": 2.2767059803009033,
"learning_rate": 7.812161441171611e-07,
"loss": 0.5232,
"step": 7100
},
{
"epoch": 4.622641509433962,
"grad_norm": 1.4966483116149902,
"learning_rate": 7.684651075160531e-07,
"loss": 0.5045,
"step": 7105
},
{
"epoch": 4.625894599869876,
"grad_norm": 2.188704490661621,
"learning_rate": 7.558173659664075e-07,
"loss": 0.5201,
"step": 7110
},
{
"epoch": 4.629147690305791,
"grad_norm": 2.934805154800415,
"learning_rate": 7.432729733834631e-07,
"loss": 0.5247,
"step": 7115
},
{
"epoch": 4.632400780741705,
"grad_norm": 1.9948830604553223,
"learning_rate": 7.308319832419141e-07,
"loss": 0.5247,
"step": 7120
},
{
"epoch": 4.6356538711776185,
"grad_norm": 1.8401069641113281,
"learning_rate": 7.18494448575649e-07,
"loss": 0.5364,
"step": 7125
},
{
"epoch": 4.638906961613533,
"grad_norm": 1.45015549659729,
"learning_rate": 7.062604219775531e-07,
"loss": 0.5106,
"step": 7130
},
{
"epoch": 4.642160052049447,
"grad_norm": 1.7785407304763794,
"learning_rate": 6.941299555992737e-07,
"loss": 0.5117,
"step": 7135
},
{
"epoch": 4.645413142485361,
"grad_norm": 2.026643753051758,
"learning_rate": 6.821031011509937e-07,
"loss": 0.5039,
"step": 7140
},
{
"epoch": 4.648666232921276,
"grad_norm": 1.6481338739395142,
"learning_rate": 6.701799099012141e-07,
"loss": 0.5385,
"step": 7145
},
{
"epoch": 4.651919323357189,
"grad_norm": 2.9961116313934326,
"learning_rate": 6.583604326765496e-07,
"loss": 0.5148,
"step": 7150
},
{
"epoch": 4.655172413793103,
"grad_norm": 1.7340404987335205,
"learning_rate": 6.466447198614806e-07,
"loss": 0.4913,
"step": 7155
},
{
"epoch": 4.658425504229018,
"grad_norm": 1.569608211517334,
"learning_rate": 6.350328213981654e-07,
"loss": 0.5052,
"step": 7160
},
{
"epoch": 4.661678594664932,
"grad_norm": 1.9746705293655396,
"learning_rate": 6.235247867862226e-07,
"loss": 0.4885,
"step": 7165
},
{
"epoch": 4.6649316851008455,
"grad_norm": 1.7358078956604004,
"learning_rate": 6.121206650825162e-07,
"loss": 0.5256,
"step": 7170
},
{
"epoch": 4.66818477553676,
"grad_norm": 1.609820008277893,
"learning_rate": 6.008205049009341e-07,
"loss": 0.5275,
"step": 7175
},
{
"epoch": 4.671437865972674,
"grad_norm": 1.8338040113449097,
"learning_rate": 5.896243544122076e-07,
"loss": 0.5019,
"step": 7180
},
{
"epoch": 4.674690956408588,
"grad_norm": 1.7695443630218506,
"learning_rate": 5.785322613436894e-07,
"loss": 0.5287,
"step": 7185
},
{
"epoch": 4.677944046844503,
"grad_norm": 1.9566013813018799,
"learning_rate": 5.675442729791425e-07,
"loss": 0.5262,
"step": 7190
},
{
"epoch": 4.681197137280416,
"grad_norm": 1.9720107316970825,
"learning_rate": 5.566604361585626e-07,
"loss": 0.5327,
"step": 7195
},
{
"epoch": 4.68445022771633,
"grad_norm": 2.7521474361419678,
"learning_rate": 5.458807972779534e-07,
"loss": 0.5002,
"step": 7200
},
{
"epoch": 4.687703318152245,
"grad_norm": 6.726840019226074,
"learning_rate": 5.352054022891406e-07,
"loss": 0.52,
"step": 7205
},
{
"epoch": 4.690956408588159,
"grad_norm": 1.8968901634216309,
"learning_rate": 5.246342966995888e-07,
"loss": 0.5259,
"step": 7210
},
{
"epoch": 4.694209499024073,
"grad_norm": 1.647226333618164,
"learning_rate": 5.141675255721762e-07,
"loss": 0.532,
"step": 7215
},
{
"epoch": 4.697462589459987,
"grad_norm": 2.063908576965332,
"learning_rate": 5.038051335250316e-07,
"loss": 0.5132,
"step": 7220
},
{
"epoch": 4.700715679895901,
"grad_norm": 1.5827159881591797,
"learning_rate": 4.935471647313284e-07,
"loss": 0.515,
"step": 7225
},
{
"epoch": 4.703968770331815,
"grad_norm": 1.9684885740280151,
"learning_rate": 4.833936629191016e-07,
"loss": 0.5054,
"step": 7230
},
{
"epoch": 4.70722186076773,
"grad_norm": 2.0594069957733154,
"learning_rate": 4.7334467137105933e-07,
"loss": 0.5235,
"step": 7235
},
{
"epoch": 4.7104749512036435,
"grad_norm": 1.9911025762557983,
"learning_rate": 4.634002329244047e-07,
"loss": 0.5146,
"step": 7240
},
{
"epoch": 4.713728041639557,
"grad_norm": 1.7078765630722046,
"learning_rate": 4.535603899706448e-07,
"loss": 0.5174,
"step": 7245
},
{
"epoch": 4.716981132075472,
"grad_norm": 1.6561988592147827,
"learning_rate": 4.438251844554098e-07,
"loss": 0.5201,
"step": 7250
},
{
"epoch": 4.720234222511386,
"grad_norm": 1.7727643251419067,
"learning_rate": 4.341946578782868e-07,
"loss": 0.5185,
"step": 7255
},
{
"epoch": 4.7234873129473,
"grad_norm": 1.8667991161346436,
"learning_rate": 4.2466885129262004e-07,
"loss": 0.5033,
"step": 7260
},
{
"epoch": 4.726740403383214,
"grad_norm": 1.5502984523773193,
"learning_rate": 4.152478053053632e-07,
"loss": 0.5328,
"step": 7265
},
{
"epoch": 4.729993493819128,
"grad_norm": 1.9481128454208374,
"learning_rate": 4.059315600768887e-07,
"loss": 0.5151,
"step": 7270
},
{
"epoch": 4.733246584255042,
"grad_norm": 2.2370522022247314,
"learning_rate": 3.967201553208122e-07,
"loss": 0.5126,
"step": 7275
},
{
"epoch": 4.736499674690957,
"grad_norm": 1.9233421087265015,
"learning_rate": 3.876136303038458e-07,
"loss": 0.5224,
"step": 7280
},
{
"epoch": 4.7397527651268705,
"grad_norm": 1.6725999116897583,
"learning_rate": 3.7861202384560644e-07,
"loss": 0.5343,
"step": 7285
},
{
"epoch": 4.743005855562784,
"grad_norm": 1.5591987371444702,
"learning_rate": 3.6971537431846057e-07,
"loss": 0.5073,
"step": 7290
},
{
"epoch": 4.746258945998699,
"grad_norm": 1.721091866493225,
"learning_rate": 3.609237196473658e-07,
"loss": 0.5274,
"step": 7295
},
{
"epoch": 4.749512036434613,
"grad_norm": 1.7789474725723267,
"learning_rate": 3.5223709730970446e-07,
"loss": 0.5072,
"step": 7300
},
{
"epoch": 4.752765126870527,
"grad_norm": 1.7836154699325562,
"learning_rate": 3.4365554433511416e-07,
"loss": 0.5126,
"step": 7305
},
{
"epoch": 4.756018217306441,
"grad_norm": 1.6633206605911255,
"learning_rate": 3.3517909730534926e-07,
"loss": 0.5137,
"step": 7310
},
{
"epoch": 4.759271307742355,
"grad_norm": 3.098612070083618,
"learning_rate": 3.268077923541085e-07,
"loss": 0.5061,
"step": 7315
},
{
"epoch": 4.762524398178269,
"grad_norm": 1.939909815788269,
"learning_rate": 3.185416651668882e-07,
"loss": 0.5349,
"step": 7320
},
{
"epoch": 4.765777488614184,
"grad_norm": 1.7502937316894531,
"learning_rate": 3.1038075098083485e-07,
"loss": 0.5032,
"step": 7325
},
{
"epoch": 4.7690305790500975,
"grad_norm": 1.7100647687911987,
"learning_rate": 3.023250845845815e-07,
"loss": 0.5133,
"step": 7330
},
{
"epoch": 4.772283669486011,
"grad_norm": 1.752884030342102,
"learning_rate": 2.943747003181091e-07,
"loss": 0.5358,
"step": 7335
},
{
"epoch": 4.775536759921926,
"grad_norm": 1.700315237045288,
"learning_rate": 2.8652963207260184e-07,
"loss": 0.5048,
"step": 7340
},
{
"epoch": 4.77878985035784,
"grad_norm": 2.1294970512390137,
"learning_rate": 2.787899132902949e-07,
"loss": 0.4829,
"step": 7345
},
{
"epoch": 4.782042940793754,
"grad_norm": 1.8150845766067505,
"learning_rate": 2.711555769643381e-07,
"loss": 0.512,
"step": 7350
},
{
"epoch": 4.785296031229668,
"grad_norm": 2.119196653366089,
"learning_rate": 2.636266556386546e-07,
"loss": 0.5267,
"step": 7355
},
{
"epoch": 4.788549121665582,
"grad_norm": 2.050795793533325,
"learning_rate": 2.562031814077964e-07,
"loss": 0.5089,
"step": 7360
},
{
"epoch": 4.791802212101496,
"grad_norm": 1.6425533294677734,
"learning_rate": 2.488851859168112e-07,
"loss": 0.5168,
"step": 7365
},
{
"epoch": 4.795055302537411,
"grad_norm": 1.8162248134613037,
"learning_rate": 2.4167270036111743e-07,
"loss": 0.5028,
"step": 7370
},
{
"epoch": 4.798308392973325,
"grad_norm": 1.7280550003051758,
"learning_rate": 2.345657554863545e-07,
"loss": 0.5127,
"step": 7375
},
{
"epoch": 4.801561483409239,
"grad_norm": 1.8187389373779297,
"learning_rate": 2.2756438158826053e-07,
"loss": 0.5349,
"step": 7380
},
{
"epoch": 4.804814573845153,
"grad_norm": 1.7695400714874268,
"learning_rate": 2.2066860851253922e-07,
"loss": 0.5211,
"step": 7385
},
{
"epoch": 4.808067664281067,
"grad_norm": 3.8797385692596436,
"learning_rate": 2.1387846565474045e-07,
"loss": 0.5189,
"step": 7390
},
{
"epoch": 4.811320754716981,
"grad_norm": 1.7038609981536865,
"learning_rate": 2.0719398196012707e-07,
"loss": 0.5342,
"step": 7395
},
{
"epoch": 4.814573845152895,
"grad_norm": 1.7032898664474487,
"learning_rate": 2.0061518592355277e-07,
"loss": 0.5139,
"step": 7400
},
{
"epoch": 4.817826935588809,
"grad_norm": 1.818298101425171,
"learning_rate": 1.9414210558934554e-07,
"loss": 0.5198,
"step": 7405
},
{
"epoch": 4.821080026024724,
"grad_norm": 1.8034625053405762,
"learning_rate": 1.8777476855118547e-07,
"loss": 0.5314,
"step": 7410
},
{
"epoch": 4.824333116460638,
"grad_norm": 3.0345141887664795,
"learning_rate": 1.8151320195197997e-07,
"loss": 0.5387,
"step": 7415
},
{
"epoch": 4.827586206896552,
"grad_norm": 1.8238238096237183,
"learning_rate": 1.753574324837609e-07,
"loss": 0.5219,
"step": 7420
},
{
"epoch": 4.830839297332465,
"grad_norm": 1.8523563146591187,
"learning_rate": 1.6930748638756266e-07,
"loss": 0.5075,
"step": 7425
},
{
"epoch": 4.83409238776838,
"grad_norm": 1.592421531677246,
"learning_rate": 1.6336338945331098e-07,
"loss": 0.512,
"step": 7430
},
{
"epoch": 4.837345478204294,
"grad_norm": 1.459957480430603,
"learning_rate": 1.57525167019712e-07,
"loss": 0.5154,
"step": 7435
},
{
"epoch": 4.840598568640209,
"grad_norm": 1.7186057567596436,
"learning_rate": 1.517928439741495e-07,
"loss": 0.5316,
"step": 7440
},
{
"epoch": 4.8438516590761225,
"grad_norm": 1.5618077516555786,
"learning_rate": 1.461664447525768e-07,
"loss": 0.4997,
"step": 7445
},
{
"epoch": 4.847104749512036,
"grad_norm": 1.9501081705093384,
"learning_rate": 1.4064599333940555e-07,
"loss": 0.5115,
"step": 7450
},
{
"epoch": 4.85035783994795,
"grad_norm": 1.593405842781067,
"learning_rate": 1.3523151326741702e-07,
"loss": 0.5062,
"step": 7455
},
{
"epoch": 4.853610930383865,
"grad_norm": 1.6193232536315918,
"learning_rate": 1.299230276176483e-07,
"loss": 0.5096,
"step": 7460
},
{
"epoch": 4.856864020819779,
"grad_norm": 1.7456111907958984,
"learning_rate": 1.247205590192979e-07,
"loss": 0.5154,
"step": 7465
},
{
"epoch": 4.860117111255693,
"grad_norm": 1.7586069107055664,
"learning_rate": 1.1962412964964254e-07,
"loss": 0.5285,
"step": 7470
},
{
"epoch": 4.863370201691607,
"grad_norm": 2.715386152267456,
"learning_rate": 1.1463376123391766e-07,
"loss": 0.4909,
"step": 7475
},
{
"epoch": 4.866623292127521,
"grad_norm": 2.343010902404785,
"learning_rate": 1.0974947504524269e-07,
"loss": 0.5142,
"step": 7480
},
{
"epoch": 4.869876382563435,
"grad_norm": 1.7289972305297852,
"learning_rate": 1.0497129190452926e-07,
"loss": 0.5191,
"step": 7485
},
{
"epoch": 4.8731294729993495,
"grad_norm": 1.742447018623352,
"learning_rate": 1.0029923218038972e-07,
"loss": 0.5248,
"step": 7490
},
{
"epoch": 4.876382563435263,
"grad_norm": 1.901174545288086,
"learning_rate": 9.573331578904e-08,
"loss": 0.5213,
"step": 7495
},
{
"epoch": 4.879635653871178,
"grad_norm": 2.5633485317230225,
"learning_rate": 9.127356219423843e-08,
"loss": 0.5136,
"step": 7500
},
{
"epoch": 4.882888744307092,
"grad_norm": 1.8684697151184082,
"learning_rate": 8.691999040717491e-08,
"loss": 0.5188,
"step": 7505
},
{
"epoch": 4.886141834743006,
"grad_norm": 2.1927521228790283,
"learning_rate": 8.267261898641798e-08,
"loss": 0.5119,
"step": 7510
},
{
"epoch": 4.8893949251789195,
"grad_norm": 1.949514627456665,
"learning_rate": 7.853146603780947e-08,
"loss": 0.5147,
"step": 7515
},
{
"epoch": 4.892648015614834,
"grad_norm": 2.0919501781463623,
"learning_rate": 7.449654921440618e-08,
"loss": 0.5064,
"step": 7520
},
{
"epoch": 4.895901106050748,
"grad_norm": 1.5901788473129272,
"learning_rate": 7.056788571639105e-08,
"loss": 0.5109,
"step": 7525
},
{
"epoch": 4.899154196486663,
"grad_norm": 1.8604559898376465,
"learning_rate": 6.674549229101767e-08,
"loss": 0.526,
"step": 7530
},
{
"epoch": 4.9024072869225765,
"grad_norm": 1.8954790830612183,
"learning_rate": 6.302938523251589e-08,
"loss": 0.5039,
"step": 7535
},
{
"epoch": 4.90566037735849,
"grad_norm": 1.7178046703338623,
"learning_rate": 5.941958038204187e-08,
"loss": 0.5219,
"step": 7540
},
{
"epoch": 4.908913467794405,
"grad_norm": 5.730696201324463,
"learning_rate": 5.59160931275976e-08,
"loss": 0.5004,
"step": 7545
},
{
"epoch": 4.912166558230319,
"grad_norm": 1.6460310220718384,
"learning_rate": 5.2518938403978145e-08,
"loss": 0.5319,
"step": 7550
},
{
"epoch": 4.915419648666233,
"grad_norm": 1.6308308839797974,
"learning_rate": 4.922813069269394e-08,
"loss": 0.5214,
"step": 7555
},
{
"epoch": 4.918672739102147,
"grad_norm": 1.5023380517959595,
"learning_rate": 4.604368402191528e-08,
"loss": 0.5008,
"step": 7560
},
{
"epoch": 4.921925829538061,
"grad_norm": 1.7468260526657104,
"learning_rate": 4.2965611966416796e-08,
"loss": 0.5007,
"step": 7565
},
{
"epoch": 4.925178919973975,
"grad_norm": 1.9006001949310303,
"learning_rate": 3.9993927647516415e-08,
"loss": 0.51,
"step": 7570
},
{
"epoch": 4.92843201040989,
"grad_norm": 1.9338369369506836,
"learning_rate": 3.71286437330115e-08,
"loss": 0.5216,
"step": 7575
},
{
"epoch": 4.931685100845804,
"grad_norm": 1.6572381258010864,
"learning_rate": 3.4369772437137236e-08,
"loss": 0.542,
"step": 7580
},
{
"epoch": 4.934938191281717,
"grad_norm": 2.3215434551239014,
"learning_rate": 3.1717325520513876e-08,
"loss": 0.513,
"step": 7585
},
{
"epoch": 4.938191281717632,
"grad_norm": 1.6987948417663574,
"learning_rate": 2.9171314290080132e-08,
"loss": 0.5284,
"step": 7590
},
{
"epoch": 4.941444372153546,
"grad_norm": 1.7099159955978394,
"learning_rate": 2.6731749599065435e-08,
"loss": 0.5267,
"step": 7595
},
{
"epoch": 4.94469746258946,
"grad_norm": 1.620719075202942,
"learning_rate": 2.4398641846937187e-08,
"loss": 0.5248,
"step": 7600
},
{
"epoch": 4.9479505530253745,
"grad_norm": 1.8238213062286377,
"learning_rate": 2.2172000979345242e-08,
"loss": 0.5268,
"step": 7605
},
{
"epoch": 4.951203643461288,
"grad_norm": 2.011178970336914,
"learning_rate": 2.0051836488094167e-08,
"loss": 0.5184,
"step": 7610
},
{
"epoch": 4.954456733897202,
"grad_norm": 1.6856626272201538,
"learning_rate": 1.8038157411101597e-08,
"loss": 0.5102,
"step": 7615
},
{
"epoch": 4.957709824333117,
"grad_norm": 2.0251147747039795,
"learning_rate": 1.6130972332345505e-08,
"loss": 0.5112,
"step": 7620
},
{
"epoch": 4.960962914769031,
"grad_norm": 2.00230073928833,
"learning_rate": 1.4330289381844775e-08,
"loss": 0.5224,
"step": 7625
},
{
"epoch": 4.964216005204944,
"grad_norm": 1.570320963859558,
"learning_rate": 1.2636116235612005e-08,
"loss": 0.5315,
"step": 7630
},
{
"epoch": 4.967469095640859,
"grad_norm": 1.6889746189117432,
"learning_rate": 1.1048460115634096e-08,
"loss": 0.5193,
"step": 7635
},
{
"epoch": 4.970722186076773,
"grad_norm": 2.201413631439209,
"learning_rate": 9.567327789825054e-09,
"loss": 0.5286,
"step": 7640
},
{
"epoch": 4.973975276512687,
"grad_norm": 1.7942432165145874,
"learning_rate": 8.192725572006565e-09,
"loss": 0.5211,
"step": 7645
},
{
"epoch": 4.9772283669486015,
"grad_norm": 1.7889728546142578,
"learning_rate": 6.924659321888571e-09,
"loss": 0.5164,
"step": 7650
},
{
"epoch": 4.980481457384515,
"grad_norm": 2.543469190597534,
"learning_rate": 5.763134445022078e-09,
"loss": 0.5054,
"step": 7655
},
{
"epoch": 4.983734547820429,
"grad_norm": 5.216160297393799,
"learning_rate": 4.7081558927991594e-09,
"loss": 0.4954,
"step": 7660
},
{
"epoch": 4.986987638256344,
"grad_norm": 2.649937868118286,
"learning_rate": 3.759728162422427e-09,
"loss": 0.5127,
"step": 7665
},
{
"epoch": 4.990240728692258,
"grad_norm": 1.9266875982284546,
"learning_rate": 2.9178552968800454e-09,
"loss": 0.5304,
"step": 7670
},
{
"epoch": 4.9934938191281715,
"grad_norm": 1.6233766078948975,
"learning_rate": 2.1825408849401873e-09,
"loss": 0.5277,
"step": 7675
},
{
"epoch": 4.996746909564086,
"grad_norm": 1.7894667387008667,
"learning_rate": 1.5537880611260491e-09,
"loss": 0.5239,
"step": 7680
},
{
"epoch": 5.0,
"grad_norm": 1.5771998167037964,
"learning_rate": 1.0315995057075256e-09,
"loss": 0.5174,
"step": 7685
},
{
"epoch": 5.0,
"eval_f1": 0.7944165410554209,
"eval_loss": 0.54638671875,
"eval_precision": 0.7945063287994906,
"eval_recall": 0.7943501451962497,
"eval_runtime": 257.0765,
"eval_samples_per_second": 1530.42,
"eval_steps_per_second": 1.498,
"step": 7685
},
{
"epoch": 5.0,
"step": 7685,
"total_flos": 5.363134814553637e+18,
"train_loss": 0.7729351929743412,
"train_runtime": 35402.7725,
"train_samples_per_second": 444.524,
"train_steps_per_second": 0.217
}
],
"logging_steps": 5,
"max_steps": 7685,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 5.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.363134814553637e+18,
"train_batch_size": 512,
"trial_name": null,
"trial_params": null
}