roleplayer-actor-lora / trainer_state.json
aifeifei798's picture
Upload 17 files
6656e66 verified
raw
history blame
60.4 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4465031016252844,
"eval_steps": 500,
"global_step": 3410,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013093932598981946,
"grad_norm": 0.9729704260826111,
"learning_rate": 0.00018,
"loss": 2.9613,
"step": 10
},
{
"epoch": 0.002618786519796389,
"grad_norm": 0.4988560676574707,
"learning_rate": 0.00019976402726796014,
"loss": 2.2501,
"step": 20
},
{
"epoch": 0.003928179779694584,
"grad_norm": 0.3629654049873352,
"learning_rate": 0.0001995018353434714,
"loss": 1.9558,
"step": 30
},
{
"epoch": 0.005237573039592778,
"grad_norm": 0.42317306995391846,
"learning_rate": 0.0001992396434189827,
"loss": 1.8904,
"step": 40
},
{
"epoch": 0.006546966299490973,
"grad_norm": 0.4342662990093231,
"learning_rate": 0.00019897745149449398,
"loss": 1.9487,
"step": 50
},
{
"epoch": 0.007856359559389167,
"grad_norm": 0.4164058268070221,
"learning_rate": 0.00019871525957000524,
"loss": 1.845,
"step": 60
},
{
"epoch": 0.009165752819287363,
"grad_norm": 0.38950663805007935,
"learning_rate": 0.0001984530676455165,
"loss": 1.8264,
"step": 70
},
{
"epoch": 0.010475146079185557,
"grad_norm": 0.42093154788017273,
"learning_rate": 0.00019819087572102778,
"loss": 1.8418,
"step": 80
},
{
"epoch": 0.011784539339083753,
"grad_norm": 0.4716477394104004,
"learning_rate": 0.00019792868379653908,
"loss": 1.8346,
"step": 90
},
{
"epoch": 0.013093932598981946,
"grad_norm": 0.4358816146850586,
"learning_rate": 0.00019766649187205035,
"loss": 1.8271,
"step": 100
},
{
"epoch": 0.014403325858880142,
"grad_norm": 0.45478910207748413,
"learning_rate": 0.00019740429994756162,
"loss": 1.7506,
"step": 110
},
{
"epoch": 0.015712719118778334,
"grad_norm": 0.4366815388202667,
"learning_rate": 0.00019714210802307289,
"loss": 1.7854,
"step": 120
},
{
"epoch": 0.01702211237867653,
"grad_norm": 0.45096880197525024,
"learning_rate": 0.00019687991609858418,
"loss": 1.779,
"step": 130
},
{
"epoch": 0.018331505638574726,
"grad_norm": 0.4566694498062134,
"learning_rate": 0.00019661772417409545,
"loss": 1.7509,
"step": 140
},
{
"epoch": 0.01964089889847292,
"grad_norm": 0.4729042649269104,
"learning_rate": 0.00019635553224960672,
"loss": 1.7271,
"step": 150
},
{
"epoch": 0.020950292158371114,
"grad_norm": 0.46566858887672424,
"learning_rate": 0.000196093340325118,
"loss": 1.714,
"step": 160
},
{
"epoch": 0.02225968541826931,
"grad_norm": 0.45467349886894226,
"learning_rate": 0.00019583114840062926,
"loss": 1.702,
"step": 170
},
{
"epoch": 0.023569078678167505,
"grad_norm": 0.434721440076828,
"learning_rate": 0.00019556895647614055,
"loss": 1.7162,
"step": 180
},
{
"epoch": 0.024878471938065697,
"grad_norm": 0.5182896852493286,
"learning_rate": 0.00019530676455165182,
"loss": 1.688,
"step": 190
},
{
"epoch": 0.026187865197963893,
"grad_norm": 0.5060753226280212,
"learning_rate": 0.0001950445726271631,
"loss": 1.6955,
"step": 200
},
{
"epoch": 0.02749725845786209,
"grad_norm": 0.46147406101226807,
"learning_rate": 0.00019478238070267436,
"loss": 1.681,
"step": 210
},
{
"epoch": 0.028806651717760284,
"grad_norm": 0.4517662823200226,
"learning_rate": 0.00019452018877818563,
"loss": 1.6936,
"step": 220
},
{
"epoch": 0.030116044977658477,
"grad_norm": 0.44920527935028076,
"learning_rate": 0.00019425799685369693,
"loss": 1.6633,
"step": 230
},
{
"epoch": 0.03142543823755667,
"grad_norm": 0.5066579580307007,
"learning_rate": 0.0001939958049292082,
"loss": 1.6872,
"step": 240
},
{
"epoch": 0.03273483149745487,
"grad_norm": 0.5238184928894043,
"learning_rate": 0.00019373361300471946,
"loss": 1.6255,
"step": 250
},
{
"epoch": 0.03404422475735306,
"grad_norm": 0.4943958520889282,
"learning_rate": 0.00019347142108023073,
"loss": 1.6499,
"step": 260
},
{
"epoch": 0.03535361801725126,
"grad_norm": 0.48346492648124695,
"learning_rate": 0.00019320922915574203,
"loss": 1.672,
"step": 270
},
{
"epoch": 0.03666301127714945,
"grad_norm": 0.4401436746120453,
"learning_rate": 0.0001929470372312533,
"loss": 1.6863,
"step": 280
},
{
"epoch": 0.037972404537047644,
"grad_norm": 0.4602312743663788,
"learning_rate": 0.00019268484530676457,
"loss": 1.646,
"step": 290
},
{
"epoch": 0.03928179779694584,
"grad_norm": 0.4927528202533722,
"learning_rate": 0.00019242265338227584,
"loss": 1.6252,
"step": 300
},
{
"epoch": 0.040591191056844035,
"grad_norm": 0.5075507760047913,
"learning_rate": 0.0001921604614577871,
"loss": 1.6218,
"step": 310
},
{
"epoch": 0.04190058431674223,
"grad_norm": 0.5239428877830505,
"learning_rate": 0.0001918982695332984,
"loss": 1.6354,
"step": 320
},
{
"epoch": 0.043209977576640426,
"grad_norm": 0.5954804420471191,
"learning_rate": 0.00019163607760880967,
"loss": 1.7022,
"step": 330
},
{
"epoch": 0.04451937083653862,
"grad_norm": 0.5364096760749817,
"learning_rate": 0.00019137388568432094,
"loss": 1.5981,
"step": 340
},
{
"epoch": 0.04582876409643681,
"grad_norm": 0.55096435546875,
"learning_rate": 0.0001911116937598322,
"loss": 1.6211,
"step": 350
},
{
"epoch": 0.04713815735633501,
"grad_norm": 0.5193445682525635,
"learning_rate": 0.00019084950183534348,
"loss": 1.6195,
"step": 360
},
{
"epoch": 0.0484475506162332,
"grad_norm": 0.528788685798645,
"learning_rate": 0.00019058730991085477,
"loss": 1.6076,
"step": 370
},
{
"epoch": 0.049756943876131395,
"grad_norm": 0.5360815525054932,
"learning_rate": 0.00019032511798636604,
"loss": 1.5912,
"step": 380
},
{
"epoch": 0.051066337136029594,
"grad_norm": 0.5031074285507202,
"learning_rate": 0.0001900629260618773,
"loss": 1.6157,
"step": 390
},
{
"epoch": 0.052375730395927786,
"grad_norm": 0.5149925351142883,
"learning_rate": 0.00018980073413738858,
"loss": 1.579,
"step": 400
},
{
"epoch": 0.053685123655825985,
"grad_norm": 0.5419250726699829,
"learning_rate": 0.00018953854221289985,
"loss": 1.6242,
"step": 410
},
{
"epoch": 0.05499451691572418,
"grad_norm": 0.5513054728507996,
"learning_rate": 0.00018927635028841112,
"loss": 1.5948,
"step": 420
},
{
"epoch": 0.05630391017562237,
"grad_norm": 0.5670781135559082,
"learning_rate": 0.0001890141583639224,
"loss": 1.5314,
"step": 430
},
{
"epoch": 0.05761330343552057,
"grad_norm": 0.5327165722846985,
"learning_rate": 0.00018875196643943366,
"loss": 1.5716,
"step": 440
},
{
"epoch": 0.05892269669541876,
"grad_norm": 0.5244112610816956,
"learning_rate": 0.00018848977451494493,
"loss": 1.5347,
"step": 450
},
{
"epoch": 0.06023208995531695,
"grad_norm": 0.5349589586257935,
"learning_rate": 0.00018822758259045622,
"loss": 1.564,
"step": 460
},
{
"epoch": 0.06154148321521515,
"grad_norm": 0.5296887755393982,
"learning_rate": 0.0001879653906659675,
"loss": 1.5779,
"step": 470
},
{
"epoch": 0.06285087647511334,
"grad_norm": 0.5426337718963623,
"learning_rate": 0.00018770319874147876,
"loss": 1.5112,
"step": 480
},
{
"epoch": 0.06416026973501154,
"grad_norm": 0.5532763004302979,
"learning_rate": 0.00018744100681699003,
"loss": 1.5458,
"step": 490
},
{
"epoch": 0.06546966299490974,
"grad_norm": 0.5318668484687805,
"learning_rate": 0.00018717881489250133,
"loss": 1.5597,
"step": 500
},
{
"epoch": 0.06677905625480793,
"grad_norm": 0.6084654331207275,
"learning_rate": 0.0001869166229680126,
"loss": 1.5485,
"step": 510
},
{
"epoch": 0.06808844951470612,
"grad_norm": 0.5626131296157837,
"learning_rate": 0.00018665443104352386,
"loss": 1.5217,
"step": 520
},
{
"epoch": 0.06939784277460431,
"grad_norm": 0.528758704662323,
"learning_rate": 0.00018639223911903513,
"loss": 1.5343,
"step": 530
},
{
"epoch": 0.07070723603450252,
"grad_norm": 0.5894292593002319,
"learning_rate": 0.0001861300471945464,
"loss": 1.5604,
"step": 540
},
{
"epoch": 0.07201662929440071,
"grad_norm": 0.5676683187484741,
"learning_rate": 0.0001858678552700577,
"loss": 1.5216,
"step": 550
},
{
"epoch": 0.0733260225542989,
"grad_norm": 0.6381473541259766,
"learning_rate": 0.00018560566334556897,
"loss": 1.4334,
"step": 560
},
{
"epoch": 0.0746354158141971,
"grad_norm": 0.6644160151481628,
"learning_rate": 0.00018534347142108024,
"loss": 1.4832,
"step": 570
},
{
"epoch": 0.07594480907409529,
"grad_norm": 0.5856960415840149,
"learning_rate": 0.0001850812794965915,
"loss": 1.5118,
"step": 580
},
{
"epoch": 0.07725420233399348,
"grad_norm": 0.5892801880836487,
"learning_rate": 0.00018481908757210277,
"loss": 1.5028,
"step": 590
},
{
"epoch": 0.07856359559389169,
"grad_norm": 0.5674527883529663,
"learning_rate": 0.00018455689564761407,
"loss": 1.5125,
"step": 600
},
{
"epoch": 0.07987298885378988,
"grad_norm": 0.6059868335723877,
"learning_rate": 0.00018429470372312534,
"loss": 1.4543,
"step": 610
},
{
"epoch": 0.08118238211368807,
"grad_norm": 0.6255605816841125,
"learning_rate": 0.0001840325117986366,
"loss": 1.4851,
"step": 620
},
{
"epoch": 0.08249177537358626,
"grad_norm": 0.5904423594474792,
"learning_rate": 0.00018377031987414788,
"loss": 1.4154,
"step": 630
},
{
"epoch": 0.08380116863348445,
"grad_norm": 0.6035749912261963,
"learning_rate": 0.00018350812794965917,
"loss": 1.4276,
"step": 640
},
{
"epoch": 0.08511056189338265,
"grad_norm": 0.597172737121582,
"learning_rate": 0.00018324593602517044,
"loss": 1.4736,
"step": 650
},
{
"epoch": 0.08641995515328085,
"grad_norm": 0.6352164149284363,
"learning_rate": 0.0001829837441006817,
"loss": 1.4975,
"step": 660
},
{
"epoch": 0.08772934841317905,
"grad_norm": 0.5500873327255249,
"learning_rate": 0.00018272155217619298,
"loss": 1.4578,
"step": 670
},
{
"epoch": 0.08903874167307724,
"grad_norm": 0.6423613429069519,
"learning_rate": 0.00018245936025170425,
"loss": 1.3926,
"step": 680
},
{
"epoch": 0.09034813493297543,
"grad_norm": 0.665908694267273,
"learning_rate": 0.00018219716832721555,
"loss": 1.4548,
"step": 690
},
{
"epoch": 0.09165752819287362,
"grad_norm": 0.6354024410247803,
"learning_rate": 0.00018193497640272682,
"loss": 1.5,
"step": 700
},
{
"epoch": 0.09296692145277183,
"grad_norm": 0.6588740348815918,
"learning_rate": 0.00018167278447823808,
"loss": 1.3609,
"step": 710
},
{
"epoch": 0.09427631471267002,
"grad_norm": 0.6754702925682068,
"learning_rate": 0.00018141059255374935,
"loss": 1.3432,
"step": 720
},
{
"epoch": 0.09558570797256821,
"grad_norm": 0.6337271332740784,
"learning_rate": 0.00018114840062926062,
"loss": 1.4439,
"step": 730
},
{
"epoch": 0.0968951012324664,
"grad_norm": 0.6592088937759399,
"learning_rate": 0.00018088620870477192,
"loss": 1.3949,
"step": 740
},
{
"epoch": 0.0982044944923646,
"grad_norm": 0.6700498461723328,
"learning_rate": 0.0001806240167802832,
"loss": 1.4046,
"step": 750
},
{
"epoch": 0.09951388775226279,
"grad_norm": 0.708410382270813,
"learning_rate": 0.00018036182485579446,
"loss": 1.3021,
"step": 760
},
{
"epoch": 0.100823281012161,
"grad_norm": 0.6718457937240601,
"learning_rate": 0.00018009963293130573,
"loss": 1.3769,
"step": 770
},
{
"epoch": 0.10213267427205919,
"grad_norm": 0.661522388458252,
"learning_rate": 0.00017983744100681702,
"loss": 1.434,
"step": 780
},
{
"epoch": 0.10344206753195738,
"grad_norm": 0.6615481376647949,
"learning_rate": 0.0001795752490823283,
"loss": 1.3839,
"step": 790
},
{
"epoch": 0.10475146079185557,
"grad_norm": 0.696959376335144,
"learning_rate": 0.00017931305715783956,
"loss": 1.3634,
"step": 800
},
{
"epoch": 0.10606085405175376,
"grad_norm": 0.7320592403411865,
"learning_rate": 0.00017905086523335083,
"loss": 1.2737,
"step": 810
},
{
"epoch": 0.10737024731165197,
"grad_norm": 0.7200619578361511,
"learning_rate": 0.0001787886733088621,
"loss": 1.3732,
"step": 820
},
{
"epoch": 0.10867964057155016,
"grad_norm": 0.6982961297035217,
"learning_rate": 0.00017852648138437337,
"loss": 1.3019,
"step": 830
},
{
"epoch": 0.10998903383144835,
"grad_norm": 0.7427386045455933,
"learning_rate": 0.00017826428945988464,
"loss": 1.3398,
"step": 840
},
{
"epoch": 0.11129842709134655,
"grad_norm": 0.7897806763648987,
"learning_rate": 0.0001780020975353959,
"loss": 1.3216,
"step": 850
},
{
"epoch": 0.11260782035124474,
"grad_norm": 0.7520805597305298,
"learning_rate": 0.00017773990561090717,
"loss": 1.2875,
"step": 860
},
{
"epoch": 0.11391721361114293,
"grad_norm": 0.7332555055618286,
"learning_rate": 0.00017747771368641844,
"loss": 1.272,
"step": 870
},
{
"epoch": 0.11522660687104114,
"grad_norm": 0.7135840654373169,
"learning_rate": 0.00017721552176192974,
"loss": 1.3185,
"step": 880
},
{
"epoch": 0.11653600013093933,
"grad_norm": 0.6898264288902283,
"learning_rate": 0.000176953329837441,
"loss": 1.3089,
"step": 890
},
{
"epoch": 0.11784539339083752,
"grad_norm": 0.9488328099250793,
"learning_rate": 0.00017669113791295228,
"loss": 1.2258,
"step": 900
},
{
"epoch": 0.11915478665073571,
"grad_norm": 0.7257933616638184,
"learning_rate": 0.00017642894598846355,
"loss": 1.3284,
"step": 910
},
{
"epoch": 0.1204641799106339,
"grad_norm": 0.7688736915588379,
"learning_rate": 0.00017616675406397484,
"loss": 1.2878,
"step": 920
},
{
"epoch": 0.1217735731705321,
"grad_norm": 0.8328510522842407,
"learning_rate": 0.0001759045621394861,
"loss": 1.2346,
"step": 930
},
{
"epoch": 0.1230829664304303,
"grad_norm": 0.8448120951652527,
"learning_rate": 0.00017564237021499738,
"loss": 1.2926,
"step": 940
},
{
"epoch": 0.1243923596903285,
"grad_norm": 0.8510689735412598,
"learning_rate": 0.00017538017829050865,
"loss": 1.2109,
"step": 950
},
{
"epoch": 0.12570175295022668,
"grad_norm": 0.866874098777771,
"learning_rate": 0.00017511798636601992,
"loss": 1.3091,
"step": 960
},
{
"epoch": 0.12701114621012488,
"grad_norm": 0.9010233879089355,
"learning_rate": 0.00017485579444153122,
"loss": 1.2273,
"step": 970
},
{
"epoch": 0.1283205394700231,
"grad_norm": 0.9316047430038452,
"learning_rate": 0.00017459360251704248,
"loss": 1.2611,
"step": 980
},
{
"epoch": 0.12962993272992127,
"grad_norm": 0.9005467295646667,
"learning_rate": 0.00017433141059255375,
"loss": 1.1747,
"step": 990
},
{
"epoch": 0.13093932598981947,
"grad_norm": 0.8843415975570679,
"learning_rate": 0.00017406921866806502,
"loss": 1.1915,
"step": 1000
},
{
"epoch": 0.13224871924971765,
"grad_norm": 0.8090497851371765,
"learning_rate": 0.0001738070267435763,
"loss": 1.2452,
"step": 1010
},
{
"epoch": 0.13355811250961586,
"grad_norm": 1.2498819828033447,
"learning_rate": 0.0001735448348190876,
"loss": 1.276,
"step": 1020
},
{
"epoch": 0.13486750576951406,
"grad_norm": 0.7861034870147705,
"learning_rate": 0.00017328264289459886,
"loss": 1.1989,
"step": 1030
},
{
"epoch": 0.13617689902941224,
"grad_norm": 0.9525002837181091,
"learning_rate": 0.00017302045097011013,
"loss": 1.1338,
"step": 1040
},
{
"epoch": 0.13748629228931045,
"grad_norm": 0.8066142201423645,
"learning_rate": 0.0001727582590456214,
"loss": 1.1421,
"step": 1050
},
{
"epoch": 0.13879568554920862,
"grad_norm": 0.8200965523719788,
"learning_rate": 0.0001724960671211327,
"loss": 1.1596,
"step": 1060
},
{
"epoch": 0.14010507880910683,
"grad_norm": 0.9981400370597839,
"learning_rate": 0.00017223387519664396,
"loss": 1.0562,
"step": 1070
},
{
"epoch": 0.14141447206900504,
"grad_norm": 0.9273063540458679,
"learning_rate": 0.00017197168327215523,
"loss": 1.1275,
"step": 1080
},
{
"epoch": 0.14272386532890322,
"grad_norm": 0.8812237977981567,
"learning_rate": 0.0001717094913476665,
"loss": 1.0406,
"step": 1090
},
{
"epoch": 0.14403325858880142,
"grad_norm": 0.8970304727554321,
"learning_rate": 0.00017144729942317777,
"loss": 1.1263,
"step": 1100
},
{
"epoch": 0.1453426518486996,
"grad_norm": 0.9097404479980469,
"learning_rate": 0.00017118510749868906,
"loss": 1.1956,
"step": 1110
},
{
"epoch": 0.1466520451085978,
"grad_norm": 1.0246269702911377,
"learning_rate": 0.00017092291557420033,
"loss": 1.0717,
"step": 1120
},
{
"epoch": 0.14796143836849598,
"grad_norm": 1.1149781942367554,
"learning_rate": 0.0001706607236497116,
"loss": 1.076,
"step": 1130
},
{
"epoch": 0.1492708316283942,
"grad_norm": 1.1981500387191772,
"learning_rate": 0.00017039853172522287,
"loss": 1.142,
"step": 1140
},
{
"epoch": 0.1505802248882924,
"grad_norm": 0.9477318525314331,
"learning_rate": 0.00017013633980073414,
"loss": 1.0799,
"step": 1150
},
{
"epoch": 0.15188961814819057,
"grad_norm": 1.0102957487106323,
"learning_rate": 0.00016987414787624544,
"loss": 1.0531,
"step": 1160
},
{
"epoch": 0.15319901140808878,
"grad_norm": 1.1728227138519287,
"learning_rate": 0.0001696119559517567,
"loss": 1.0903,
"step": 1170
},
{
"epoch": 0.15450840466798696,
"grad_norm": 1.0086623430252075,
"learning_rate": 0.00016934976402726797,
"loss": 1.0677,
"step": 1180
},
{
"epoch": 0.15581779792788517,
"grad_norm": 0.8586070537567139,
"learning_rate": 0.00016908757210277924,
"loss": 1.1022,
"step": 1190
},
{
"epoch": 0.15712719118778337,
"grad_norm": 1.2628968954086304,
"learning_rate": 0.00016882538017829054,
"loss": 1.0575,
"step": 1200
},
{
"epoch": 0.15843658444768155,
"grad_norm": 0.9629563689231873,
"learning_rate": 0.0001685631882538018,
"loss": 1.0844,
"step": 1210
},
{
"epoch": 0.15974597770757976,
"grad_norm": 1.0898447036743164,
"learning_rate": 0.00016830099632931308,
"loss": 1.0654,
"step": 1220
},
{
"epoch": 0.16105537096747793,
"grad_norm": 1.13120698928833,
"learning_rate": 0.00016803880440482435,
"loss": 1.0686,
"step": 1230
},
{
"epoch": 0.16236476422737614,
"grad_norm": 1.0732567310333252,
"learning_rate": 0.00016777661248033561,
"loss": 1.084,
"step": 1240
},
{
"epoch": 0.16367415748727435,
"grad_norm": 1.0681878328323364,
"learning_rate": 0.00016751442055584688,
"loss": 0.9979,
"step": 1250
},
{
"epoch": 0.16498355074717252,
"grad_norm": 0.9773361086845398,
"learning_rate": 0.00016725222863135815,
"loss": 1.0841,
"step": 1260
},
{
"epoch": 0.16629294400707073,
"grad_norm": 1.0342450141906738,
"learning_rate": 0.00016699003670686942,
"loss": 1.0176,
"step": 1270
},
{
"epoch": 0.1676023372669689,
"grad_norm": 1.0580531358718872,
"learning_rate": 0.0001667278447823807,
"loss": 0.9858,
"step": 1280
},
{
"epoch": 0.16891173052686712,
"grad_norm": 0.9744387865066528,
"learning_rate": 0.000166465652857892,
"loss": 0.9282,
"step": 1290
},
{
"epoch": 0.1702211237867653,
"grad_norm": 0.9636452198028564,
"learning_rate": 0.00016620346093340326,
"loss": 0.9414,
"step": 1300
},
{
"epoch": 0.1715305170466635,
"grad_norm": 1.1029468774795532,
"learning_rate": 0.00016594126900891453,
"loss": 0.8812,
"step": 1310
},
{
"epoch": 0.1728399103065617,
"grad_norm": 1.2941449880599976,
"learning_rate": 0.0001656790770844258,
"loss": 0.9823,
"step": 1320
},
{
"epoch": 0.17414930356645988,
"grad_norm": 1.627166509628296,
"learning_rate": 0.00016541688515993706,
"loss": 0.9585,
"step": 1330
},
{
"epoch": 0.1754586968263581,
"grad_norm": 1.091630458831787,
"learning_rate": 0.00016515469323544836,
"loss": 0.9516,
"step": 1340
},
{
"epoch": 0.17676809008625627,
"grad_norm": 1.1108227968215942,
"learning_rate": 0.00016489250131095963,
"loss": 0.8998,
"step": 1350
},
{
"epoch": 0.17807748334615447,
"grad_norm": 1.0883326530456543,
"learning_rate": 0.0001646303093864709,
"loss": 0.916,
"step": 1360
},
{
"epoch": 0.17938687660605268,
"grad_norm": 1.2917275428771973,
"learning_rate": 0.00016436811746198217,
"loss": 0.9112,
"step": 1370
},
{
"epoch": 0.18069626986595086,
"grad_norm": 1.1828432083129883,
"learning_rate": 0.00016410592553749344,
"loss": 0.9721,
"step": 1380
},
{
"epoch": 0.18200566312584907,
"grad_norm": 1.3447389602661133,
"learning_rate": 0.00016384373361300473,
"loss": 0.9198,
"step": 1390
},
{
"epoch": 0.18331505638574724,
"grad_norm": 1.0735760927200317,
"learning_rate": 0.000163581541688516,
"loss": 0.8634,
"step": 1400
},
{
"epoch": 0.18462444964564545,
"grad_norm": 1.0454446077346802,
"learning_rate": 0.00016331934976402727,
"loss": 0.9151,
"step": 1410
},
{
"epoch": 0.18593384290554366,
"grad_norm": 1.2230719327926636,
"learning_rate": 0.00016305715783953854,
"loss": 0.9202,
"step": 1420
},
{
"epoch": 0.18724323616544183,
"grad_norm": 1.1030149459838867,
"learning_rate": 0.00016279496591504984,
"loss": 0.9068,
"step": 1430
},
{
"epoch": 0.18855262942534004,
"grad_norm": 1.4471871852874756,
"learning_rate": 0.0001625327739905611,
"loss": 0.8682,
"step": 1440
},
{
"epoch": 0.18986202268523822,
"grad_norm": 1.2458796501159668,
"learning_rate": 0.00016227058206607237,
"loss": 0.8247,
"step": 1450
},
{
"epoch": 0.19117141594513642,
"grad_norm": 1.1849644184112549,
"learning_rate": 0.00016200839014158364,
"loss": 0.8987,
"step": 1460
},
{
"epoch": 0.19248080920503463,
"grad_norm": 1.2985557317733765,
"learning_rate": 0.0001617461982170949,
"loss": 0.8006,
"step": 1470
},
{
"epoch": 0.1937902024649328,
"grad_norm": 1.7127928733825684,
"learning_rate": 0.0001614840062926062,
"loss": 0.8191,
"step": 1480
},
{
"epoch": 0.19509959572483102,
"grad_norm": 1.440895915031433,
"learning_rate": 0.00016122181436811748,
"loss": 0.8129,
"step": 1490
},
{
"epoch": 0.1964089889847292,
"grad_norm": 1.252194881439209,
"learning_rate": 0.00016095962244362875,
"loss": 0.8803,
"step": 1500
},
{
"epoch": 0.1977183822446274,
"grad_norm": 1.138358235359192,
"learning_rate": 0.00016069743051914001,
"loss": 0.8744,
"step": 1510
},
{
"epoch": 0.19902777550452558,
"grad_norm": 1.080971598625183,
"learning_rate": 0.00016043523859465128,
"loss": 0.8693,
"step": 1520
},
{
"epoch": 0.20033716876442378,
"grad_norm": 1.1612547636032104,
"learning_rate": 0.00016017304667016258,
"loss": 0.7991,
"step": 1530
},
{
"epoch": 0.201646562024322,
"grad_norm": 1.1773971319198608,
"learning_rate": 0.00015991085474567385,
"loss": 0.912,
"step": 1540
},
{
"epoch": 0.20295595528422017,
"grad_norm": 1.1353998184204102,
"learning_rate": 0.00015964866282118512,
"loss": 0.7986,
"step": 1550
},
{
"epoch": 0.20426534854411837,
"grad_norm": 1.6848335266113281,
"learning_rate": 0.0001593864708966964,
"loss": 0.6932,
"step": 1560
},
{
"epoch": 0.20557474180401655,
"grad_norm": 1.4043173789978027,
"learning_rate": 0.00015912427897220768,
"loss": 0.8529,
"step": 1570
},
{
"epoch": 0.20688413506391476,
"grad_norm": 1.2601439952850342,
"learning_rate": 0.00015886208704771895,
"loss": 0.8173,
"step": 1580
},
{
"epoch": 0.20819352832381297,
"grad_norm": 1.2090034484863281,
"learning_rate": 0.00015859989512323022,
"loss": 0.7451,
"step": 1590
},
{
"epoch": 0.20950292158371114,
"grad_norm": 1.3334815502166748,
"learning_rate": 0.0001583377031987415,
"loss": 0.775,
"step": 1600
},
{
"epoch": 0.21081231484360935,
"grad_norm": 1.1993087530136108,
"learning_rate": 0.00015807551127425276,
"loss": 0.7733,
"step": 1610
},
{
"epoch": 0.21212170810350753,
"grad_norm": 1.51642906665802,
"learning_rate": 0.00015781331934976406,
"loss": 0.6907,
"step": 1620
},
{
"epoch": 0.21343110136340573,
"grad_norm": 1.3714466094970703,
"learning_rate": 0.00015755112742527532,
"loss": 0.7016,
"step": 1630
},
{
"epoch": 0.21474049462330394,
"grad_norm": 1.2519642114639282,
"learning_rate": 0.0001572889355007866,
"loss": 0.7648,
"step": 1640
},
{
"epoch": 0.21604988788320212,
"grad_norm": 1.3851202726364136,
"learning_rate": 0.00015702674357629786,
"loss": 0.7069,
"step": 1650
},
{
"epoch": 0.21735928114310032,
"grad_norm": 1.334105134010315,
"learning_rate": 0.00015676455165180913,
"loss": 0.7338,
"step": 1660
},
{
"epoch": 0.2186686744029985,
"grad_norm": 1.3785145282745361,
"learning_rate": 0.0001565023597273204,
"loss": 0.6299,
"step": 1670
},
{
"epoch": 0.2199780676628967,
"grad_norm": 1.4771215915679932,
"learning_rate": 0.00015624016780283167,
"loss": 0.6828,
"step": 1680
},
{
"epoch": 0.2212874609227949,
"grad_norm": 1.3885449171066284,
"learning_rate": 0.00015597797587834294,
"loss": 0.7141,
"step": 1690
},
{
"epoch": 0.2225968541826931,
"grad_norm": 1.2664909362792969,
"learning_rate": 0.00015571578395385423,
"loss": 0.7667,
"step": 1700
},
{
"epoch": 0.2239062474425913,
"grad_norm": 1.2576826810836792,
"learning_rate": 0.0001554535920293655,
"loss": 0.7395,
"step": 1710
},
{
"epoch": 0.22521564070248948,
"grad_norm": 1.284826636314392,
"learning_rate": 0.00015519140010487677,
"loss": 0.6832,
"step": 1720
},
{
"epoch": 0.22652503396238768,
"grad_norm": 1.272933006286621,
"learning_rate": 0.00015492920818038804,
"loss": 0.6892,
"step": 1730
},
{
"epoch": 0.22783442722228586,
"grad_norm": 1.3465379476547241,
"learning_rate": 0.0001546670162558993,
"loss": 0.6449,
"step": 1740
},
{
"epoch": 0.22914382048218407,
"grad_norm": 1.2862318754196167,
"learning_rate": 0.00015440482433141058,
"loss": 0.6883,
"step": 1750
},
{
"epoch": 0.23045321374208227,
"grad_norm": 1.2469042539596558,
"learning_rate": 0.00015414263240692188,
"loss": 0.7593,
"step": 1760
},
{
"epoch": 0.23176260700198045,
"grad_norm": 1.5080034732818604,
"learning_rate": 0.00015388044048243315,
"loss": 0.7009,
"step": 1770
},
{
"epoch": 0.23307200026187866,
"grad_norm": 0.9788569211959839,
"learning_rate": 0.00015361824855794441,
"loss": 0.602,
"step": 1780
},
{
"epoch": 0.23438139352177684,
"grad_norm": 1.3450673818588257,
"learning_rate": 0.00015335605663345568,
"loss": 0.6238,
"step": 1790
},
{
"epoch": 0.23569078678167504,
"grad_norm": 1.4177800416946411,
"learning_rate": 0.00015309386470896695,
"loss": 0.6768,
"step": 1800
},
{
"epoch": 0.23700018004157325,
"grad_norm": 1.3528062105178833,
"learning_rate": 0.00015283167278447825,
"loss": 0.6404,
"step": 1810
},
{
"epoch": 0.23830957330147143,
"grad_norm": 1.2898012399673462,
"learning_rate": 0.00015256948085998952,
"loss": 0.6606,
"step": 1820
},
{
"epoch": 0.23961896656136963,
"grad_norm": 1.311298131942749,
"learning_rate": 0.0001523072889355008,
"loss": 0.662,
"step": 1830
},
{
"epoch": 0.2409283598212678,
"grad_norm": 1.6476584672927856,
"learning_rate": 0.00015204509701101206,
"loss": 0.671,
"step": 1840
},
{
"epoch": 0.24223775308116602,
"grad_norm": 1.36719810962677,
"learning_rate": 0.00015178290508652335,
"loss": 0.7097,
"step": 1850
},
{
"epoch": 0.2435471463410642,
"grad_norm": 1.3647184371948242,
"learning_rate": 0.00015152071316203462,
"loss": 0.6604,
"step": 1860
},
{
"epoch": 0.2448565396009624,
"grad_norm": 1.2265934944152832,
"learning_rate": 0.0001512585212375459,
"loss": 0.6272,
"step": 1870
},
{
"epoch": 0.2461659328608606,
"grad_norm": 1.4882850646972656,
"learning_rate": 0.00015099632931305716,
"loss": 0.7007,
"step": 1880
},
{
"epoch": 0.2474753261207588,
"grad_norm": 1.408470869064331,
"learning_rate": 0.00015073413738856843,
"loss": 0.6526,
"step": 1890
},
{
"epoch": 0.248784719380657,
"grad_norm": 1.3388913869857788,
"learning_rate": 0.00015047194546407972,
"loss": 0.6891,
"step": 1900
},
{
"epoch": 0.2500941126405552,
"grad_norm": 1.3725926876068115,
"learning_rate": 0.000150209753539591,
"loss": 0.5763,
"step": 1910
},
{
"epoch": 0.25140350590045335,
"grad_norm": 1.40208899974823,
"learning_rate": 0.00014994756161510226,
"loss": 0.5637,
"step": 1920
},
{
"epoch": 0.25271289916035156,
"grad_norm": 1.8308840990066528,
"learning_rate": 0.00014968536969061353,
"loss": 0.6899,
"step": 1930
},
{
"epoch": 0.25402229242024976,
"grad_norm": 1.4921183586120605,
"learning_rate": 0.0001494231777661248,
"loss": 0.5764,
"step": 1940
},
{
"epoch": 0.25533168568014797,
"grad_norm": 1.5387523174285889,
"learning_rate": 0.0001491609858416361,
"loss": 0.5229,
"step": 1950
},
{
"epoch": 0.2566410789400462,
"grad_norm": 1.3345798254013062,
"learning_rate": 0.00014889879391714737,
"loss": 0.5949,
"step": 1960
},
{
"epoch": 0.2579504721999443,
"grad_norm": 1.682065486907959,
"learning_rate": 0.00014863660199265863,
"loss": 0.5619,
"step": 1970
},
{
"epoch": 0.25925986545984253,
"grad_norm": 1.480276346206665,
"learning_rate": 0.0001483744100681699,
"loss": 0.5473,
"step": 1980
},
{
"epoch": 0.26056925871974074,
"grad_norm": 1.3453810214996338,
"learning_rate": 0.0001481122181436812,
"loss": 0.5603,
"step": 1990
},
{
"epoch": 0.26187865197963894,
"grad_norm": 1.4118777513504028,
"learning_rate": 0.00014785002621919247,
"loss": 0.5543,
"step": 2000
},
{
"epoch": 0.26318804523953715,
"grad_norm": 1.2959351539611816,
"learning_rate": 0.00014758783429470374,
"loss": 0.4962,
"step": 2010
},
{
"epoch": 0.2644974384994353,
"grad_norm": 1.3605815172195435,
"learning_rate": 0.000147325642370215,
"loss": 0.5699,
"step": 2020
},
{
"epoch": 0.2658068317593335,
"grad_norm": 2.086613416671753,
"learning_rate": 0.00014706345044572628,
"loss": 0.565,
"step": 2030
},
{
"epoch": 0.2671162250192317,
"grad_norm": 1.2892887592315674,
"learning_rate": 0.00014680125852123757,
"loss": 0.6062,
"step": 2040
},
{
"epoch": 0.2684256182791299,
"grad_norm": 1.5760036706924438,
"learning_rate": 0.00014653906659674884,
"loss": 0.5642,
"step": 2050
},
{
"epoch": 0.2697350115390281,
"grad_norm": 1.21380615234375,
"learning_rate": 0.0001462768746722601,
"loss": 0.5514,
"step": 2060
},
{
"epoch": 0.2710444047989263,
"grad_norm": 1.4393121004104614,
"learning_rate": 0.00014601468274777138,
"loss": 0.5572,
"step": 2070
},
{
"epoch": 0.2723537980588245,
"grad_norm": 1.2972021102905273,
"learning_rate": 0.00014575249082328265,
"loss": 0.535,
"step": 2080
},
{
"epoch": 0.2736631913187227,
"grad_norm": 1.0208637714385986,
"learning_rate": 0.00014549029889879392,
"loss": 0.5835,
"step": 2090
},
{
"epoch": 0.2749725845786209,
"grad_norm": 1.4418736696243286,
"learning_rate": 0.00014522810697430521,
"loss": 0.4829,
"step": 2100
},
{
"epoch": 0.2762819778385191,
"grad_norm": 1.4326051473617554,
"learning_rate": 0.00014496591504981648,
"loss": 0.4711,
"step": 2110
},
{
"epoch": 0.27759137109841725,
"grad_norm": 1.497841715812683,
"learning_rate": 0.00014470372312532775,
"loss": 0.4935,
"step": 2120
},
{
"epoch": 0.27890076435831546,
"grad_norm": 1.5082463026046753,
"learning_rate": 0.00014444153120083902,
"loss": 0.4979,
"step": 2130
},
{
"epoch": 0.28021015761821366,
"grad_norm": 1.2458934783935547,
"learning_rate": 0.0001441793392763503,
"loss": 0.5644,
"step": 2140
},
{
"epoch": 0.28151955087811187,
"grad_norm": 1.730130910873413,
"learning_rate": 0.00014391714735186156,
"loss": 0.4749,
"step": 2150
},
{
"epoch": 0.2828289441380101,
"grad_norm": 1.2587112188339233,
"learning_rate": 0.00014365495542737283,
"loss": 0.5175,
"step": 2160
},
{
"epoch": 0.2841383373979082,
"grad_norm": 1.431119441986084,
"learning_rate": 0.0001433927635028841,
"loss": 0.5597,
"step": 2170
},
{
"epoch": 0.28544773065780643,
"grad_norm": 1.5383937358856201,
"learning_rate": 0.0001431305715783954,
"loss": 0.5153,
"step": 2180
},
{
"epoch": 0.28675712391770464,
"grad_norm": 1.4311727285385132,
"learning_rate": 0.00014286837965390666,
"loss": 0.5452,
"step": 2190
},
{
"epoch": 0.28806651717760284,
"grad_norm": 1.2555975914001465,
"learning_rate": 0.00014260618772941793,
"loss": 0.4937,
"step": 2200
},
{
"epoch": 0.28937591043750105,
"grad_norm": 1.3781330585479736,
"learning_rate": 0.0001423439958049292,
"loss": 0.4537,
"step": 2210
},
{
"epoch": 0.2906853036973992,
"grad_norm": 1.4810888767242432,
"learning_rate": 0.00014208180388044047,
"loss": 0.396,
"step": 2220
},
{
"epoch": 0.2919946969572974,
"grad_norm": 1.6619911193847656,
"learning_rate": 0.00014181961195595177,
"loss": 0.4756,
"step": 2230
},
{
"epoch": 0.2933040902171956,
"grad_norm": 1.3403065204620361,
"learning_rate": 0.00014155742003146303,
"loss": 0.5157,
"step": 2240
},
{
"epoch": 0.2946134834770938,
"grad_norm": 1.4188278913497925,
"learning_rate": 0.0001412952281069743,
"loss": 0.5237,
"step": 2250
},
{
"epoch": 0.29592287673699197,
"grad_norm": 1.852266550064087,
"learning_rate": 0.00014103303618248557,
"loss": 0.4558,
"step": 2260
},
{
"epoch": 0.2972322699968902,
"grad_norm": 1.3092072010040283,
"learning_rate": 0.00014077084425799687,
"loss": 0.4437,
"step": 2270
},
{
"epoch": 0.2985416632567884,
"grad_norm": 1.4190593957901,
"learning_rate": 0.00014050865233350814,
"loss": 0.4717,
"step": 2280
},
{
"epoch": 0.2998510565166866,
"grad_norm": 1.4562608003616333,
"learning_rate": 0.0001402464604090194,
"loss": 0.4744,
"step": 2290
},
{
"epoch": 0.3011604497765848,
"grad_norm": 1.4576420783996582,
"learning_rate": 0.00013998426848453068,
"loss": 0.4429,
"step": 2300
},
{
"epoch": 0.30246984303648294,
"grad_norm": 1.867145299911499,
"learning_rate": 0.00013972207656004194,
"loss": 0.4881,
"step": 2310
},
{
"epoch": 0.30377923629638115,
"grad_norm": 1.3077807426452637,
"learning_rate": 0.00013945988463555324,
"loss": 0.4067,
"step": 2320
},
{
"epoch": 0.30508862955627936,
"grad_norm": 1.3587473630905151,
"learning_rate": 0.0001391976927110645,
"loss": 0.4428,
"step": 2330
},
{
"epoch": 0.30639802281617756,
"grad_norm": 1.6012579202651978,
"learning_rate": 0.00013893550078657578,
"loss": 0.4572,
"step": 2340
},
{
"epoch": 0.30770741607607577,
"grad_norm": 1.2226955890655518,
"learning_rate": 0.00013867330886208705,
"loss": 0.4117,
"step": 2350
},
{
"epoch": 0.3090168093359739,
"grad_norm": 1.4615281820297241,
"learning_rate": 0.00013841111693759834,
"loss": 0.4561,
"step": 2360
},
{
"epoch": 0.3103262025958721,
"grad_norm": 1.401014804840088,
"learning_rate": 0.0001381489250131096,
"loss": 0.441,
"step": 2370
},
{
"epoch": 0.31163559585577033,
"grad_norm": 1.4875798225402832,
"learning_rate": 0.00013788673308862088,
"loss": 0.3991,
"step": 2380
},
{
"epoch": 0.31294498911566854,
"grad_norm": 1.1867239475250244,
"learning_rate": 0.00013762454116413215,
"loss": 0.4223,
"step": 2390
},
{
"epoch": 0.31425438237556674,
"grad_norm": 1.3172953128814697,
"learning_rate": 0.00013736234923964342,
"loss": 0.4388,
"step": 2400
},
{
"epoch": 0.3155637756354649,
"grad_norm": 1.4044665098190308,
"learning_rate": 0.00013710015731515472,
"loss": 0.4102,
"step": 2410
},
{
"epoch": 0.3168731688953631,
"grad_norm": 1.5709283351898193,
"learning_rate": 0.00013683796539066599,
"loss": 0.4837,
"step": 2420
},
{
"epoch": 0.3181825621552613,
"grad_norm": 1.2237786054611206,
"learning_rate": 0.00013657577346617725,
"loss": 0.4452,
"step": 2430
},
{
"epoch": 0.3194919554151595,
"grad_norm": 1.8869267702102661,
"learning_rate": 0.00013631358154168852,
"loss": 0.4077,
"step": 2440
},
{
"epoch": 0.3208013486750577,
"grad_norm": 1.226117491722107,
"learning_rate": 0.0001360513896171998,
"loss": 0.4109,
"step": 2450
},
{
"epoch": 0.32211074193495587,
"grad_norm": 1.6273385286331177,
"learning_rate": 0.0001357891976927111,
"loss": 0.3596,
"step": 2460
},
{
"epoch": 0.3234201351948541,
"grad_norm": 1.4535574913024902,
"learning_rate": 0.00013552700576822236,
"loss": 0.3996,
"step": 2470
},
{
"epoch": 0.3247295284547523,
"grad_norm": 1.6052360534667969,
"learning_rate": 0.00013526481384373363,
"loss": 0.4082,
"step": 2480
},
{
"epoch": 0.3260389217146505,
"grad_norm": 1.9104530811309814,
"learning_rate": 0.0001350026219192449,
"loss": 0.4089,
"step": 2490
},
{
"epoch": 0.3273483149745487,
"grad_norm": 1.6006613969802856,
"learning_rate": 0.0001347404299947562,
"loss": 0.3848,
"step": 2500
},
{
"epoch": 0.32865770823444684,
"grad_norm": 1.4406352043151855,
"learning_rate": 0.00013447823807026746,
"loss": 0.3926,
"step": 2510
},
{
"epoch": 0.32996710149434505,
"grad_norm": 1.3455756902694702,
"learning_rate": 0.00013421604614577873,
"loss": 0.4203,
"step": 2520
},
{
"epoch": 0.33127649475424326,
"grad_norm": 1.7718679904937744,
"learning_rate": 0.00013395385422129,
"loss": 0.3765,
"step": 2530
},
{
"epoch": 0.33258588801414146,
"grad_norm": 1.410130500793457,
"learning_rate": 0.00013369166229680127,
"loss": 0.3646,
"step": 2540
},
{
"epoch": 0.33389528127403967,
"grad_norm": 1.6361408233642578,
"learning_rate": 0.00013342947037231254,
"loss": 0.3917,
"step": 2550
},
{
"epoch": 0.3352046745339378,
"grad_norm": 1.7627660036087036,
"learning_rate": 0.0001331672784478238,
"loss": 0.367,
"step": 2560
},
{
"epoch": 0.336514067793836,
"grad_norm": 1.2431906461715698,
"learning_rate": 0.00013290508652333508,
"loss": 0.3708,
"step": 2570
},
{
"epoch": 0.33782346105373423,
"grad_norm": 1.4763669967651367,
"learning_rate": 0.00013264289459884634,
"loss": 0.377,
"step": 2580
},
{
"epoch": 0.33913285431363244,
"grad_norm": 2.1701712608337402,
"learning_rate": 0.00013238070267435761,
"loss": 0.344,
"step": 2590
},
{
"epoch": 0.3404422475735306,
"grad_norm": 1.4388126134872437,
"learning_rate": 0.0001321185107498689,
"loss": 0.3556,
"step": 2600
},
{
"epoch": 0.3417516408334288,
"grad_norm": 1.2981114387512207,
"learning_rate": 0.00013185631882538018,
"loss": 0.3272,
"step": 2610
},
{
"epoch": 0.343061034093327,
"grad_norm": 1.539335012435913,
"learning_rate": 0.00013159412690089145,
"loss": 0.4132,
"step": 2620
},
{
"epoch": 0.3443704273532252,
"grad_norm": 1.9272770881652832,
"learning_rate": 0.00013133193497640272,
"loss": 0.4121,
"step": 2630
},
{
"epoch": 0.3456798206131234,
"grad_norm": 1.4415314197540283,
"learning_rate": 0.000131069743051914,
"loss": 0.3595,
"step": 2640
},
{
"epoch": 0.34698921387302156,
"grad_norm": 1.3155860900878906,
"learning_rate": 0.00013080755112742528,
"loss": 0.3611,
"step": 2650
},
{
"epoch": 0.34829860713291977,
"grad_norm": 1.507858157157898,
"learning_rate": 0.00013054535920293655,
"loss": 0.3813,
"step": 2660
},
{
"epoch": 0.349608000392818,
"grad_norm": 1.5444693565368652,
"learning_rate": 0.00013028316727844782,
"loss": 0.3527,
"step": 2670
},
{
"epoch": 0.3509173936527162,
"grad_norm": 1.4008456468582153,
"learning_rate": 0.0001300209753539591,
"loss": 0.3573,
"step": 2680
},
{
"epoch": 0.3522267869126144,
"grad_norm": 1.6443661451339722,
"learning_rate": 0.00012975878342947039,
"loss": 0.3885,
"step": 2690
},
{
"epoch": 0.35353618017251254,
"grad_norm": 1.513431429862976,
"learning_rate": 0.00012949659150498165,
"loss": 0.3332,
"step": 2700
},
{
"epoch": 0.35484557343241074,
"grad_norm": 1.6663899421691895,
"learning_rate": 0.00012923439958049292,
"loss": 0.3769,
"step": 2710
},
{
"epoch": 0.35615496669230895,
"grad_norm": 1.2655925750732422,
"learning_rate": 0.0001289722076560042,
"loss": 0.4177,
"step": 2720
},
{
"epoch": 0.35746435995220716,
"grad_norm": 1.324833869934082,
"learning_rate": 0.00012871001573151546,
"loss": 0.3501,
"step": 2730
},
{
"epoch": 0.35877375321210536,
"grad_norm": 1.4842655658721924,
"learning_rate": 0.00012844782380702676,
"loss": 0.3223,
"step": 2740
},
{
"epoch": 0.3600831464720035,
"grad_norm": 1.4087761640548706,
"learning_rate": 0.00012818563188253803,
"loss": 0.3308,
"step": 2750
},
{
"epoch": 0.3613925397319017,
"grad_norm": 1.7493972778320312,
"learning_rate": 0.0001279234399580493,
"loss": 0.3655,
"step": 2760
},
{
"epoch": 0.3627019329917999,
"grad_norm": 1.4829336404800415,
"learning_rate": 0.00012766124803356056,
"loss": 0.3674,
"step": 2770
},
{
"epoch": 0.36401132625169813,
"grad_norm": 1.39944589138031,
"learning_rate": 0.00012739905610907186,
"loss": 0.3285,
"step": 2780
},
{
"epoch": 0.36532071951159634,
"grad_norm": 1.5995631217956543,
"learning_rate": 0.00012713686418458313,
"loss": 0.3431,
"step": 2790
},
{
"epoch": 0.3666301127714945,
"grad_norm": 1.0113691091537476,
"learning_rate": 0.0001268746722600944,
"loss": 0.3389,
"step": 2800
},
{
"epoch": 0.3679395060313927,
"grad_norm": 1.6544948816299438,
"learning_rate": 0.00012661248033560567,
"loss": 0.323,
"step": 2810
},
{
"epoch": 0.3692488992912909,
"grad_norm": 1.8022606372833252,
"learning_rate": 0.00012635028841111694,
"loss": 0.3777,
"step": 2820
},
{
"epoch": 0.3705582925511891,
"grad_norm": 1.6005665063858032,
"learning_rate": 0.00012608809648662823,
"loss": 0.3482,
"step": 2830
},
{
"epoch": 0.3718676858110873,
"grad_norm": 1.2550064325332642,
"learning_rate": 0.0001258259045621395,
"loss": 0.3288,
"step": 2840
},
{
"epoch": 0.37317707907098546,
"grad_norm": 2.43110728263855,
"learning_rate": 0.00012556371263765077,
"loss": 0.3511,
"step": 2850
},
{
"epoch": 0.37448647233088367,
"grad_norm": 1.5041906833648682,
"learning_rate": 0.00012530152071316204,
"loss": 0.3578,
"step": 2860
},
{
"epoch": 0.3757958655907819,
"grad_norm": 1.6031140089035034,
"learning_rate": 0.0001250393287886733,
"loss": 0.3213,
"step": 2870
},
{
"epoch": 0.3771052588506801,
"grad_norm": 1.025795817375183,
"learning_rate": 0.0001247771368641846,
"loss": 0.3352,
"step": 2880
},
{
"epoch": 0.3784146521105783,
"grad_norm": 1.934812068939209,
"learning_rate": 0.00012451494493969587,
"loss": 0.3365,
"step": 2890
},
{
"epoch": 0.37972404537047644,
"grad_norm": 1.0730398893356323,
"learning_rate": 0.00012425275301520714,
"loss": 0.3365,
"step": 2900
},
{
"epoch": 0.38103343863037464,
"grad_norm": 1.3496712446212769,
"learning_rate": 0.0001239905610907184,
"loss": 0.3548,
"step": 2910
},
{
"epoch": 0.38234283189027285,
"grad_norm": 1.3053911924362183,
"learning_rate": 0.0001237283691662297,
"loss": 0.3563,
"step": 2920
},
{
"epoch": 0.38365222515017106,
"grad_norm": 1.3640882968902588,
"learning_rate": 0.00012346617724174098,
"loss": 0.365,
"step": 2930
},
{
"epoch": 0.38496161841006926,
"grad_norm": 1.3266191482543945,
"learning_rate": 0.00012320398531725225,
"loss": 0.2981,
"step": 2940
},
{
"epoch": 0.3862710116699674,
"grad_norm": 1.32815682888031,
"learning_rate": 0.00012294179339276352,
"loss": 0.3544,
"step": 2950
},
{
"epoch": 0.3875804049298656,
"grad_norm": 1.4236459732055664,
"learning_rate": 0.00012267960146827479,
"loss": 0.3095,
"step": 2960
},
{
"epoch": 0.3888897981897638,
"grad_norm": 1.1536756753921509,
"learning_rate": 0.00012241740954378605,
"loss": 0.3125,
"step": 2970
},
{
"epoch": 0.39019919144966203,
"grad_norm": 1.4237791299819946,
"learning_rate": 0.00012215521761929732,
"loss": 0.3207,
"step": 2980
},
{
"epoch": 0.3915085847095602,
"grad_norm": 1.4023237228393555,
"learning_rate": 0.0001218930256948086,
"loss": 0.3714,
"step": 2990
},
{
"epoch": 0.3928179779694584,
"grad_norm": 1.3556010723114014,
"learning_rate": 0.00012163083377031987,
"loss": 0.3313,
"step": 3000
},
{
"epoch": 0.3941273712293566,
"grad_norm": 1.2301980257034302,
"learning_rate": 0.00012136864184583114,
"loss": 0.3062,
"step": 3010
},
{
"epoch": 0.3954367644892548,
"grad_norm": 1.3532170057296753,
"learning_rate": 0.00012110644992134244,
"loss": 0.2946,
"step": 3020
},
{
"epoch": 0.396746157749153,
"grad_norm": 1.2680764198303223,
"learning_rate": 0.00012084425799685371,
"loss": 0.3005,
"step": 3030
},
{
"epoch": 0.39805555100905116,
"grad_norm": 1.5346810817718506,
"learning_rate": 0.00012058206607236498,
"loss": 0.3363,
"step": 3040
},
{
"epoch": 0.39936494426894936,
"grad_norm": 1.423195242881775,
"learning_rate": 0.00012031987414787625,
"loss": 0.3294,
"step": 3050
},
{
"epoch": 0.40067433752884757,
"grad_norm": 1.599571704864502,
"learning_rate": 0.00012005768222338753,
"loss": 0.3469,
"step": 3060
},
{
"epoch": 0.4019837307887458,
"grad_norm": 1.2103453874588013,
"learning_rate": 0.0001197954902988988,
"loss": 0.2827,
"step": 3070
},
{
"epoch": 0.403293124048644,
"grad_norm": 1.3197276592254639,
"learning_rate": 0.00011953329837441007,
"loss": 0.3194,
"step": 3080
},
{
"epoch": 0.40460251730854213,
"grad_norm": 1.291038990020752,
"learning_rate": 0.00011927110644992135,
"loss": 0.2798,
"step": 3090
},
{
"epoch": 0.40591191056844034,
"grad_norm": 1.1556978225708008,
"learning_rate": 0.00011900891452543262,
"loss": 0.3318,
"step": 3100
},
{
"epoch": 0.40722130382833854,
"grad_norm": 1.3520278930664062,
"learning_rate": 0.0001187467226009439,
"loss": 0.3222,
"step": 3110
},
{
"epoch": 0.40853069708823675,
"grad_norm": 1.0671277046203613,
"learning_rate": 0.00011848453067645517,
"loss": 0.268,
"step": 3120
},
{
"epoch": 0.40984009034813496,
"grad_norm": 1.442131757736206,
"learning_rate": 0.00011822233875196644,
"loss": 0.3028,
"step": 3130
},
{
"epoch": 0.4111494836080331,
"grad_norm": 1.5673497915267944,
"learning_rate": 0.00011796014682747771,
"loss": 0.31,
"step": 3140
},
{
"epoch": 0.4124588768679313,
"grad_norm": 1.2009717226028442,
"learning_rate": 0.00011769795490298898,
"loss": 0.2986,
"step": 3150
},
{
"epoch": 0.4137682701278295,
"grad_norm": 1.2754930257797241,
"learning_rate": 0.00011743576297850027,
"loss": 0.3352,
"step": 3160
},
{
"epoch": 0.4150776633877277,
"grad_norm": 1.6189430952072144,
"learning_rate": 0.00011717357105401154,
"loss": 0.3804,
"step": 3170
},
{
"epoch": 0.41638705664762593,
"grad_norm": 1.6117827892303467,
"learning_rate": 0.00011691137912952281,
"loss": 0.3239,
"step": 3180
},
{
"epoch": 0.4176964499075241,
"grad_norm": 1.7495907545089722,
"learning_rate": 0.00011664918720503408,
"loss": 0.3145,
"step": 3190
},
{
"epoch": 0.4190058431674223,
"grad_norm": 1.2301905155181885,
"learning_rate": 0.00011638699528054538,
"loss": 0.2776,
"step": 3200
},
{
"epoch": 0.4203152364273205,
"grad_norm": 1.3571341037750244,
"learning_rate": 0.00011612480335605665,
"loss": 0.3019,
"step": 3210
},
{
"epoch": 0.4216246296872187,
"grad_norm": 0.9271483421325684,
"learning_rate": 0.00011586261143156792,
"loss": 0.2929,
"step": 3220
},
{
"epoch": 0.4229340229471169,
"grad_norm": 1.294146180152893,
"learning_rate": 0.00011560041950707918,
"loss": 0.3095,
"step": 3230
},
{
"epoch": 0.42424341620701506,
"grad_norm": 1.5177209377288818,
"learning_rate": 0.00011533822758259045,
"loss": 0.2714,
"step": 3240
},
{
"epoch": 0.42555280946691326,
"grad_norm": 1.1218962669372559,
"learning_rate": 0.00011507603565810175,
"loss": 0.282,
"step": 3250
},
{
"epoch": 0.42686220272681147,
"grad_norm": 1.2807728052139282,
"learning_rate": 0.00011481384373361302,
"loss": 0.3461,
"step": 3260
},
{
"epoch": 0.4281715959867097,
"grad_norm": 1.1680692434310913,
"learning_rate": 0.00011455165180912429,
"loss": 0.2842,
"step": 3270
},
{
"epoch": 0.4294809892466079,
"grad_norm": 1.6534638404846191,
"learning_rate": 0.00011428945988463556,
"loss": 0.2774,
"step": 3280
},
{
"epoch": 0.43079038250650603,
"grad_norm": 1.2321938276290894,
"learning_rate": 0.00011402726796014683,
"loss": 0.2841,
"step": 3290
},
{
"epoch": 0.43209977576640424,
"grad_norm": 1.6666522026062012,
"learning_rate": 0.00011376507603565811,
"loss": 0.2993,
"step": 3300
},
{
"epoch": 0.43340916902630244,
"grad_norm": 1.8330938816070557,
"learning_rate": 0.00011350288411116938,
"loss": 0.2834,
"step": 3310
},
{
"epoch": 0.43471856228620065,
"grad_norm": 1.570809245109558,
"learning_rate": 0.00011324069218668065,
"loss": 0.2885,
"step": 3320
},
{
"epoch": 0.4360279555460988,
"grad_norm": 1.4093183279037476,
"learning_rate": 0.00011297850026219192,
"loss": 0.2872,
"step": 3330
},
{
"epoch": 0.437337348805997,
"grad_norm": 0.8298211097717285,
"learning_rate": 0.00011271630833770321,
"loss": 0.2884,
"step": 3340
},
{
"epoch": 0.4386467420658952,
"grad_norm": 1.1143261194229126,
"learning_rate": 0.00011245411641321448,
"loss": 0.279,
"step": 3350
},
{
"epoch": 0.4399561353257934,
"grad_norm": 1.1568537950515747,
"learning_rate": 0.00011219192448872575,
"loss": 0.2724,
"step": 3360
},
{
"epoch": 0.4412655285856916,
"grad_norm": 0.8700618147850037,
"learning_rate": 0.00011192973256423702,
"loss": 0.2563,
"step": 3370
},
{
"epoch": 0.4425749218455898,
"grad_norm": 0.974319577217102,
"learning_rate": 0.00011166754063974829,
"loss": 0.2864,
"step": 3380
},
{
"epoch": 0.443884315105488,
"grad_norm": 0.9288910031318665,
"learning_rate": 0.00011140534871525958,
"loss": 0.2717,
"step": 3390
},
{
"epoch": 0.4451937083653862,
"grad_norm": 1.0942648649215698,
"learning_rate": 0.00011114315679077085,
"loss": 0.2625,
"step": 3400
},
{
"epoch": 0.4465031016252844,
"grad_norm": 1.3224159479141235,
"learning_rate": 0.00011088096486628212,
"loss": 0.2719,
"step": 3410
}
],
"logging_steps": 10,
"max_steps": 7638,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.8131850187780976e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}