{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9966329966329965,
"eval_steps": 500,
"global_step": 555,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0053872053872053875,
"grad_norm": 21.266149520874023,
"learning_rate": 0.0,
"loss": 0.9993,
"step": 1
},
{
"epoch": 0.010774410774410775,
"grad_norm": 21.13385009765625,
"learning_rate": 5.882352941176471e-07,
"loss": 1.0245,
"step": 2
},
{
"epoch": 0.01616161616161616,
"grad_norm": 20.182464599609375,
"learning_rate": 1.1764705882352942e-06,
"loss": 0.9562,
"step": 3
},
{
"epoch": 0.02154882154882155,
"grad_norm": 18.727153778076172,
"learning_rate": 1.7647058823529414e-06,
"loss": 0.9445,
"step": 4
},
{
"epoch": 0.026936026936026935,
"grad_norm": 16.479658126831055,
"learning_rate": 2.3529411764705885e-06,
"loss": 0.9854,
"step": 5
},
{
"epoch": 0.03232323232323232,
"grad_norm": 10.075958251953125,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.7675,
"step": 6
},
{
"epoch": 0.03771043771043771,
"grad_norm": 8.65888500213623,
"learning_rate": 3.529411764705883e-06,
"loss": 0.7297,
"step": 7
},
{
"epoch": 0.0430976430976431,
"grad_norm": 8.33163070678711,
"learning_rate": 4.11764705882353e-06,
"loss": 0.6672,
"step": 8
},
{
"epoch": 0.048484848484848485,
"grad_norm": 7.000586032867432,
"learning_rate": 4.705882352941177e-06,
"loss": 0.6599,
"step": 9
},
{
"epoch": 0.05387205387205387,
"grad_norm": 6.877265930175781,
"learning_rate": 5.294117647058824e-06,
"loss": 0.5728,
"step": 10
},
{
"epoch": 0.05925925925925926,
"grad_norm": 6.868885040283203,
"learning_rate": 5.882352941176471e-06,
"loss": 0.6731,
"step": 11
},
{
"epoch": 0.06464646464646465,
"grad_norm": 6.862372875213623,
"learning_rate": 6.470588235294119e-06,
"loss": 0.7126,
"step": 12
},
{
"epoch": 0.07003367003367003,
"grad_norm": 6.195284843444824,
"learning_rate": 7.058823529411766e-06,
"loss": 0.644,
"step": 13
},
{
"epoch": 0.07542087542087542,
"grad_norm": 6.2631120681762695,
"learning_rate": 7.647058823529411e-06,
"loss": 0.5753,
"step": 14
},
{
"epoch": 0.08080808080808081,
"grad_norm": 5.94320011138916,
"learning_rate": 8.23529411764706e-06,
"loss": 0.6584,
"step": 15
},
{
"epoch": 0.0861952861952862,
"grad_norm": 5.2665205001831055,
"learning_rate": 8.823529411764707e-06,
"loss": 0.6102,
"step": 16
},
{
"epoch": 0.09158249158249158,
"grad_norm": 5.388559341430664,
"learning_rate": 9.411764705882354e-06,
"loss": 0.6127,
"step": 17
},
{
"epoch": 0.09696969696969697,
"grad_norm": 5.109943866729736,
"learning_rate": 1e-05,
"loss": 0.6259,
"step": 18
},
{
"epoch": 0.10235690235690235,
"grad_norm": 4.631857395172119,
"learning_rate": 9.999914754008063e-06,
"loss": 0.6064,
"step": 19
},
{
"epoch": 0.10774410774410774,
"grad_norm": 4.755272388458252,
"learning_rate": 9.999659018938999e-06,
"loss": 0.5934,
"step": 20
},
{
"epoch": 0.11313131313131314,
"grad_norm": 4.383729934692383,
"learning_rate": 9.999232803512967e-06,
"loss": 0.6137,
"step": 21
},
{
"epoch": 0.11851851851851852,
"grad_norm": 4.2614593505859375,
"learning_rate": 9.998636122263227e-06,
"loss": 0.5914,
"step": 22
},
{
"epoch": 0.12390572390572391,
"grad_norm": 4.656721591949463,
"learning_rate": 9.997868995535658e-06,
"loss": 0.599,
"step": 23
},
{
"epoch": 0.1292929292929293,
"grad_norm": 4.374063491821289,
"learning_rate": 9.996931449488046e-06,
"loss": 0.6489,
"step": 24
},
{
"epoch": 0.13468013468013468,
"grad_norm": 4.434129238128662,
"learning_rate": 9.99582351608921e-06,
"loss": 0.5895,
"step": 25
},
{
"epoch": 0.14006734006734006,
"grad_norm": 4.682045936584473,
"learning_rate": 9.994545233117904e-06,
"loss": 0.6253,
"step": 26
},
{
"epoch": 0.14545454545454545,
"grad_norm": 4.347814559936523,
"learning_rate": 9.993096644161526e-06,
"loss": 0.617,
"step": 27
},
{
"epoch": 0.15084175084175083,
"grad_norm": 4.48855447769165,
"learning_rate": 9.991477798614638e-06,
"loss": 0.6468,
"step": 28
},
{
"epoch": 0.15622895622895622,
"grad_norm": 4.433114528656006,
"learning_rate": 9.989688751677277e-06,
"loss": 0.6084,
"step": 29
},
{
"epoch": 0.16161616161616163,
"grad_norm": 3.879382610321045,
"learning_rate": 9.987729564353077e-06,
"loss": 0.5468,
"step": 30
},
{
"epoch": 0.16700336700336701,
"grad_norm": 4.3543009757995605,
"learning_rate": 9.985600303447185e-06,
"loss": 0.6268,
"step": 31
},
{
"epoch": 0.1723905723905724,
"grad_norm": 6.505020618438721,
"learning_rate": 9.98330104156398e-06,
"loss": 0.5947,
"step": 32
},
{
"epoch": 0.17777777777777778,
"grad_norm": 4.169903755187988,
"learning_rate": 9.980831857104612e-06,
"loss": 0.574,
"step": 33
},
{
"epoch": 0.18316498316498317,
"grad_norm": 4.362861633300781,
"learning_rate": 9.978192834264307e-06,
"loss": 0.5851,
"step": 34
},
{
"epoch": 0.18855218855218855,
"grad_norm": 3.7690815925598145,
"learning_rate": 9.975384063029516e-06,
"loss": 0.6023,
"step": 35
},
{
"epoch": 0.19393939393939394,
"grad_norm": 4.335365295410156,
"learning_rate": 9.972405639174833e-06,
"loss": 0.6267,
"step": 36
},
{
"epoch": 0.19932659932659932,
"grad_norm": 4.149550914764404,
"learning_rate": 9.96925766425974e-06,
"loss": 0.599,
"step": 37
},
{
"epoch": 0.2047138047138047,
"grad_norm": 4.021537780761719,
"learning_rate": 9.965940245625131e-06,
"loss": 0.5859,
"step": 38
},
{
"epoch": 0.2101010101010101,
"grad_norm": 4.439505100250244,
"learning_rate": 9.962453496389665e-06,
"loss": 0.5895,
"step": 39
},
{
"epoch": 0.21548821548821548,
"grad_norm": 4.288372039794922,
"learning_rate": 9.958797535445898e-06,
"loss": 0.6212,
"step": 40
},
{
"epoch": 0.22087542087542086,
"grad_norm": 4.0634260177612305,
"learning_rate": 9.95497248745624e-06,
"loss": 0.6061,
"step": 41
},
{
"epoch": 0.22626262626262628,
"grad_norm": 4.286866188049316,
"learning_rate": 9.950978482848694e-06,
"loss": 0.6458,
"step": 42
},
{
"epoch": 0.23164983164983166,
"grad_norm": 3.877549409866333,
"learning_rate": 9.946815657812416e-06,
"loss": 0.5868,
"step": 43
},
{
"epoch": 0.23703703703703705,
"grad_norm": 4.321531295776367,
"learning_rate": 9.94248415429306e-06,
"loss": 0.6158,
"step": 44
},
{
"epoch": 0.24242424242424243,
"grad_norm": 3.8047635555267334,
"learning_rate": 9.937984119987958e-06,
"loss": 0.5437,
"step": 45
},
{
"epoch": 0.24781144781144782,
"grad_norm": 4.01943826675415,
"learning_rate": 9.93331570834106e-06,
"loss": 0.5668,
"step": 46
},
{
"epoch": 0.2531986531986532,
"grad_norm": 4.549412250518799,
"learning_rate": 9.928479078537722e-06,
"loss": 0.6271,
"step": 47
},
{
"epoch": 0.2585858585858586,
"grad_norm": 3.865027904510498,
"learning_rate": 9.923474395499266e-06,
"loss": 0.6187,
"step": 48
},
{
"epoch": 0.26397306397306397,
"grad_norm": 3.9334516525268555,
"learning_rate": 9.91830182987736e-06,
"loss": 0.614,
"step": 49
},
{
"epoch": 0.26936026936026936,
"grad_norm": 3.9490811824798584,
"learning_rate": 9.912961558048196e-06,
"loss": 0.5716,
"step": 50
},
{
"epoch": 0.27474747474747474,
"grad_norm": 3.834277391433716,
"learning_rate": 9.907453762106484e-06,
"loss": 0.5145,
"step": 51
},
{
"epoch": 0.2801346801346801,
"grad_norm": 3.9712698459625244,
"learning_rate": 9.901778629859236e-06,
"loss": 0.627,
"step": 52
},
{
"epoch": 0.2855218855218855,
"grad_norm": 4.146055698394775,
"learning_rate": 9.895936354819362e-06,
"loss": 0.5962,
"step": 53
},
{
"epoch": 0.2909090909090909,
"grad_norm": 4.930230140686035,
"learning_rate": 9.889927136199075e-06,
"loss": 0.5974,
"step": 54
},
{
"epoch": 0.2962962962962963,
"grad_norm": 4.270641803741455,
"learning_rate": 9.883751178903095e-06,
"loss": 0.6245,
"step": 55
},
{
"epoch": 0.30168350168350166,
"grad_norm": 8.589272499084473,
"learning_rate": 9.877408693521664e-06,
"loss": 0.6359,
"step": 56
},
{
"epoch": 0.30707070707070705,
"grad_norm": 8.13204288482666,
"learning_rate": 9.870899896323368e-06,
"loss": 0.6429,
"step": 57
},
{
"epoch": 0.31245791245791243,
"grad_norm": 7.613426208496094,
"learning_rate": 9.864225009247753e-06,
"loss": 0.577,
"step": 58
},
{
"epoch": 0.3178451178451178,
"grad_norm": 4.240153789520264,
"learning_rate": 9.857384259897768e-06,
"loss": 0.6715,
"step": 59
},
{
"epoch": 0.32323232323232326,
"grad_norm": 3.9827535152435303,
"learning_rate": 9.850377881532e-06,
"loss": 0.5256,
"step": 60
},
{
"epoch": 0.32861952861952864,
"grad_norm": 5.192502975463867,
"learning_rate": 9.843206113056715e-06,
"loss": 0.5537,
"step": 61
},
{
"epoch": 0.33400673400673403,
"grad_norm": 3.669801950454712,
"learning_rate": 9.835869199017725e-06,
"loss": 0.6018,
"step": 62
},
{
"epoch": 0.3393939393939394,
"grad_norm": 4.642088890075684,
"learning_rate": 9.828367389592034e-06,
"loss": 0.5001,
"step": 63
},
{
"epoch": 0.3447811447811448,
"grad_norm": 3.983962297439575,
"learning_rate": 9.820700940579312e-06,
"loss": 0.624,
"step": 64
},
{
"epoch": 0.3501683501683502,
"grad_norm": 3.97925066947937,
"learning_rate": 9.812870113393185e-06,
"loss": 0.5945,
"step": 65
},
{
"epoch": 0.35555555555555557,
"grad_norm": 4.082148551940918,
"learning_rate": 9.804875175052304e-06,
"loss": 0.5847,
"step": 66
},
{
"epoch": 0.36094276094276095,
"grad_norm": 3.4573113918304443,
"learning_rate": 9.796716398171248e-06,
"loss": 0.5016,
"step": 67
},
{
"epoch": 0.36632996632996634,
"grad_norm": 3.9368677139282227,
"learning_rate": 9.788394060951228e-06,
"loss": 0.5582,
"step": 68
},
{
"epoch": 0.3717171717171717,
"grad_norm": 3.7513315677642822,
"learning_rate": 9.779908447170602e-06,
"loss": 0.5525,
"step": 69
},
{
"epoch": 0.3771043771043771,
"grad_norm": 3.7674131393432617,
"learning_rate": 9.771259846175195e-06,
"loss": 0.5577,
"step": 70
},
{
"epoch": 0.3824915824915825,
"grad_norm": 3.596757650375366,
"learning_rate": 9.762448552868433e-06,
"loss": 0.553,
"step": 71
},
{
"epoch": 0.3878787878787879,
"grad_norm": 3.4366366863250732,
"learning_rate": 9.753474867701294e-06,
"loss": 0.533,
"step": 72
},
{
"epoch": 0.39326599326599326,
"grad_norm": 3.8846004009246826,
"learning_rate": 9.744339096662056e-06,
"loss": 0.5755,
"step": 73
},
{
"epoch": 0.39865319865319865,
"grad_norm": 3.593231439590454,
"learning_rate": 9.735041551265862e-06,
"loss": 0.5424,
"step": 74
},
{
"epoch": 0.40404040404040403,
"grad_norm": 3.5270259380340576,
"learning_rate": 9.725582548544106e-06,
"loss": 0.5218,
"step": 75
},
{
"epoch": 0.4094276094276094,
"grad_norm": 3.9130117893218994,
"learning_rate": 9.715962411033614e-06,
"loss": 0.5529,
"step": 76
},
{
"epoch": 0.4148148148148148,
"grad_norm": 3.5708324909210205,
"learning_rate": 9.706181466765654e-06,
"loss": 0.5047,
"step": 77
},
{
"epoch": 0.4202020202020202,
"grad_norm": 3.6041488647460938,
"learning_rate": 9.696240049254744e-06,
"loss": 0.4715,
"step": 78
},
{
"epoch": 0.4255892255892256,
"grad_norm": 3.532111644744873,
"learning_rate": 9.686138497487282e-06,
"loss": 0.5443,
"step": 79
},
{
"epoch": 0.43097643097643096,
"grad_norm": 3.3798911571502686,
"learning_rate": 9.675877155909989e-06,
"loss": 0.5196,
"step": 80
},
{
"epoch": 0.43636363636363634,
"grad_norm": 3.576612949371338,
"learning_rate": 9.66545637441816e-06,
"loss": 0.5593,
"step": 81
},
{
"epoch": 0.4417508417508417,
"grad_norm": 3.6367032527923584,
"learning_rate": 9.654876508343739e-06,
"loss": 0.5199,
"step": 82
},
{
"epoch": 0.4471380471380471,
"grad_norm": 4.221003532409668,
"learning_rate": 9.644137918443198e-06,
"loss": 0.5799,
"step": 83
},
{
"epoch": 0.45252525252525255,
"grad_norm": 3.6288747787475586,
"learning_rate": 9.633240970885231e-06,
"loss": 0.5702,
"step": 84
},
{
"epoch": 0.45791245791245794,
"grad_norm": 3.6418979167938232,
"learning_rate": 9.622186037238286e-06,
"loss": 0.5463,
"step": 85
},
{
"epoch": 0.4632996632996633,
"grad_norm": 3.5099191665649414,
"learning_rate": 9.610973494457873e-06,
"loss": 0.5871,
"step": 86
},
{
"epoch": 0.4686868686868687,
"grad_norm": 3.9148519039154053,
"learning_rate": 9.599603724873725e-06,
"loss": 0.6149,
"step": 87
},
{
"epoch": 0.4740740740740741,
"grad_norm": 3.3477306365966797,
"learning_rate": 9.588077116176756e-06,
"loss": 0.5618,
"step": 88
},
{
"epoch": 0.4794612794612795,
"grad_norm": 3.632464647293091,
"learning_rate": 9.576394061405847e-06,
"loss": 0.5747,
"step": 89
},
{
"epoch": 0.48484848484848486,
"grad_norm": 5.160216808319092,
"learning_rate": 9.564554958934432e-06,
"loss": 0.6318,
"step": 90
},
{
"epoch": 0.49023569023569025,
"grad_norm": 3.320161819458008,
"learning_rate": 9.55256021245692e-06,
"loss": 0.5472,
"step": 91
},
{
"epoch": 0.49562289562289563,
"grad_norm": 3.577775716781616,
"learning_rate": 9.540410230974943e-06,
"loss": 0.584,
"step": 92
},
{
"epoch": 0.501010101010101,
"grad_norm": 3.3152377605438232,
"learning_rate": 9.52810542878339e-06,
"loss": 0.5268,
"step": 93
},
{
"epoch": 0.5063973063973064,
"grad_norm": 3.468808889389038,
"learning_rate": 9.515646225456283e-06,
"loss": 0.6323,
"step": 94
},
{
"epoch": 0.5117845117845118,
"grad_norm": 5.061112880706787,
"learning_rate": 9.503033045832484e-06,
"loss": 0.5041,
"step": 95
},
{
"epoch": 0.5171717171717172,
"grad_norm": 5.695023059844971,
"learning_rate": 9.490266320001195e-06,
"loss": 0.5678,
"step": 96
},
{
"epoch": 0.5225589225589226,
"grad_norm": 4.4895920753479,
"learning_rate": 9.4773464832873e-06,
"loss": 0.6127,
"step": 97
},
{
"epoch": 0.5279461279461279,
"grad_norm": 3.6477298736572266,
"learning_rate": 9.464273976236518e-06,
"loss": 0.539,
"step": 98
},
{
"epoch": 0.5333333333333333,
"grad_norm": 5.325118541717529,
"learning_rate": 9.451049244600381e-06,
"loss": 0.5428,
"step": 99
},
{
"epoch": 0.5387205387205387,
"grad_norm": 3.778438091278076,
"learning_rate": 9.437672739321034e-06,
"loss": 0.5781,
"step": 100
},
{
"epoch": 0.5441077441077441,
"grad_norm": 3.363888740539551,
"learning_rate": 9.424144916515863e-06,
"loss": 0.5424,
"step": 101
},
{
"epoch": 0.5494949494949495,
"grad_norm": 3.4057974815368652,
"learning_rate": 9.410466237461937e-06,
"loss": 0.527,
"step": 102
},
{
"epoch": 0.5548821548821549,
"grad_norm": 3.555009126663208,
"learning_rate": 9.396637168580282e-06,
"loss": 0.5645,
"step": 103
},
{
"epoch": 0.5602693602693603,
"grad_norm": 3.691166639328003,
"learning_rate": 9.382658181419977e-06,
"loss": 0.5689,
"step": 104
},
{
"epoch": 0.5656565656565656,
"grad_norm": 3.210749626159668,
"learning_rate": 9.36852975264207e-06,
"loss": 0.4849,
"step": 105
},
{
"epoch": 0.571043771043771,
"grad_norm": 3.507824659347534,
"learning_rate": 9.354252364003334e-06,
"loss": 0.5872,
"step": 106
},
{
"epoch": 0.5764309764309764,
"grad_norm": 3.4085872173309326,
"learning_rate": 9.339826502339828e-06,
"loss": 0.5664,
"step": 107
},
{
"epoch": 0.5818181818181818,
"grad_norm": 3.474592924118042,
"learning_rate": 9.32525265955031e-06,
"loss": 0.5818,
"step": 108
},
{
"epoch": 0.5872053872053872,
"grad_norm": 3.5888025760650635,
"learning_rate": 9.310531332579453e-06,
"loss": 0.567,
"step": 109
},
{
"epoch": 0.5925925925925926,
"grad_norm": 3.412595510482788,
"learning_rate": 9.295663023400907e-06,
"loss": 0.5482,
"step": 110
},
{
"epoch": 0.597979797979798,
"grad_norm": 3.397404193878174,
"learning_rate": 9.280648239000174e-06,
"loss": 0.5572,
"step": 111
},
{
"epoch": 0.6033670033670033,
"grad_norm": 3.6878013610839844,
"learning_rate": 9.265487491357334e-06,
"loss": 0.6044,
"step": 112
},
{
"epoch": 0.6087542087542087,
"grad_norm": 3.4067952632904053,
"learning_rate": 9.250181297429573e-06,
"loss": 0.519,
"step": 113
},
{
"epoch": 0.6141414141414141,
"grad_norm": 3.6102547645568848,
"learning_rate": 9.234730179133564e-06,
"loss": 0.5897,
"step": 114
},
{
"epoch": 0.6195286195286195,
"grad_norm": 3.254011392593384,
"learning_rate": 9.219134663327672e-06,
"loss": 0.5444,
"step": 115
},
{
"epoch": 0.6249158249158249,
"grad_norm": 3.4662082195281982,
"learning_rate": 9.203395281793979e-06,
"loss": 0.5689,
"step": 116
},
{
"epoch": 0.6303030303030303,
"grad_norm": 3.225325345993042,
"learning_rate": 9.187512571220166e-06,
"loss": 0.4967,
"step": 117
},
{
"epoch": 0.6356902356902356,
"grad_norm": 3.3803765773773193,
"learning_rate": 9.171487073181198e-06,
"loss": 0.5245,
"step": 118
},
{
"epoch": 0.641077441077441,
"grad_norm": 3.078711748123169,
"learning_rate": 9.155319334120864e-06,
"loss": 0.4871,
"step": 119
},
{
"epoch": 0.6464646464646465,
"grad_norm": 3.5471031665802,
"learning_rate": 9.139009905333147e-06,
"loss": 0.5674,
"step": 120
},
{
"epoch": 0.6518518518518519,
"grad_norm": 3.0351247787475586,
"learning_rate": 9.122559342943423e-06,
"loss": 0.4854,
"step": 121
},
{
"epoch": 0.6572390572390573,
"grad_norm": 3.3814985752105713,
"learning_rate": 9.105968207889493e-06,
"loss": 0.5141,
"step": 122
},
{
"epoch": 0.6626262626262627,
"grad_norm": 3.2874019145965576,
"learning_rate": 9.089237065902464e-06,
"loss": 0.5255,
"step": 123
},
{
"epoch": 0.6680134680134681,
"grad_norm": 3.173571825027466,
"learning_rate": 9.072366487487451e-06,
"loss": 0.5269,
"step": 124
},
{
"epoch": 0.6734006734006734,
"grad_norm": 3.3994832038879395,
"learning_rate": 9.055357047904133e-06,
"loss": 0.5768,
"step": 125
},
{
"epoch": 0.6787878787878788,
"grad_norm": 3.376079797744751,
"learning_rate": 9.038209327147134e-06,
"loss": 0.6,
"step": 126
},
{
"epoch": 0.6841750841750842,
"grad_norm": 3.5709731578826904,
"learning_rate": 9.020923909926233e-06,
"loss": 0.6137,
"step": 127
},
{
"epoch": 0.6895622895622896,
"grad_norm": 3.0871469974517822,
"learning_rate": 9.00350138564645e-06,
"loss": 0.5537,
"step": 128
},
{
"epoch": 0.694949494949495,
"grad_norm": 2.978905200958252,
"learning_rate": 8.985942348387926e-06,
"loss": 0.4888,
"step": 129
},
{
"epoch": 0.7003367003367004,
"grad_norm": 3.196749687194824,
"learning_rate": 8.968247396885685e-06,
"loss": 0.5279,
"step": 130
},
{
"epoch": 0.7057239057239058,
"grad_norm": 3.2792575359344482,
"learning_rate": 8.950417134509201e-06,
"loss": 0.5749,
"step": 131
},
{
"epoch": 0.7111111111111111,
"grad_norm": 3.157092332839966,
"learning_rate": 8.932452169241838e-06,
"loss": 0.619,
"step": 132
},
{
"epoch": 0.7164983164983165,
"grad_norm": 3.2496225833892822,
"learning_rate": 8.914353113660107e-06,
"loss": 0.5495,
"step": 133
},
{
"epoch": 0.7218855218855219,
"grad_norm": 3.2431371212005615,
"learning_rate": 8.89612058491279e-06,
"loss": 0.5297,
"step": 134
},
{
"epoch": 0.7272727272727273,
"grad_norm": 3.2148752212524414,
"learning_rate": 8.877755204699883e-06,
"loss": 0.5175,
"step": 135
},
{
"epoch": 0.7326599326599327,
"grad_norm": 3.1605641841888428,
"learning_rate": 8.859257599251408e-06,
"loss": 0.5848,
"step": 136
},
{
"epoch": 0.7380471380471381,
"grad_norm": 3.1001222133636475,
"learning_rate": 8.840628399306056e-06,
"loss": 0.539,
"step": 137
},
{
"epoch": 0.7434343434343434,
"grad_norm": 3.3802716732025146,
"learning_rate": 8.821868240089676e-06,
"loss": 0.5782,
"step": 138
},
{
"epoch": 0.7488215488215488,
"grad_norm": 3.0083656311035156,
"learning_rate": 8.802977761293625e-06,
"loss": 0.5314,
"step": 139
},
{
"epoch": 0.7542087542087542,
"grad_norm": 3.2978479862213135,
"learning_rate": 8.783957607052941e-06,
"loss": 0.548,
"step": 140
},
{
"epoch": 0.7595959595959596,
"grad_norm": 3.177548885345459,
"learning_rate": 8.764808425924392e-06,
"loss": 0.4653,
"step": 141
},
{
"epoch": 0.764983164983165,
"grad_norm": 3.2603986263275146,
"learning_rate": 8.745530870864351e-06,
"loss": 0.5768,
"step": 142
},
{
"epoch": 0.7703703703703704,
"grad_norm": 3.4270477294921875,
"learning_rate": 8.726125599206543e-06,
"loss": 0.5426,
"step": 143
},
{
"epoch": 0.7757575757575758,
"grad_norm": 3.006866693496704,
"learning_rate": 8.706593272639616e-06,
"loss": 0.5038,
"step": 144
},
{
"epoch": 0.7811447811447811,
"grad_norm": 3.9326441287994385,
"learning_rate": 8.686934557184594e-06,
"loss": 0.618,
"step": 145
},
{
"epoch": 0.7865319865319865,
"grad_norm": 3.3260936737060547,
"learning_rate": 8.667150123172159e-06,
"loss": 0.5245,
"step": 146
},
{
"epoch": 0.7919191919191919,
"grad_norm": 3.189055919647217,
"learning_rate": 8.647240645219787e-06,
"loss": 0.5403,
"step": 147
},
{
"epoch": 0.7973063973063973,
"grad_norm": 3.107164144515991,
"learning_rate": 8.62720680220876e-06,
"loss": 0.5292,
"step": 148
},
{
"epoch": 0.8026936026936027,
"grad_norm": 3.372941493988037,
"learning_rate": 8.607049277261005e-06,
"loss": 0.5486,
"step": 149
},
{
"epoch": 0.8080808080808081,
"grad_norm": 3.3730578422546387,
"learning_rate": 8.586768757715806e-06,
"loss": 0.5845,
"step": 150
},
{
"epoch": 0.8134680134680135,
"grad_norm": 3.1509501934051514,
"learning_rate": 8.566365935106367e-06,
"loss": 0.5266,
"step": 151
},
{
"epoch": 0.8188552188552188,
"grad_norm": 3.464965581893921,
"learning_rate": 8.545841505136224e-06,
"loss": 0.5701,
"step": 152
},
{
"epoch": 0.8242424242424242,
"grad_norm": 3.0586905479431152,
"learning_rate": 8.525196167655539e-06,
"loss": 0.4934,
"step": 153
},
{
"epoch": 0.8296296296296296,
"grad_norm": 3.1889281272888184,
"learning_rate": 8.504430626637215e-06,
"loss": 0.5937,
"step": 154
},
{
"epoch": 0.835016835016835,
"grad_norm": 3.2143123149871826,
"learning_rate": 8.483545590152915e-06,
"loss": 0.5358,
"step": 155
},
{
"epoch": 0.8404040404040404,
"grad_norm": 3.3132236003875732,
"learning_rate": 8.462541770348896e-06,
"loss": 0.5258,
"step": 156
},
{
"epoch": 0.8457912457912458,
"grad_norm": 3.310232400894165,
"learning_rate": 8.441419883421742e-06,
"loss": 0.5908,
"step": 157
},
{
"epoch": 0.8511784511784511,
"grad_norm": 3.13468599319458,
"learning_rate": 8.42018064959393e-06,
"loss": 0.4796,
"step": 158
},
{
"epoch": 0.8565656565656565,
"grad_norm": 3.0902316570281982,
"learning_rate": 8.398824793089287e-06,
"loss": 0.5082,
"step": 159
},
{
"epoch": 0.8619528619528619,
"grad_norm": 3.193399429321289,
"learning_rate": 8.377353042108278e-06,
"loss": 0.5388,
"step": 160
},
{
"epoch": 0.8673400673400673,
"grad_norm": 3.0939056873321533,
"learning_rate": 8.355766128803192e-06,
"loss": 0.4641,
"step": 161
},
{
"epoch": 0.8727272727272727,
"grad_norm": 3.229541540145874,
"learning_rate": 8.334064789253157e-06,
"loss": 0.5247,
"step": 162
},
{
"epoch": 0.8781144781144781,
"grad_norm": 3.2554848194122314,
"learning_rate": 8.312249763439066e-06,
"loss": 0.5491,
"step": 163
},
{
"epoch": 0.8835016835016835,
"grad_norm": 3.2184009552001953,
"learning_rate": 8.29032179521832e-06,
"loss": 0.6099,
"step": 164
},
{
"epoch": 0.8888888888888888,
"grad_norm": 3.2048168182373047,
"learning_rate": 8.268281632299483e-06,
"loss": 0.4963,
"step": 165
},
{
"epoch": 0.8942760942760942,
"grad_norm": 3.0308377742767334,
"learning_rate": 8.246130026216777e-06,
"loss": 0.5222,
"step": 166
},
{
"epoch": 0.8996632996632996,
"grad_norm": 3.189265012741089,
"learning_rate": 8.22386773230445e-06,
"loss": 0.4913,
"step": 167
},
{
"epoch": 0.9050505050505051,
"grad_norm": 3.2512941360473633,
"learning_rate": 8.201495509671036e-06,
"loss": 0.5717,
"step": 168
},
{
"epoch": 0.9104377104377105,
"grad_norm": 3.2874414920806885,
"learning_rate": 8.179014121173461e-06,
"loss": 0.5334,
"step": 169
},
{
"epoch": 0.9158249158249159,
"grad_norm": 3.307884931564331,
"learning_rate": 8.156424333391026e-06,
"loss": 0.5617,
"step": 170
},
{
"epoch": 0.9212121212121213,
"grad_norm": 3.0463500022888184,
"learning_rate": 8.13372691659928e-06,
"loss": 0.5305,
"step": 171
},
{
"epoch": 0.9265993265993266,
"grad_norm": 3.3068511486053467,
"learning_rate": 8.110922644743747e-06,
"loss": 0.549,
"step": 172
},
{
"epoch": 0.931986531986532,
"grad_norm": 3.1428866386413574,
"learning_rate": 8.088012295413536e-06,
"loss": 0.4856,
"step": 173
},
{
"epoch": 0.9373737373737374,
"grad_norm": 2.96205997467041,
"learning_rate": 8.064996649814826e-06,
"loss": 0.4599,
"step": 174
},
{
"epoch": 0.9427609427609428,
"grad_norm": 3.3233330249786377,
"learning_rate": 8.041876492744239e-06,
"loss": 0.5505,
"step": 175
},
{
"epoch": 0.9481481481481482,
"grad_norm": 3.2451870441436768,
"learning_rate": 8.018652612562061e-06,
"loss": 0.4739,
"step": 176
},
{
"epoch": 0.9535353535353536,
"grad_norm": 3.231306791305542,
"learning_rate": 7.99532580116537e-06,
"loss": 0.5119,
"step": 177
},
{
"epoch": 0.958922558922559,
"grad_norm": 3.147303342819214,
"learning_rate": 7.971896853961043e-06,
"loss": 0.496,
"step": 178
},
{
"epoch": 0.9643097643097643,
"grad_norm": 3.530423641204834,
"learning_rate": 7.948366569838612e-06,
"loss": 0.6025,
"step": 179
},
{
"epoch": 0.9696969696969697,
"grad_norm": 3.5202131271362305,
"learning_rate": 7.924735751143044e-06,
"loss": 0.4822,
"step": 180
},
{
"epoch": 0.9750841750841751,
"grad_norm": 3.288405656814575,
"learning_rate": 7.901005203647373e-06,
"loss": 0.5393,
"step": 181
},
{
"epoch": 0.9804713804713805,
"grad_norm": 3.291487693786621,
"learning_rate": 7.877175736525217e-06,
"loss": 0.6146,
"step": 182
},
{
"epoch": 0.9858585858585859,
"grad_norm": 2.933931350708008,
"learning_rate": 7.853248162323208e-06,
"loss": 0.4874,
"step": 183
},
{
"epoch": 0.9912457912457913,
"grad_norm": 3.0823869705200195,
"learning_rate": 7.829223296933259e-06,
"loss": 0.5756,
"step": 184
},
{
"epoch": 0.9966329966329966,
"grad_norm": 2.960385799407959,
"learning_rate": 7.805101959564768e-06,
"loss": 0.4738,
"step": 185
},
{
"epoch": 1.0053872053872055,
"grad_norm": 5.235624313354492,
"learning_rate": 7.780884972716663e-06,
"loss": 0.8368,
"step": 186
},
{
"epoch": 1.0107744107744108,
"grad_norm": 2.6471757888793945,
"learning_rate": 7.75657316214937e-06,
"loss": 0.2894,
"step": 187
},
{
"epoch": 1.0161616161616163,
"grad_norm": 2.7157034873962402,
"learning_rate": 7.732167356856656e-06,
"loss": 0.3068,
"step": 188
},
{
"epoch": 1.0215488215488215,
"grad_norm": 2.7159922122955322,
"learning_rate": 7.70766838903735e-06,
"loss": 0.3193,
"step": 189
},
{
"epoch": 1.026936026936027,
"grad_norm": 2.4954445362091064,
"learning_rate": 7.683077094066981e-06,
"loss": 0.2827,
"step": 190
},
{
"epoch": 1.0323232323232323,
"grad_norm": 2.3092992305755615,
"learning_rate": 7.65839431046928e-06,
"loss": 0.253,
"step": 191
},
{
"epoch": 1.0377104377104378,
"grad_norm": 2.994446039199829,
"learning_rate": 7.63362087988759e-06,
"loss": 0.2969,
"step": 192
},
{
"epoch": 1.043097643097643,
"grad_norm": 2.7718987464904785,
"learning_rate": 7.608757647056186e-06,
"loss": 0.2913,
"step": 193
},
{
"epoch": 1.0484848484848486,
"grad_norm": 2.769294500350952,
"learning_rate": 7.583805459771443e-06,
"loss": 0.2704,
"step": 194
},
{
"epoch": 1.0538720538720538,
"grad_norm": 3.459955930709839,
"learning_rate": 7.5587651688629405e-06,
"loss": 0.3051,
"step": 195
},
{
"epoch": 1.0592592592592593,
"grad_norm": 3.5698342323303223,
"learning_rate": 7.533637628164456e-06,
"loss": 0.2757,
"step": 196
},
{
"epoch": 1.0646464646464646,
"grad_norm": 3.165423631668091,
"learning_rate": 7.508423694484841e-06,
"loss": 0.2811,
"step": 197
},
{
"epoch": 1.0700336700336701,
"grad_norm": 3.1055243015289307,
"learning_rate": 7.483124227578811e-06,
"loss": 0.2594,
"step": 198
},
{
"epoch": 1.0754208754208754,
"grad_norm": 3.1683449745178223,
"learning_rate": 7.457740090117627e-06,
"loss": 0.3102,
"step": 199
},
{
"epoch": 1.0808080808080809,
"grad_norm": 3.477832317352295,
"learning_rate": 7.432272147659678e-06,
"loss": 0.3035,
"step": 200
},
{
"epoch": 1.0861952861952862,
"grad_norm": 2.9576289653778076,
"learning_rate": 7.406721268620975e-06,
"loss": 0.2653,
"step": 201
},
{
"epoch": 1.0915824915824917,
"grad_norm": 2.561279773712158,
"learning_rate": 7.381088324245526e-06,
"loss": 0.2485,
"step": 202
},
{
"epoch": 1.096969696969697,
"grad_norm": 3.1348936557769775,
"learning_rate": 7.355374188575639e-06,
"loss": 0.2715,
"step": 203
},
{
"epoch": 1.1023569023569024,
"grad_norm": 2.7675235271453857,
"learning_rate": 7.3295797384221156e-06,
"loss": 0.2805,
"step": 204
},
{
"epoch": 1.1077441077441077,
"grad_norm": 3.367643117904663,
"learning_rate": 7.303705853334353e-06,
"loss": 0.2897,
"step": 205
},
{
"epoch": 1.1131313131313132,
"grad_norm": 3.007518768310547,
"learning_rate": 7.277753415570349e-06,
"loss": 0.2699,
"step": 206
},
{
"epoch": 1.1185185185185185,
"grad_norm": 2.9317398071289062,
"learning_rate": 7.2517233100666255e-06,
"loss": 0.3282,
"step": 207
},
{
"epoch": 1.123905723905724,
"grad_norm": 3.2546324729919434,
"learning_rate": 7.225616424408045e-06,
"loss": 0.2947,
"step": 208
},
{
"epoch": 1.1292929292929292,
"grad_norm": 2.954130172729492,
"learning_rate": 7.199433648797558e-06,
"loss": 0.2994,
"step": 209
},
{
"epoch": 1.1346801346801347,
"grad_norm": 2.7771804332733154,
"learning_rate": 7.1731758760258315e-06,
"loss": 0.229,
"step": 210
},
{
"epoch": 1.14006734006734,
"grad_norm": 2.771481990814209,
"learning_rate": 7.146844001440823e-06,
"loss": 0.2725,
"step": 211
},
{
"epoch": 1.1454545454545455,
"grad_norm": 2.742431402206421,
"learning_rate": 7.120438922917237e-06,
"loss": 0.2514,
"step": 212
},
{
"epoch": 1.1508417508417508,
"grad_norm": 2.6713271141052246,
"learning_rate": 7.09396154082592e-06,
"loss": 0.2485,
"step": 213
},
{
"epoch": 1.1562289562289563,
"grad_norm": 2.492274284362793,
"learning_rate": 7.067412758003154e-06,
"loss": 0.2278,
"step": 214
},
{
"epoch": 1.1616161616161615,
"grad_norm": 2.8618505001068115,
"learning_rate": 7.040793479719864e-06,
"loss": 0.2854,
"step": 215
},
{
"epoch": 1.167003367003367,
"grad_norm": 2.6601178646087646,
"learning_rate": 7.014104613650767e-06,
"loss": 0.2966,
"step": 216
},
{
"epoch": 1.1723905723905723,
"grad_norm": 3.3377082347869873,
"learning_rate": 6.987347069843406e-06,
"loss": 0.3149,
"step": 217
},
{
"epoch": 1.1777777777777778,
"grad_norm": 2.778550863265991,
"learning_rate": 6.96052176068713e-06,
"loss": 0.2543,
"step": 218
},
{
"epoch": 1.183164983164983,
"grad_norm": 3.040800094604492,
"learning_rate": 6.93362960088197e-06,
"loss": 0.2438,
"step": 219
},
{
"epoch": 1.1885521885521886,
"grad_norm": 2.9394142627716064,
"learning_rate": 6.906671507407463e-06,
"loss": 0.2304,
"step": 220
},
{
"epoch": 1.1939393939393939,
"grad_norm": 2.869964122772217,
"learning_rate": 6.879648399491376e-06,
"loss": 0.2984,
"step": 221
},
{
"epoch": 1.1993265993265994,
"grad_norm": 2.9759936332702637,
"learning_rate": 6.852561198578364e-06,
"loss": 0.2603,
"step": 222
},
{
"epoch": 1.2047138047138046,
"grad_norm": 3.310718297958374,
"learning_rate": 6.825410828298552e-06,
"loss": 0.2233,
"step": 223
},
{
"epoch": 1.2101010101010101,
"grad_norm": 2.7231340408325195,
"learning_rate": 6.79819821443604e-06,
"loss": 0.2124,
"step": 224
},
{
"epoch": 1.2154882154882154,
"grad_norm": 2.9152019023895264,
"learning_rate": 6.7709242848973326e-06,
"loss": 0.2757,
"step": 225
},
{
"epoch": 1.220875420875421,
"grad_norm": 2.9841840267181396,
"learning_rate": 6.743589969679697e-06,
"loss": 0.2853,
"step": 226
},
{
"epoch": 1.2262626262626264,
"grad_norm": 3.3108832836151123,
"learning_rate": 6.716196200839465e-06,
"loss": 0.2608,
"step": 227
},
{
"epoch": 1.2316498316498317,
"grad_norm": 2.9652819633483887,
"learning_rate": 6.6887439124602295e-06,
"loss": 0.2598,
"step": 228
},
{
"epoch": 1.237037037037037,
"grad_norm": 2.812822103500366,
"learning_rate": 6.661234040621017e-06,
"loss": 0.2638,
"step": 229
},
{
"epoch": 1.2424242424242424,
"grad_norm": 3.03281831741333,
"learning_rate": 6.63366752336435e-06,
"loss": 0.2439,
"step": 230
},
{
"epoch": 1.247811447811448,
"grad_norm": 2.7430481910705566,
"learning_rate": 6.606045300664272e-06,
"loss": 0.2502,
"step": 231
},
{
"epoch": 1.2531986531986532,
"grad_norm": 3.0615146160125732,
"learning_rate": 6.578368314394293e-06,
"loss": 0.2494,
"step": 232
},
{
"epoch": 1.2585858585858585,
"grad_norm": 2.689999580383301,
"learning_rate": 6.550637508295272e-06,
"loss": 0.2309,
"step": 233
},
{
"epoch": 1.263973063973064,
"grad_norm": 3.2054049968719482,
"learning_rate": 6.52285382794324e-06,
"loss": 0.2942,
"step": 234
},
{
"epoch": 1.2693602693602695,
"grad_norm": 2.9260945320129395,
"learning_rate": 6.49501822071715e-06,
"loss": 0.2861,
"step": 235
},
{
"epoch": 1.2747474747474747,
"grad_norm": 3.240046262741089,
"learning_rate": 6.467131635766585e-06,
"loss": 0.2949,
"step": 236
},
{
"epoch": 1.28013468013468,
"grad_norm": 2.6747567653656006,
"learning_rate": 6.439195023979381e-06,
"loss": 0.2851,
"step": 237
},
{
"epoch": 1.2855218855218855,
"grad_norm": 3.605665445327759,
"learning_rate": 6.411209337949214e-06,
"loss": 0.3156,
"step": 238
},
{
"epoch": 1.290909090909091,
"grad_norm": 3.0257179737091064,
"learning_rate": 6.383175531943106e-06,
"loss": 0.2481,
"step": 239
},
{
"epoch": 1.2962962962962963,
"grad_norm": 3.004091739654541,
"learning_rate": 6.355094561868902e-06,
"loss": 0.2608,
"step": 240
},
{
"epoch": 1.3016835016835016,
"grad_norm": 2.927186965942383,
"learning_rate": 6.3269673852426575e-06,
"loss": 0.2298,
"step": 241
},
{
"epoch": 1.307070707070707,
"grad_norm": 2.5807888507843018,
"learning_rate": 6.298794961156004e-06,
"loss": 0.2263,
"step": 242
},
{
"epoch": 1.3124579124579125,
"grad_norm": 2.7014336585998535,
"learning_rate": 6.270578250243437e-06,
"loss": 0.2931,
"step": 243
},
{
"epoch": 1.3178451178451178,
"grad_norm": 3.1106925010681152,
"learning_rate": 6.242318214649556e-06,
"loss": 0.2789,
"step": 244
},
{
"epoch": 1.3232323232323233,
"grad_norm": 2.7850258350372314,
"learning_rate": 6.214015817996273e-06,
"loss": 0.3062,
"step": 245
},
{
"epoch": 1.3286195286195286,
"grad_norm": 2.841632127761841,
"learning_rate": 6.185672025349936e-06,
"loss": 0.2595,
"step": 246
},
{
"epoch": 1.334006734006734,
"grad_norm": 2.757871150970459,
"learning_rate": 6.157287803188432e-06,
"loss": 0.2408,
"step": 247
},
{
"epoch": 1.3393939393939394,
"grad_norm": 2.7471070289611816,
"learning_rate": 6.128864119368234e-06,
"loss": 0.2618,
"step": 248
},
{
"epoch": 1.3447811447811449,
"grad_norm": 3.062896490097046,
"learning_rate": 6.100401943091386e-06,
"loss": 0.2893,
"step": 249
},
{
"epoch": 1.3501683501683501,
"grad_norm": 2.937164068222046,
"learning_rate": 6.0719022448724705e-06,
"loss": 0.2735,
"step": 250
},
{
"epoch": 1.3555555555555556,
"grad_norm": 3.1469810009002686,
"learning_rate": 6.043365996505506e-06,
"loss": 0.3295,
"step": 251
},
{
"epoch": 1.360942760942761,
"grad_norm": 2.82350754737854,
"learning_rate": 6.014794171030811e-06,
"loss": 0.2778,
"step": 252
},
{
"epoch": 1.3663299663299664,
"grad_norm": 3.0384979248046875,
"learning_rate": 5.986187742701825e-06,
"loss": 0.2678,
"step": 253
},
{
"epoch": 1.3717171717171717,
"grad_norm": 2.782715082168579,
"learning_rate": 5.9575476869518945e-06,
"loss": 0.2664,
"step": 254
},
{
"epoch": 1.3771043771043772,
"grad_norm": 2.811166763305664,
"learning_rate": 5.928874980361005e-06,
"loss": 0.2387,
"step": 255
},
{
"epoch": 1.3824915824915824,
"grad_norm": 2.939649820327759,
"learning_rate": 5.900170600622477e-06,
"loss": 0.2957,
"step": 256
},
{
"epoch": 1.387878787878788,
"grad_norm": 3.0286529064178467,
"learning_rate": 5.871435526509647e-06,
"loss": 0.2937,
"step": 257
},
{
"epoch": 1.3932659932659932,
"grad_norm": 2.7286617755889893,
"learning_rate": 5.8426707378424675e-06,
"loss": 0.2543,
"step": 258
},
{
"epoch": 1.3986531986531987,
"grad_norm": 2.7167487144470215,
"learning_rate": 5.813877215454118e-06,
"loss": 0.2296,
"step": 259
},
{
"epoch": 1.404040404040404,
"grad_norm": 2.8393452167510986,
"learning_rate": 5.78505594115755e-06,
"loss": 0.2708,
"step": 260
},
{
"epoch": 1.4094276094276095,
"grad_norm": 2.903613567352295,
"learning_rate": 5.756207897712011e-06,
"loss": 0.267,
"step": 261
},
{
"epoch": 1.4148148148148147,
"grad_norm": 2.823423147201538,
"learning_rate": 5.727334068789529e-06,
"loss": 0.2774,
"step": 262
},
{
"epoch": 1.4202020202020202,
"grad_norm": 2.7938835620880127,
"learning_rate": 5.698435438941382e-06,
"loss": 0.2474,
"step": 263
},
{
"epoch": 1.4255892255892255,
"grad_norm": 2.9996232986450195,
"learning_rate": 5.669512993564517e-06,
"loss": 0.3086,
"step": 264
},
{
"epoch": 1.430976430976431,
"grad_norm": 3.045121908187866,
"learning_rate": 5.640567718867951e-06,
"loss": 0.2617,
"step": 265
},
{
"epoch": 1.4363636363636363,
"grad_norm": 2.7745134830474854,
"learning_rate": 5.611600601839144e-06,
"loss": 0.2604,
"step": 266
},
{
"epoch": 1.4417508417508418,
"grad_norm": 2.8739848136901855,
"learning_rate": 5.582612630210349e-06,
"loss": 0.2774,
"step": 267
},
{
"epoch": 1.447138047138047,
"grad_norm": 2.740999221801758,
"learning_rate": 5.553604792424923e-06,
"loss": 0.2341,
"step": 268
},
{
"epoch": 1.4525252525252526,
"grad_norm": 2.991398572921753,
"learning_rate": 5.524578077603627e-06,
"loss": 0.2299,
"step": 269
},
{
"epoch": 1.457912457912458,
"grad_norm": 2.636726140975952,
"learning_rate": 5.495533475510901e-06,
"loss": 0.2472,
"step": 270
},
{
"epoch": 1.4632996632996633,
"grad_norm": 3.0140764713287354,
"learning_rate": 5.4664719765211125e-06,
"loss": 0.2597,
"step": 271
},
{
"epoch": 1.4686868686868686,
"grad_norm": 2.988635778427124,
"learning_rate": 5.4373945715847845e-06,
"loss": 0.2939,
"step": 272
},
{
"epoch": 1.474074074074074,
"grad_norm": 3.1995465755462646,
"learning_rate": 5.408302252194806e-06,
"loss": 0.2678,
"step": 273
},
{
"epoch": 1.4794612794612796,
"grad_norm": 2.9540798664093018,
"learning_rate": 5.379196010352629e-06,
"loss": 0.3033,
"step": 274
},
{
"epoch": 1.4848484848484849,
"grad_norm": 3.282701253890991,
"learning_rate": 5.3500768385344345e-06,
"loss": 0.2588,
"step": 275
},
{
"epoch": 1.4902356902356901,
"grad_norm": 2.9532341957092285,
"learning_rate": 5.320945729657299e-06,
"loss": 0.289,
"step": 276
},
{
"epoch": 1.4956228956228956,
"grad_norm": 2.660553455352783,
"learning_rate": 5.2918036770453285e-06,
"loss": 0.2653,
"step": 277
},
{
"epoch": 1.5010101010101011,
"grad_norm": 2.7580904960632324,
"learning_rate": 5.262651674395799e-06,
"loss": 0.2585,
"step": 278
},
{
"epoch": 1.5063973063973064,
"grad_norm": 2.7895712852478027,
"learning_rate": 5.2334907157452605e-06,
"loss": 0.2425,
"step": 279
},
{
"epoch": 1.5117845117845117,
"grad_norm": 2.852928876876831,
"learning_rate": 5.204321795435656e-06,
"loss": 0.2702,
"step": 280
},
{
"epoch": 1.5171717171717172,
"grad_norm": 3.042116403579712,
"learning_rate": 5.1751459080803986e-06,
"loss": 0.2615,
"step": 281
},
{
"epoch": 1.5225589225589227,
"grad_norm": 2.737823724746704,
"learning_rate": 5.145964048530475e-06,
"loss": 0.2695,
"step": 282
},
{
"epoch": 1.527946127946128,
"grad_norm": 2.6959354877471924,
"learning_rate": 5.11677721184051e-06,
"loss": 0.2595,
"step": 283
},
{
"epoch": 1.5333333333333332,
"grad_norm": 3.073336601257324,
"learning_rate": 5.08758639323484e-06,
"loss": 0.249,
"step": 284
},
{
"epoch": 1.5387205387205387,
"grad_norm": 2.496995449066162,
"learning_rate": 5.058392588073583e-06,
"loss": 0.2409,
"step": 285
},
{
"epoch": 1.5441077441077442,
"grad_norm": 2.8654353618621826,
"learning_rate": 5.029196791818688e-06,
"loss": 0.2428,
"step": 286
},
{
"epoch": 1.5494949494949495,
"grad_norm": 2.753993034362793,
"learning_rate": 5e-06,
"loss": 0.2768,
"step": 287
},
{
"epoch": 1.5548821548821548,
"grad_norm": 2.972564220428467,
"learning_rate": 4.970803208181315e-06,
"loss": 0.2451,
"step": 288
},
{
"epoch": 1.5602693602693603,
"grad_norm": 3.036773681640625,
"learning_rate": 4.941607411926419e-06,
"loss": 0.2642,
"step": 289
},
{
"epoch": 1.5656565656565657,
"grad_norm": 3.0601320266723633,
"learning_rate": 4.9124136067651615e-06,
"loss": 0.2803,
"step": 290
},
{
"epoch": 1.571043771043771,
"grad_norm": 3.3641974925994873,
"learning_rate": 4.883222788159491e-06,
"loss": 0.289,
"step": 291
},
{
"epoch": 1.5764309764309763,
"grad_norm": 3.0665841102600098,
"learning_rate": 4.8540359514695266e-06,
"loss": 0.2196,
"step": 292
},
{
"epoch": 1.5818181818181818,
"grad_norm": 2.884730339050293,
"learning_rate": 4.824854091919601e-06,
"loss": 0.2532,
"step": 293
},
{
"epoch": 1.5872053872053873,
"grad_norm": 3.1136231422424316,
"learning_rate": 4.795678204564346e-06,
"loss": 0.2545,
"step": 294
},
{
"epoch": 1.5925925925925926,
"grad_norm": 2.821955919265747,
"learning_rate": 4.766509284254739e-06,
"loss": 0.2524,
"step": 295
},
{
"epoch": 1.5979797979797978,
"grad_norm": 3.191521167755127,
"learning_rate": 4.737348325604203e-06,
"loss": 0.2638,
"step": 296
},
{
"epoch": 1.6033670033670033,
"grad_norm": 2.8502752780914307,
"learning_rate": 4.708196322954673e-06,
"loss": 0.2648,
"step": 297
},
{
"epoch": 1.6087542087542088,
"grad_norm": 3.3543736934661865,
"learning_rate": 4.679054270342703e-06,
"loss": 0.2884,
"step": 298
},
{
"epoch": 1.614141414141414,
"grad_norm": 2.9385459423065186,
"learning_rate": 4.649923161465567e-06,
"loss": 0.2422,
"step": 299
},
{
"epoch": 1.6195286195286194,
"grad_norm": 2.9000279903411865,
"learning_rate": 4.620803989647373e-06,
"loss": 0.244,
"step": 300
},
{
"epoch": 1.6249158249158249,
"grad_norm": 2.7263593673706055,
"learning_rate": 4.591697747805196e-06,
"loss": 0.2452,
"step": 301
},
{
"epoch": 1.6303030303030304,
"grad_norm": 2.7036728858947754,
"learning_rate": 4.562605428415216e-06,
"loss": 0.2555,
"step": 302
},
{
"epoch": 1.6356902356902356,
"grad_norm": 2.996410608291626,
"learning_rate": 4.533528023478888e-06,
"loss": 0.2212,
"step": 303
},
{
"epoch": 1.641077441077441,
"grad_norm": 2.6675851345062256,
"learning_rate": 4.5044665244891e-06,
"loss": 0.2411,
"step": 304
},
{
"epoch": 1.6464646464646466,
"grad_norm": 2.8888285160064697,
"learning_rate": 4.475421922396375e-06,
"loss": 0.2374,
"step": 305
},
{
"epoch": 1.651851851851852,
"grad_norm": 2.5365850925445557,
"learning_rate": 4.446395207575081e-06,
"loss": 0.2443,
"step": 306
},
{
"epoch": 1.6572390572390572,
"grad_norm": 2.7890241146087646,
"learning_rate": 4.417387369789652e-06,
"loss": 0.2219,
"step": 307
},
{
"epoch": 1.6626262626262627,
"grad_norm": 3.0111935138702393,
"learning_rate": 4.388399398160857e-06,
"loss": 0.2528,
"step": 308
},
{
"epoch": 1.6680134680134682,
"grad_norm": 2.897418260574341,
"learning_rate": 4.359432281132051e-06,
"loss": 0.2432,
"step": 309
},
{
"epoch": 1.6734006734006734,
"grad_norm": 2.736621618270874,
"learning_rate": 4.330487006435485e-06,
"loss": 0.2381,
"step": 310
},
{
"epoch": 1.6787878787878787,
"grad_norm": 2.9282073974609375,
"learning_rate": 4.301564561058618e-06,
"loss": 0.2405,
"step": 311
},
{
"epoch": 1.6841750841750842,
"grad_norm": 2.8673527240753174,
"learning_rate": 4.272665931210472e-06,
"loss": 0.2638,
"step": 312
},
{
"epoch": 1.6895622895622897,
"grad_norm": 3.049126148223877,
"learning_rate": 4.243792102287991e-06,
"loss": 0.2505,
"step": 313
},
{
"epoch": 1.694949494949495,
"grad_norm": 3.018843173980713,
"learning_rate": 4.214944058842452e-06,
"loss": 0.262,
"step": 314
},
{
"epoch": 1.7003367003367003,
"grad_norm": 3.1092209815979004,
"learning_rate": 4.186122784545885e-06,
"loss": 0.2784,
"step": 315
},
{
"epoch": 1.7057239057239058,
"grad_norm": 3.118446111679077,
"learning_rate": 4.157329262157534e-06,
"loss": 0.2645,
"step": 316
},
{
"epoch": 1.7111111111111112,
"grad_norm": 3.1034669876098633,
"learning_rate": 4.128564473490357e-06,
"loss": 0.244,
"step": 317
},
{
"epoch": 1.7164983164983165,
"grad_norm": 3.1367504596710205,
"learning_rate": 4.099829399377524e-06,
"loss": 0.2526,
"step": 318
},
{
"epoch": 1.7218855218855218,
"grad_norm": 3.1213414669036865,
"learning_rate": 4.071125019638998e-06,
"loss": 0.2603,
"step": 319
},
{
"epoch": 1.7272727272727273,
"grad_norm": 2.7703611850738525,
"learning_rate": 4.0424523130481055e-06,
"loss": 0.2302,
"step": 320
},
{
"epoch": 1.7326599326599328,
"grad_norm": 3.022610902786255,
"learning_rate": 4.013812257298175e-06,
"loss": 0.2637,
"step": 321
},
{
"epoch": 1.738047138047138,
"grad_norm": 2.519594192504883,
"learning_rate": 3.985205828969191e-06,
"loss": 0.2235,
"step": 322
},
{
"epoch": 1.7434343434343433,
"grad_norm": 2.90838360786438,
"learning_rate": 3.956634003494496e-06,
"loss": 0.2834,
"step": 323
},
{
"epoch": 1.7488215488215488,
"grad_norm": 2.999645948410034,
"learning_rate": 3.9280977551275294e-06,
"loss": 0.2463,
"step": 324
},
{
"epoch": 1.7542087542087543,
"grad_norm": 2.5574517250061035,
"learning_rate": 3.899598056908615e-06,
"loss": 0.2101,
"step": 325
},
{
"epoch": 1.7595959595959596,
"grad_norm": 2.567458391189575,
"learning_rate": 3.871135880631769e-06,
"loss": 0.2576,
"step": 326
},
{
"epoch": 1.7649831649831649,
"grad_norm": 2.833789110183716,
"learning_rate": 3.842712196811569e-06,
"loss": 0.2322,
"step": 327
},
{
"epoch": 1.7703703703703704,
"grad_norm": 2.6010053157806396,
"learning_rate": 3.8143279746500665e-06,
"loss": 0.2227,
"step": 328
},
{
"epoch": 1.7757575757575759,
"grad_norm": 2.8823626041412354,
"learning_rate": 3.785984182003728e-06,
"loss": 0.2646,
"step": 329
},
{
"epoch": 1.7811447811447811,
"grad_norm": 2.988429546356201,
"learning_rate": 3.757681785350445e-06,
"loss": 0.2626,
"step": 330
},
{
"epoch": 1.7865319865319864,
"grad_norm": 2.649637222290039,
"learning_rate": 3.729421749756564e-06,
"loss": 0.2145,
"step": 331
},
{
"epoch": 1.791919191919192,
"grad_norm": 2.893730401992798,
"learning_rate": 3.701205038843997e-06,
"loss": 0.2727,
"step": 332
},
{
"epoch": 1.7973063973063974,
"grad_norm": 2.917715549468994,
"learning_rate": 3.6730326147573425e-06,
"loss": 0.2281,
"step": 333
},
{
"epoch": 1.8026936026936027,
"grad_norm": 2.8687551021575928,
"learning_rate": 3.6449054381311e-06,
"loss": 0.2531,
"step": 334
},
{
"epoch": 1.808080808080808,
"grad_norm": 2.496572256088257,
"learning_rate": 3.616824468056896e-06,
"loss": 0.2227,
"step": 335
},
{
"epoch": 1.8134680134680135,
"grad_norm": 3.0722904205322266,
"learning_rate": 3.5887906620507877e-06,
"loss": 0.2803,
"step": 336
},
{
"epoch": 1.818855218855219,
"grad_norm": 2.8439204692840576,
"learning_rate": 3.5608049760206203e-06,
"loss": 0.2315,
"step": 337
},
{
"epoch": 1.8242424242424242,
"grad_norm": 2.7868878841400146,
"learning_rate": 3.532868364233416e-06,
"loss": 0.2806,
"step": 338
},
{
"epoch": 1.8296296296296295,
"grad_norm": 2.97046160697937,
"learning_rate": 3.504981779282852e-06,
"loss": 0.2521,
"step": 339
},
{
"epoch": 1.835016835016835,
"grad_norm": 2.795283794403076,
"learning_rate": 3.4771461720567613e-06,
"loss": 0.2522,
"step": 340
},
{
"epoch": 1.8404040404040405,
"grad_norm": 2.8273348808288574,
"learning_rate": 3.4493624917047284e-06,
"loss": 0.2462,
"step": 341
},
{
"epoch": 1.8457912457912458,
"grad_norm": 2.7871997356414795,
"learning_rate": 3.4216316856057074e-06,
"loss": 0.2334,
"step": 342
},
{
"epoch": 1.851178451178451,
"grad_norm": 2.754995346069336,
"learning_rate": 3.3939546993357297e-06,
"loss": 0.2533,
"step": 343
},
{
"epoch": 1.8565656565656565,
"grad_norm": 2.96561336517334,
"learning_rate": 3.3663324766356524e-06,
"loss": 0.261,
"step": 344
},
{
"epoch": 1.861952861952862,
"grad_norm": 2.781203269958496,
"learning_rate": 3.3387659593789845e-06,
"loss": 0.2312,
"step": 345
},
{
"epoch": 1.8673400673400673,
"grad_norm": 2.9479804039001465,
"learning_rate": 3.3112560875397713e-06,
"loss": 0.2618,
"step": 346
},
{
"epoch": 1.8727272727272726,
"grad_norm": 2.9663288593292236,
"learning_rate": 3.283803799160537e-06,
"loss": 0.2554,
"step": 347
},
{
"epoch": 1.878114478114478,
"grad_norm": 2.95430064201355,
"learning_rate": 3.256410030320304e-06,
"loss": 0.2873,
"step": 348
},
{
"epoch": 1.8835016835016836,
"grad_norm": 2.834928274154663,
"learning_rate": 3.2290757151026687e-06,
"loss": 0.2407,
"step": 349
},
{
"epoch": 1.8888888888888888,
"grad_norm": 2.61153244972229,
"learning_rate": 3.2018017855639605e-06,
"loss": 0.2482,
"step": 350
},
{
"epoch": 1.8942760942760941,
"grad_norm": 2.788770914077759,
"learning_rate": 3.1745891717014477e-06,
"loss": 0.224,
"step": 351
},
{
"epoch": 1.8996632996632996,
"grad_norm": 2.7962043285369873,
"learning_rate": 3.147438801421638e-06,
"loss": 0.2526,
"step": 352
},
{
"epoch": 1.905050505050505,
"grad_norm": 2.967076539993286,
"learning_rate": 3.1203516005086276e-06,
"loss": 0.2335,
"step": 353
},
{
"epoch": 1.9104377104377104,
"grad_norm": 2.598158836364746,
"learning_rate": 3.093328492592539e-06,
"loss": 0.2127,
"step": 354
},
{
"epoch": 1.9158249158249159,
"grad_norm": 2.835001230239868,
"learning_rate": 3.0663703991180318e-06,
"loss": 0.2273,
"step": 355
},
{
"epoch": 1.9212121212121214,
"grad_norm": 2.9137284755706787,
"learning_rate": 3.0394782393128713e-06,
"loss": 0.2754,
"step": 356
},
{
"epoch": 1.9265993265993266,
"grad_norm": 3.0207886695861816,
"learning_rate": 3.0126529301565945e-06,
"loss": 0.2449,
"step": 357
},
{
"epoch": 1.931986531986532,
"grad_norm": 2.987816095352173,
"learning_rate": 2.9858953863492334e-06,
"loss": 0.2521,
"step": 358
},
{
"epoch": 1.9373737373737374,
"grad_norm": 2.8369038105010986,
"learning_rate": 2.9592065202801374e-06,
"loss": 0.2383,
"step": 359
},
{
"epoch": 1.942760942760943,
"grad_norm": 2.73996639251709,
"learning_rate": 2.9325872419968484e-06,
"loss": 0.2536,
"step": 360
},
{
"epoch": 1.9481481481481482,
"grad_norm": 3.1415867805480957,
"learning_rate": 2.906038459174081e-06,
"loss": 0.2599,
"step": 361
},
{
"epoch": 1.9535353535353535,
"grad_norm": 3.266170024871826,
"learning_rate": 2.879561077082764e-06,
"loss": 0.2544,
"step": 362
},
{
"epoch": 1.958922558922559,
"grad_norm": 2.9058427810668945,
"learning_rate": 2.853155998559179e-06,
"loss": 0.244,
"step": 363
},
{
"epoch": 1.9643097643097645,
"grad_norm": 2.8677961826324463,
"learning_rate": 2.826824123974171e-06,
"loss": 0.2192,
"step": 364
},
{
"epoch": 1.9696969696969697,
"grad_norm": 3.0954580307006836,
"learning_rate": 2.800566351202443e-06,
"loss": 0.2538,
"step": 365
},
{
"epoch": 1.975084175084175,
"grad_norm": 3.023210287094116,
"learning_rate": 2.774383575591956e-06,
"loss": 0.248,
"step": 366
},
{
"epoch": 1.9804713804713805,
"grad_norm": 2.7636148929595947,
"learning_rate": 2.748276689933377e-06,
"loss": 0.2281,
"step": 367
},
{
"epoch": 1.985858585858586,
"grad_norm": 2.7266335487365723,
"learning_rate": 2.722246584429652e-06,
"loss": 0.2492,
"step": 368
},
{
"epoch": 1.9912457912457913,
"grad_norm": 2.8604986667633057,
"learning_rate": 2.6962941466656477e-06,
"loss": 0.2358,
"step": 369
},
{
"epoch": 1.9966329966329965,
"grad_norm": 2.7491540908813477,
"learning_rate": 2.6704202615778844e-06,
"loss": 0.2366,
"step": 370
},
{
"epoch": 2.0053872053872053,
"grad_norm": 5.093502998352051,
"learning_rate": 2.6446258114243633e-06,
"loss": 0.343,
"step": 371
},
{
"epoch": 2.010774410774411,
"grad_norm": 1.904625415802002,
"learning_rate": 2.6189116757544765e-06,
"loss": 0.0965,
"step": 372
},
{
"epoch": 2.0161616161616163,
"grad_norm": 1.87295663356781,
"learning_rate": 2.593278731379027e-06,
"loss": 0.1118,
"step": 373
},
{
"epoch": 2.0215488215488215,
"grad_norm": 2.0098869800567627,
"learning_rate": 2.567727852340323e-06,
"loss": 0.0975,
"step": 374
},
{
"epoch": 2.026936026936027,
"grad_norm": 1.6401960849761963,
"learning_rate": 2.542259909882374e-06,
"loss": 0.0918,
"step": 375
},
{
"epoch": 2.0323232323232325,
"grad_norm": 1.9632785320281982,
"learning_rate": 2.51687577242119e-06,
"loss": 0.0885,
"step": 376
},
{
"epoch": 2.037710437710438,
"grad_norm": 1.801023006439209,
"learning_rate": 2.4915763055151615e-06,
"loss": 0.0849,
"step": 377
},
{
"epoch": 2.043097643097643,
"grad_norm": 1.8630132675170898,
"learning_rate": 2.4663623718355444e-06,
"loss": 0.0782,
"step": 378
},
{
"epoch": 2.0484848484848484,
"grad_norm": 1.9627724885940552,
"learning_rate": 2.4412348311370616e-06,
"loss": 0.0985,
"step": 379
},
{
"epoch": 2.053872053872054,
"grad_norm": 2.127228260040283,
"learning_rate": 2.416194540228559e-06,
"loss": 0.0885,
"step": 380
},
{
"epoch": 2.0592592592592593,
"grad_norm": 2.282618284225464,
"learning_rate": 2.3912423529438145e-06,
"loss": 0.0705,
"step": 381
},
{
"epoch": 2.0646464646464646,
"grad_norm": 2.917990207672119,
"learning_rate": 2.3663791201124093e-06,
"loss": 0.0904,
"step": 382
},
{
"epoch": 2.07003367003367,
"grad_norm": 2.867617130279541,
"learning_rate": 2.341605689530723e-06,
"loss": 0.0766,
"step": 383
},
{
"epoch": 2.0754208754208756,
"grad_norm": 2.3559350967407227,
"learning_rate": 2.316922905933022e-06,
"loss": 0.0889,
"step": 384
},
{
"epoch": 2.080808080808081,
"grad_norm": 2.959153890609741,
"learning_rate": 2.292331610962649e-06,
"loss": 0.0759,
"step": 385
},
{
"epoch": 2.086195286195286,
"grad_norm": 2.87480092048645,
"learning_rate": 2.2678326431433456e-06,
"loss": 0.0836,
"step": 386
},
{
"epoch": 2.0915824915824914,
"grad_norm": 2.830786943435669,
"learning_rate": 2.243426837850631e-06,
"loss": 0.1042,
"step": 387
},
{
"epoch": 2.096969696969697,
"grad_norm": 2.9633374214172363,
"learning_rate": 2.219115027283339e-06,
"loss": 0.0958,
"step": 388
},
{
"epoch": 2.1023569023569024,
"grad_norm": 2.6659820079803467,
"learning_rate": 2.194898040435234e-06,
"loss": 0.0772,
"step": 389
},
{
"epoch": 2.1077441077441077,
"grad_norm": 2.3520843982696533,
"learning_rate": 2.17077670306674e-06,
"loss": 0.0564,
"step": 390
},
{
"epoch": 2.113131313131313,
"grad_norm": 2.393596887588501,
"learning_rate": 2.146751837676794e-06,
"loss": 0.075,
"step": 391
},
{
"epoch": 2.1185185185185187,
"grad_norm": 2.7160770893096924,
"learning_rate": 2.122824263474784e-06,
"loss": 0.1021,
"step": 392
},
{
"epoch": 2.123905723905724,
"grad_norm": 2.5906686782836914,
"learning_rate": 2.098994796352629e-06,
"loss": 0.0886,
"step": 393
},
{
"epoch": 2.1292929292929292,
"grad_norm": 2.3228564262390137,
"learning_rate": 2.0752642488569557e-06,
"loss": 0.0807,
"step": 394
},
{
"epoch": 2.1346801346801345,
"grad_norm": 2.289416790008545,
"learning_rate": 2.0516334301613876e-06,
"loss": 0.0804,
"step": 395
},
{
"epoch": 2.1400673400673402,
"grad_norm": 2.459120750427246,
"learning_rate": 2.028103146038958e-06,
"loss": 0.1073,
"step": 396
},
{
"epoch": 2.1454545454545455,
"grad_norm": 2.474850654602051,
"learning_rate": 2.004674198834631e-06,
"loss": 0.0746,
"step": 397
},
{
"epoch": 2.1508417508417508,
"grad_norm": 2.63972806930542,
"learning_rate": 1.98134738743794e-06,
"loss": 0.0754,
"step": 398
},
{
"epoch": 2.156228956228956,
"grad_norm": 2.22719407081604,
"learning_rate": 1.9581235072557618e-06,
"loss": 0.084,
"step": 399
},
{
"epoch": 2.1616161616161618,
"grad_norm": 2.08853816986084,
"learning_rate": 1.935003350185174e-06,
"loss": 0.0779,
"step": 400
},
{
"epoch": 2.167003367003367,
"grad_norm": 1.9397152662277222,
"learning_rate": 1.911987704586466e-06,
"loss": 0.07,
"step": 401
},
{
"epoch": 2.1723905723905723,
"grad_norm": 1.917934775352478,
"learning_rate": 1.8890773552562564e-06,
"loss": 0.0725,
"step": 402
},
{
"epoch": 2.1777777777777776,
"grad_norm": 2.1869399547576904,
"learning_rate": 1.8662730834007204e-06,
"loss": 0.0745,
"step": 403
},
{
"epoch": 2.1831649831649833,
"grad_norm": 2.0088367462158203,
"learning_rate": 1.843575666608976e-06,
"loss": 0.091,
"step": 404
},
{
"epoch": 2.1885521885521886,
"grad_norm": 2.3277580738067627,
"learning_rate": 1.8209858788265411e-06,
"loss": 0.0605,
"step": 405
},
{
"epoch": 2.193939393939394,
"grad_norm": 1.99192214012146,
"learning_rate": 1.7985044903289645e-06,
"loss": 0.0706,
"step": 406
},
{
"epoch": 2.199326599326599,
"grad_norm": 2.2638256549835205,
"learning_rate": 1.7761322676955505e-06,
"loss": 0.0728,
"step": 407
},
{
"epoch": 2.204713804713805,
"grad_norm": 2.2363462448120117,
"learning_rate": 1.7538699737832237e-06,
"loss": 0.0804,
"step": 408
},
{
"epoch": 2.21010101010101,
"grad_norm": 2.1804420948028564,
"learning_rate": 1.7317183677005173e-06,
"loss": 0.0882,
"step": 409
},
{
"epoch": 2.2154882154882154,
"grad_norm": 2.3650074005126953,
"learning_rate": 1.7096782047816806e-06,
"loss": 0.0784,
"step": 410
},
{
"epoch": 2.2208754208754207,
"grad_norm": 2.209190845489502,
"learning_rate": 1.687750236560936e-06,
"loss": 0.087,
"step": 411
},
{
"epoch": 2.2262626262626264,
"grad_norm": 2.8381571769714355,
"learning_rate": 1.665935210746844e-06,
"loss": 0.0656,
"step": 412
},
{
"epoch": 2.2316498316498317,
"grad_norm": 2.1763696670532227,
"learning_rate": 1.6442338711968102e-06,
"loss": 0.0884,
"step": 413
},
{
"epoch": 2.237037037037037,
"grad_norm": 2.410494327545166,
"learning_rate": 1.622646957891722e-06,
"loss": 0.0702,
"step": 414
},
{
"epoch": 2.242424242424242,
"grad_norm": 2.365952491760254,
"learning_rate": 1.601175206910715e-06,
"loss": 0.0902,
"step": 415
},
{
"epoch": 2.247811447811448,
"grad_norm": 2.4530627727508545,
"learning_rate": 1.5798193504060693e-06,
"loss": 0.0792,
"step": 416
},
{
"epoch": 2.253198653198653,
"grad_norm": 2.4529592990875244,
"learning_rate": 1.5585801165782606e-06,
"loss": 0.0863,
"step": 417
},
{
"epoch": 2.2585858585858585,
"grad_norm": 2.298218250274658,
"learning_rate": 1.5374582296511054e-06,
"loss": 0.0854,
"step": 418
},
{
"epoch": 2.263973063973064,
"grad_norm": 2.545762538909912,
"learning_rate": 1.5164544098470862e-06,
"loss": 0.0913,
"step": 419
},
{
"epoch": 2.2693602693602695,
"grad_norm": 2.3648526668548584,
"learning_rate": 1.4955693733627869e-06,
"loss": 0.0795,
"step": 420
},
{
"epoch": 2.2747474747474747,
"grad_norm": 2.335575819015503,
"learning_rate": 1.474803832344463e-06,
"loss": 0.084,
"step": 421
},
{
"epoch": 2.28013468013468,
"grad_norm": 2.2477426528930664,
"learning_rate": 1.4541584948637777e-06,
"loss": 0.0876,
"step": 422
},
{
"epoch": 2.2855218855218853,
"grad_norm": 2.9558703899383545,
"learning_rate": 1.4336340648936342e-06,
"loss": 0.079,
"step": 423
},
{
"epoch": 2.290909090909091,
"grad_norm": 2.1282129287719727,
"learning_rate": 1.413231242284195e-06,
"loss": 0.0689,
"step": 424
},
{
"epoch": 2.2962962962962963,
"grad_norm": 2.1239535808563232,
"learning_rate": 1.3929507227389954e-06,
"loss": 0.0701,
"step": 425
},
{
"epoch": 2.3016835016835016,
"grad_norm": 2.0963549613952637,
"learning_rate": 1.3727931977912406e-06,
"loss": 0.0758,
"step": 426
},
{
"epoch": 2.3070707070707073,
"grad_norm": 3.4831295013427734,
"learning_rate": 1.352759354780215e-06,
"loss": 0.086,
"step": 427
},
{
"epoch": 2.3124579124579125,
"grad_norm": 2.0869736671447754,
"learning_rate": 1.332849876827842e-06,
"loss": 0.072,
"step": 428
},
{
"epoch": 2.317845117845118,
"grad_norm": 2.1851084232330322,
"learning_rate": 1.3130654428154066e-06,
"loss": 0.0644,
"step": 429
},
{
"epoch": 2.323232323232323,
"grad_norm": 1.7817176580429077,
"learning_rate": 1.2934067273603855e-06,
"loss": 0.0522,
"step": 430
},
{
"epoch": 2.328619528619529,
"grad_norm": 2.0074706077575684,
"learning_rate": 1.2738744007934595e-06,
"loss": 0.0744,
"step": 431
},
{
"epoch": 2.334006734006734,
"grad_norm": 2.3214468955993652,
"learning_rate": 1.2544691291356497e-06,
"loss": 0.0759,
"step": 432
},
{
"epoch": 2.3393939393939394,
"grad_norm": 2.294804096221924,
"learning_rate": 1.2351915740756087e-06,
"loss": 0.068,
"step": 433
},
{
"epoch": 2.3447811447811446,
"grad_norm": 2.0611894130706787,
"learning_rate": 1.2160423929470584e-06,
"loss": 0.0667,
"step": 434
},
{
"epoch": 2.3501683501683504,
"grad_norm": 2.080531120300293,
"learning_rate": 1.1970222387063756e-06,
"loss": 0.0749,
"step": 435
},
{
"epoch": 2.3555555555555556,
"grad_norm": 2.0696070194244385,
"learning_rate": 1.1781317599103238e-06,
"loss": 0.0773,
"step": 436
},
{
"epoch": 2.360942760942761,
"grad_norm": 2.34531569480896,
"learning_rate": 1.1593716006939455e-06,
"loss": 0.0752,
"step": 437
},
{
"epoch": 2.366329966329966,
"grad_norm": 2.6101057529449463,
"learning_rate": 1.140742400748593e-06,
"loss": 0.0605,
"step": 438
},
{
"epoch": 2.371717171717172,
"grad_norm": 2.1780221462249756,
"learning_rate": 1.1222447953001182e-06,
"loss": 0.0638,
"step": 439
},
{
"epoch": 2.377104377104377,
"grad_norm": 2.247965097427368,
"learning_rate": 1.1038794150872117e-06,
"loss": 0.0714,
"step": 440
},
{
"epoch": 2.3824915824915824,
"grad_norm": 1.9487817287445068,
"learning_rate": 1.0856468863398917e-06,
"loss": 0.0654,
"step": 441
},
{
"epoch": 2.3878787878787877,
"grad_norm": 2.285243272781372,
"learning_rate": 1.0675478307581627e-06,
"loss": 0.0706,
"step": 442
},
{
"epoch": 2.3932659932659934,
"grad_norm": 2.08785343170166,
"learning_rate": 1.0495828654907991e-06,
"loss": 0.0828,
"step": 443
},
{
"epoch": 2.3986531986531987,
"grad_norm": 2.6061668395996094,
"learning_rate": 1.0317526031143161e-06,
"loss": 0.06,
"step": 444
},
{
"epoch": 2.404040404040404,
"grad_norm": 1.9994468688964844,
"learning_rate": 1.014057651612076e-06,
"loss": 0.0678,
"step": 445
},
{
"epoch": 2.4094276094276093,
"grad_norm": 2.335872173309326,
"learning_rate": 9.964986143535515e-07,
"loss": 0.0696,
"step": 446
},
{
"epoch": 2.414814814814815,
"grad_norm": 2.4777722358703613,
"learning_rate": 9.790760900737683e-07,
"loss": 0.0651,
"step": 447
},
{
"epoch": 2.4202020202020202,
"grad_norm": 2.2628719806671143,
"learning_rate": 9.61790672852868e-07,
"loss": 0.0789,
"step": 448
},
{
"epoch": 2.4255892255892255,
"grad_norm": 2.406503677368164,
"learning_rate": 9.446429520958666e-07,
"loss": 0.0812,
"step": 449
},
{
"epoch": 2.430976430976431,
"grad_norm": 2.8397791385650635,
"learning_rate": 9.276335125125502e-07,
"loss": 0.0678,
"step": 450
},
{
"epoch": 2.4363636363636365,
"grad_norm": 2.485055923461914,
"learning_rate": 9.107629340975388e-07,
"loss": 0.0619,
"step": 451
},
{
"epoch": 2.441750841750842,
"grad_norm": 2.066659927368164,
"learning_rate": 8.940317921105085e-07,
"loss": 0.0611,
"step": 452
},
{
"epoch": 2.447138047138047,
"grad_norm": 2.2130823135375977,
"learning_rate": 8.774406570565791e-07,
"loss": 0.0674,
"step": 453
},
{
"epoch": 2.4525252525252528,
"grad_norm": 2.1492106914520264,
"learning_rate": 8.609900946668536e-07,
"loss": 0.0744,
"step": 454
},
{
"epoch": 2.457912457912458,
"grad_norm": 2.2511839866638184,
"learning_rate": 8.446806658791373e-07,
"loss": 0.0689,
"step": 455
},
{
"epoch": 2.4632996632996633,
"grad_norm": 2.078249454498291,
"learning_rate": 8.285129268188042e-07,
"loss": 0.0726,
"step": 456
},
{
"epoch": 2.4686868686868686,
"grad_norm": 2.2379488945007324,
"learning_rate": 8.124874287798352e-07,
"loss": 0.0748,
"step": 457
},
{
"epoch": 2.474074074074074,
"grad_norm": 2.272982120513916,
"learning_rate": 7.966047182060226e-07,
"loss": 0.0549,
"step": 458
},
{
"epoch": 2.4794612794612796,
"grad_norm": 1.9955648183822632,
"learning_rate": 7.808653366723296e-07,
"loss": 0.0603,
"step": 459
},
{
"epoch": 2.484848484848485,
"grad_norm": 1.8981883525848389,
"learning_rate": 7.652698208664377e-07,
"loss": 0.0675,
"step": 460
},
{
"epoch": 2.49023569023569,
"grad_norm": 2.4488866329193115,
"learning_rate": 7.498187025704296e-07,
"loss": 0.0768,
"step": 461
},
{
"epoch": 2.495622895622896,
"grad_norm": 2.1295886039733887,
"learning_rate": 7.345125086426675e-07,
"loss": 0.0662,
"step": 462
},
{
"epoch": 2.501010101010101,
"grad_norm": 2.2743725776672363,
"learning_rate": 7.193517609998263e-07,
"loss": 0.0686,
"step": 463
},
{
"epoch": 2.5063973063973064,
"grad_norm": 2.235623836517334,
"learning_rate": 7.043369765990943e-07,
"loss": 0.0615,
"step": 464
},
{
"epoch": 2.5117845117845117,
"grad_norm": 2.076993942260742,
"learning_rate": 6.894686674205481e-07,
"loss": 0.0803,
"step": 465
},
{
"epoch": 2.517171717171717,
"grad_norm": 2.2475011348724365,
"learning_rate": 6.747473404496902e-07,
"loss": 0.0851,
"step": 466
},
{
"epoch": 2.5225589225589227,
"grad_norm": 2.5577120780944824,
"learning_rate": 6.601734976601737e-07,
"loss": 0.0735,
"step": 467
},
{
"epoch": 2.527946127946128,
"grad_norm": 2.3084797859191895,
"learning_rate": 6.457476359966685e-07,
"loss": 0.0724,
"step": 468
},
{
"epoch": 2.533333333333333,
"grad_norm": 2.051790237426758,
"learning_rate": 6.314702473579309e-07,
"loss": 0.0851,
"step": 469
},
{
"epoch": 2.538720538720539,
"grad_norm": 2.8228673934936523,
"learning_rate": 6.17341818580024e-07,
"loss": 0.0715,
"step": 470
},
{
"epoch": 2.544107744107744,
"grad_norm": 2.070128917694092,
"learning_rate": 6.033628314197176e-07,
"loss": 0.0615,
"step": 471
},
{
"epoch": 2.5494949494949495,
"grad_norm": 2.154543876647949,
"learning_rate": 5.895337625380632e-07,
"loss": 0.0646,
"step": 472
},
{
"epoch": 2.5548821548821548,
"grad_norm": 1.9985536336898804,
"learning_rate": 5.758550834841381e-07,
"loss": 0.0574,
"step": 473
},
{
"epoch": 2.56026936026936,
"grad_norm": 2.2103183269500732,
"learning_rate": 5.62327260678967e-07,
"loss": 0.0694,
"step": 474
},
{
"epoch": 2.5656565656565657,
"grad_norm": 2.3436076641082764,
"learning_rate": 5.489507553996204e-07,
"loss": 0.065,
"step": 475
},
{
"epoch": 2.571043771043771,
"grad_norm": 2.371115207672119,
"learning_rate": 5.357260237634826e-07,
"loss": 0.0804,
"step": 476
},
{
"epoch": 2.5764309764309763,
"grad_norm": 2.1717820167541504,
"learning_rate": 5.226535167127e-07,
"loss": 0.0744,
"step": 477
},
{
"epoch": 2.581818181818182,
"grad_norm": 2.0997424125671387,
"learning_rate": 5.097336799988067e-07,
"loss": 0.0582,
"step": 478
},
{
"epoch": 2.5872053872053873,
"grad_norm": 1.9539695978164673,
"learning_rate": 4.96966954167517e-07,
"loss": 0.0843,
"step": 479
},
{
"epoch": 2.5925925925925926,
"grad_norm": 2.401609182357788,
"learning_rate": 4.843537745437188e-07,
"loss": 0.0628,
"step": 480
},
{
"epoch": 2.597979797979798,
"grad_norm": 2.3277831077575684,
"learning_rate": 4.718945712166123e-07,
"loss": 0.0904,
"step": 481
},
{
"epoch": 2.603367003367003,
"grad_norm": 2.537806510925293,
"learning_rate": 4.595897690250567e-07,
"loss": 0.0653,
"step": 482
},
{
"epoch": 2.608754208754209,
"grad_norm": 2.5211031436920166,
"learning_rate": 4.4743978754308027e-07,
"loss": 0.0762,
"step": 483
},
{
"epoch": 2.614141414141414,
"grad_norm": 2.538830280303955,
"learning_rate": 4.3544504106557026e-07,
"loss": 0.0722,
"step": 484
},
{
"epoch": 2.6195286195286194,
"grad_norm": 2.389099597930908,
"learning_rate": 4.2360593859415433e-07,
"loss": 0.0669,
"step": 485
},
{
"epoch": 2.624915824915825,
"grad_norm": 2.186370372772217,
"learning_rate": 4.1192288382324363e-07,
"loss": 0.0719,
"step": 486
},
{
"epoch": 2.6303030303030304,
"grad_norm": 2.426302909851074,
"learning_rate": 4.003962751262763e-07,
"loss": 0.065,
"step": 487
},
{
"epoch": 2.6356902356902356,
"grad_norm": 2.080082893371582,
"learning_rate": 3.890265055421283e-07,
"loss": 0.0677,
"step": 488
},
{
"epoch": 2.641077441077441,
"grad_norm": 2.4764468669891357,
"learning_rate": 3.77813962761715e-07,
"loss": 0.0775,
"step": 489
},
{
"epoch": 2.6464646464646466,
"grad_norm": 2.2122390270233154,
"learning_rate": 3.6675902911476937e-07,
"loss": 0.0754,
"step": 490
},
{
"epoch": 2.651851851851852,
"grad_norm": 2.6265482902526855,
"learning_rate": 3.558620815568048e-07,
"loss": 0.0631,
"step": 491
},
{
"epoch": 2.657239057239057,
"grad_norm": 2.3554742336273193,
"learning_rate": 3.451234916562618e-07,
"loss": 0.0653,
"step": 492
},
{
"epoch": 2.6626262626262625,
"grad_norm": 2.077880382537842,
"learning_rate": 3.3454362558184075e-07,
"loss": 0.0749,
"step": 493
},
{
"epoch": 2.668013468013468,
"grad_norm": 2.258436918258667,
"learning_rate": 3.241228440900124e-07,
"loss": 0.067,
"step": 494
},
{
"epoch": 2.6734006734006734,
"grad_norm": 2.1589324474334717,
"learning_rate": 3.1386150251271897e-07,
"loss": 0.0814,
"step": 495
},
{
"epoch": 2.6787878787878787,
"grad_norm": 2.316006898880005,
"learning_rate": 3.0375995074525764e-07,
"loss": 0.0624,
"step": 496
},
{
"epoch": 2.6841750841750844,
"grad_norm": 2.2028238773345947,
"learning_rate": 2.9381853323434627e-07,
"loss": 0.0583,
"step": 497
},
{
"epoch": 2.6895622895622897,
"grad_norm": 2.372264862060547,
"learning_rate": 2.840375889663871e-07,
"loss": 0.0638,
"step": 498
},
{
"epoch": 2.694949494949495,
"grad_norm": 2.3102543354034424,
"learning_rate": 2.744174514558956e-07,
"loss": 0.0601,
"step": 499
},
{
"epoch": 2.7003367003367003,
"grad_norm": 2.3564910888671875,
"learning_rate": 2.6495844873413944e-07,
"loss": 0.0721,
"step": 500
},
{
"epoch": 2.7057239057239055,
"grad_norm": 2.442258834838867,
"learning_rate": 2.556609033379459e-07,
"loss": 0.0616,
"step": 501
},
{
"epoch": 2.7111111111111112,
"grad_norm": 2.313163995742798,
"learning_rate": 2.465251322987061e-07,
"loss": 0.0634,
"step": 502
},
{
"epoch": 2.7164983164983165,
"grad_norm": 2.4522969722747803,
"learning_rate": 2.3755144713156819e-07,
"loss": 0.0613,
"step": 503
},
{
"epoch": 2.721885521885522,
"grad_norm": 2.2570788860321045,
"learning_rate": 2.287401538248074e-07,
"loss": 0.0737,
"step": 504
},
{
"epoch": 2.7272727272727275,
"grad_norm": 2.2716591358184814,
"learning_rate": 2.20091552829399e-07,
"loss": 0.0639,
"step": 505
},
{
"epoch": 2.732659932659933,
"grad_norm": 2.105753183364868,
"learning_rate": 2.1160593904877236e-07,
"loss": 0.0625,
"step": 506
},
{
"epoch": 2.738047138047138,
"grad_norm": 2.383596658706665,
"learning_rate": 2.0328360182875262e-07,
"loss": 0.0682,
"step": 507
},
{
"epoch": 2.7434343434343433,
"grad_norm": 2.4483511447906494,
"learning_rate": 1.9512482494769613e-07,
"loss": 0.0649,
"step": 508
},
{
"epoch": 2.7488215488215486,
"grad_norm": 2.1391537189483643,
"learning_rate": 1.8712988660681498e-07,
"loss": 0.0704,
"step": 509
},
{
"epoch": 2.7542087542087543,
"grad_norm": 2.9412190914154053,
"learning_rate": 1.7929905942068836e-07,
"loss": 0.0717,
"step": 510
},
{
"epoch": 2.7595959595959596,
"grad_norm": 2.366955280303955,
"learning_rate": 1.7163261040796797e-07,
"loss": 0.0645,
"step": 511
},
{
"epoch": 2.764983164983165,
"grad_norm": 2.511876344680786,
"learning_rate": 1.6413080098227562e-07,
"loss": 0.0762,
"step": 512
},
{
"epoch": 2.7703703703703706,
"grad_norm": 2.14850115776062,
"learning_rate": 1.5679388694328446e-07,
"loss": 0.0613,
"step": 513
},
{
"epoch": 2.775757575757576,
"grad_norm": 2.2042980194091797,
"learning_rate": 1.4962211846800078e-07,
"loss": 0.0648,
"step": 514
},
{
"epoch": 2.781144781144781,
"grad_norm": 2.243152379989624,
"learning_rate": 1.426157401022321e-07,
"loss": 0.0769,
"step": 515
},
{
"epoch": 2.7865319865319864,
"grad_norm": 2.4439616203308105,
"learning_rate": 1.3577499075224821e-07,
"loss": 0.0726,
"step": 516
},
{
"epoch": 2.7919191919191917,
"grad_norm": 2.2987587451934814,
"learning_rate": 1.2910010367663317e-07,
"loss": 0.0665,
"step": 517
},
{
"epoch": 2.7973063973063974,
"grad_norm": 2.111358642578125,
"learning_rate": 1.2259130647833627e-07,
"loss": 0.0523,
"step": 518
},
{
"epoch": 2.8026936026936027,
"grad_norm": 2.131275177001953,
"learning_rate": 1.162488210969065e-07,
"loss": 0.0687,
"step": 519
},
{
"epoch": 2.808080808080808,
"grad_norm": 2.1112232208251953,
"learning_rate": 1.100728638009263e-07,
"loss": 0.0603,
"step": 520
},
{
"epoch": 2.8134680134680137,
"grad_norm": 2.212636947631836,
"learning_rate": 1.0406364518063927e-07,
"loss": 0.0565,
"step": 521
},
{
"epoch": 2.818855218855219,
"grad_norm": 2.0088913440704346,
"learning_rate": 9.822137014076472e-08,
"loss": 0.0597,
"step": 522
},
{
"epoch": 2.824242424242424,
"grad_norm": 2.1878387928009033,
"learning_rate": 9.254623789351714e-08,
"loss": 0.0751,
"step": 523
},
{
"epoch": 2.8296296296296295,
"grad_norm": 2.465935230255127,
"learning_rate": 8.703844195180555e-08,
"loss": 0.0753,
"step": 524
},
{
"epoch": 2.8350168350168348,
"grad_norm": 2.2098045349121094,
"learning_rate": 8.169817012264214e-08,
"loss": 0.0586,
"step": 525
},
{
"epoch": 2.8404040404040405,
"grad_norm": 2.3172450065612793,
"learning_rate": 7.652560450073454e-08,
"loss": 0.0639,
"step": 526
},
{
"epoch": 2.8457912457912458,
"grad_norm": 1.8119255304336548,
"learning_rate": 7.152092146227806e-08,
"loss": 0.0762,
"step": 527
},
{
"epoch": 2.851178451178451,
"grad_norm": 2.9995367527008057,
"learning_rate": 6.668429165893996e-08,
"loss": 0.0802,
"step": 528
},
{
"epoch": 2.8565656565656568,
"grad_norm": 2.5341761112213135,
"learning_rate": 6.20158800120435e-08,
"loss": 0.0751,
"step": 529
},
{
"epoch": 2.861952861952862,
"grad_norm": 2.48641300201416,
"learning_rate": 5.7515845706940246e-08,
"loss": 0.0678,
"step": 530
},
{
"epoch": 2.8673400673400673,
"grad_norm": 2.129096746444702,
"learning_rate": 5.31843421875855e-08,
"loss": 0.057,
"step": 531
},
{
"epoch": 2.8727272727272726,
"grad_norm": 1.9396432638168335,
"learning_rate": 4.9021517151305875e-08,
"loss": 0.0492,
"step": 532
},
{
"epoch": 2.878114478114478,
"grad_norm": 2.2072770595550537,
"learning_rate": 4.502751254375992e-08,
"loss": 0.0734,
"step": 533
},
{
"epoch": 2.8835016835016836,
"grad_norm": 2.1861319541931152,
"learning_rate": 4.120246455410204e-08,
"loss": 0.0537,
"step": 534
},
{
"epoch": 2.888888888888889,
"grad_norm": 2.1539671421051025,
"learning_rate": 3.7546503610336183e-08,
"loss": 0.0496,
"step": 535
},
{
"epoch": 2.894276094276094,
"grad_norm": 1.8679490089416504,
"learning_rate": 3.405975437486997e-08,
"loss": 0.0702,
"step": 536
},
{
"epoch": 2.8996632996633,
"grad_norm": 2.585775375366211,
"learning_rate": 3.074233574026087e-08,
"loss": 0.0626,
"step": 537
},
{
"epoch": 2.905050505050505,
"grad_norm": 2.1468751430511475,
"learning_rate": 2.7594360825166644e-08,
"loss": 0.0575,
"step": 538
},
{
"epoch": 2.9104377104377104,
"grad_norm": 2.1872782707214355,
"learning_rate": 2.4615936970485144e-08,
"loss": 0.0712,
"step": 539
},
{
"epoch": 2.915824915824916,
"grad_norm": 2.4800572395324707,
"learning_rate": 2.180716573569386e-08,
"loss": 0.0646,
"step": 540
},
{
"epoch": 2.9212121212121214,
"grad_norm": 2.528630495071411,
"learning_rate": 1.9168142895389376e-08,
"loss": 0.075,
"step": 541
},
{
"epoch": 2.9265993265993266,
"grad_norm": 2.408411741256714,
"learning_rate": 1.6698958436019986e-08,
"loss": 0.0717,
"step": 542
},
{
"epoch": 2.931986531986532,
"grad_norm": 2.5206246376037598,
"learning_rate": 1.4399696552816477e-08,
"loss": 0.088,
"step": 543
},
{
"epoch": 2.937373737373737,
"grad_norm": 2.5237998962402344,
"learning_rate": 1.2270435646922763e-08,
"loss": 0.0667,
"step": 544
},
{
"epoch": 2.942760942760943,
"grad_norm": 2.4456536769866943,
"learning_rate": 1.031124832272301e-08,
"loss": 0.0484,
"step": 545
},
{
"epoch": 2.948148148148148,
"grad_norm": 2.1876487731933594,
"learning_rate": 8.522201385362528e-09,
"loss": 0.0683,
"step": 546
},
{
"epoch": 2.9535353535353535,
"grad_norm": 2.1443562507629395,
"learning_rate": 6.903355838475123e-09,
"loss": 0.0688,
"step": 547
},
{
"epoch": 2.958922558922559,
"grad_norm": 2.5573830604553223,
"learning_rate": 5.454766882097007e-09,
"loss": 0.0497,
"step": 548
},
{
"epoch": 2.9643097643097645,
"grad_norm": 2.2723801136016846,
"learning_rate": 4.1764839107905074e-09,
"loss": 0.0777,
"step": 549
},
{
"epoch": 2.9696969696969697,
"grad_norm": 2.2643778324127197,
"learning_rate": 3.068550511955426e-09,
"loss": 0.0725,
"step": 550
},
{
"epoch": 2.975084175084175,
"grad_norm": 2.395113945007324,
"learning_rate": 2.131004464343556e-09,
"loss": 0.0715,
"step": 551
},
{
"epoch": 2.9804713804713803,
"grad_norm": 2.2361373901367188,
"learning_rate": 1.3638777367724898e-09,
"loss": 0.0843,
"step": 552
},
{
"epoch": 2.985858585858586,
"grad_norm": 2.993990182876587,
"learning_rate": 7.671964870337168e-10,
"loss": 0.0732,
"step": 553
},
{
"epoch": 2.9912457912457913,
"grad_norm": 2.1068203449249268,
"learning_rate": 3.4098106100166616e-10,
"loss": 0.0671,
"step": 554
},
{
"epoch": 2.9966329966329965,
"grad_norm": 2.407553195953369,
"learning_rate": 8.52459919381543e-11,
"loss": 0.0774,
"step": 555
},
{
"epoch": 2.9966329966329965,
"step": 555,
"total_flos": 9.477952550322831e+17,
"train_loss": 0.30530234318193017,
"train_runtime": 3941.2467,
"train_samples_per_second": 4.521,
"train_steps_per_second": 0.141
}
],
"logging_steps": 1.0,
"max_steps": 555,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.477952550322831e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}