{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 615, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008146639511201629, "grad_norm": 0.4904627755196518, "learning_rate": 0.0, "loss": 0.0296, "num_tokens": 468319.0, "step": 1 }, { "epoch": 0.016293279022403257, "grad_norm": 0.44639106202217055, "learning_rate": 5.263157894736843e-07, "loss": 0.0278, "num_tokens": 931744.0, "step": 2 }, { "epoch": 0.024439918533604887, "grad_norm": 0.5235906052705608, "learning_rate": 1.0526315789473685e-06, "loss": 0.0346, "num_tokens": 1382492.0, "step": 3 }, { "epoch": 0.032586558044806514, "grad_norm": 0.48829378532794426, "learning_rate": 1.5789473684210526e-06, "loss": 0.0298, "num_tokens": 1822837.0, "step": 4 }, { "epoch": 0.04073319755600815, "grad_norm": 0.46192310755459265, "learning_rate": 2.105263157894737e-06, "loss": 0.0282, "num_tokens": 2324341.0, "step": 5 }, { "epoch": 0.048879837067209775, "grad_norm": 0.42590423154372875, "learning_rate": 2.631578947368421e-06, "loss": 0.0259, "num_tokens": 2786402.0, "step": 6 }, { "epoch": 0.05702647657841141, "grad_norm": 0.3780878258784539, "learning_rate": 3.157894736842105e-06, "loss": 0.0257, "num_tokens": 3249490.0, "step": 7 }, { "epoch": 0.06517311608961303, "grad_norm": 0.3621520375009199, "learning_rate": 3.6842105263157896e-06, "loss": 0.0244, "num_tokens": 3691588.0, "step": 8 }, { "epoch": 0.07331975560081466, "grad_norm": 0.34785014992590463, "learning_rate": 4.210526315789474e-06, "loss": 0.0243, "num_tokens": 4145266.0, "step": 9 }, { "epoch": 0.0814663951120163, "grad_norm": 0.4866514034362246, "learning_rate": 4.736842105263158e-06, "loss": 0.0283, "num_tokens": 4589804.0, "step": 10 }, { "epoch": 0.08961303462321792, "grad_norm": 0.6175459481284201, "learning_rate": 5.263157894736842e-06, "loss": 0.0314, "num_tokens": 5028241.0, "step": 11 }, { "epoch": 0.09775967413441955, "grad_norm": 0.6496287922379511, "learning_rate": 5.789473684210527e-06, "loss": 0.0345, "num_tokens": 5485301.0, "step": 12 }, { "epoch": 0.10590631364562118, "grad_norm": 0.6359630764639106, "learning_rate": 6.31578947368421e-06, "loss": 0.0325, "num_tokens": 5919605.0, "step": 13 }, { "epoch": 0.11405295315682282, "grad_norm": 0.5165849518665486, "learning_rate": 6.842105263157896e-06, "loss": 0.027, "num_tokens": 6397578.0, "step": 14 }, { "epoch": 0.12219959266802444, "grad_norm": 0.5201523052382824, "learning_rate": 7.368421052631579e-06, "loss": 0.034, "num_tokens": 6859276.0, "step": 15 }, { "epoch": 0.13034623217922606, "grad_norm": 0.46155792727423955, "learning_rate": 7.894736842105265e-06, "loss": 0.0301, "num_tokens": 7339093.0, "step": 16 }, { "epoch": 0.1384928716904277, "grad_norm": 0.42614925852564395, "learning_rate": 8.421052631578948e-06, "loss": 0.03, "num_tokens": 7829130.0, "step": 17 }, { "epoch": 0.14663951120162932, "grad_norm": 0.4234114672689651, "learning_rate": 8.947368421052632e-06, "loss": 0.0318, "num_tokens": 8284433.0, "step": 18 }, { "epoch": 0.15478615071283094, "grad_norm": 0.4251114371201452, "learning_rate": 9.473684210526315e-06, "loss": 0.0303, "num_tokens": 8720891.0, "step": 19 }, { "epoch": 0.1629327902240326, "grad_norm": 0.40955870197241373, "learning_rate": 1e-05, "loss": 0.0297, "num_tokens": 9262899.0, "step": 20 }, { "epoch": 0.1710794297352342, "grad_norm": 0.4087963920152161, "learning_rate": 9.999937484351817e-06, "loss": 0.0306, "num_tokens": 9716643.0, "step": 21 }, { "epoch": 0.17922606924643583, "grad_norm": 0.3693398530680141, "learning_rate": 9.999749939144244e-06, "loss": 0.0298, "num_tokens": 10180183.0, "step": 22 }, { "epoch": 0.18737270875763748, "grad_norm": 0.41806848178827605, "learning_rate": 9.99943736958818e-06, "loss": 0.0321, "num_tokens": 10631870.0, "step": 23 }, { "epoch": 0.1955193482688391, "grad_norm": 0.3809861049653307, "learning_rate": 9.998999784368282e-06, "loss": 0.0309, "num_tokens": 11100352.0, "step": 24 }, { "epoch": 0.20366598778004075, "grad_norm": 0.39880645680339244, "learning_rate": 9.99843719564274e-06, "loss": 0.0352, "num_tokens": 11561399.0, "step": 25 }, { "epoch": 0.21181262729124237, "grad_norm": 0.35747673914837313, "learning_rate": 9.997749619042932e-06, "loss": 0.0302, "num_tokens": 12052978.0, "step": 26 }, { "epoch": 0.219959266802444, "grad_norm": 0.3646974156800663, "learning_rate": 9.996937073672988e-06, "loss": 0.0326, "num_tokens": 12510505.0, "step": 27 }, { "epoch": 0.22810590631364563, "grad_norm": 0.37781095340314147, "learning_rate": 9.995999582109266e-06, "loss": 0.0329, "num_tokens": 12972726.0, "step": 28 }, { "epoch": 0.23625254582484725, "grad_norm": 0.32601449306874525, "learning_rate": 9.994937170399715e-06, "loss": 0.0338, "num_tokens": 13415015.0, "step": 29 }, { "epoch": 0.24439918533604887, "grad_norm": 0.3124719848123855, "learning_rate": 9.993749868063162e-06, "loss": 0.0321, "num_tokens": 13862924.0, "step": 30 }, { "epoch": 0.2525458248472505, "grad_norm": 0.33624412669801873, "learning_rate": 9.992437708088487e-06, "loss": 0.0343, "num_tokens": 14336744.0, "step": 31 }, { "epoch": 0.2606924643584521, "grad_norm": 0.2822892406548645, "learning_rate": 9.991000726933702e-06, "loss": 0.0317, "num_tokens": 14787461.0, "step": 32 }, { "epoch": 0.26883910386965376, "grad_norm": 0.3141509965138737, "learning_rate": 9.989438964524943e-06, "loss": 0.0348, "num_tokens": 15260166.0, "step": 33 }, { "epoch": 0.2769857433808554, "grad_norm": 0.28289872142233236, "learning_rate": 9.987752464255365e-06, "loss": 0.0328, "num_tokens": 15716455.0, "step": 34 }, { "epoch": 0.285132382892057, "grad_norm": 0.2988023355786341, "learning_rate": 9.98594127298392e-06, "loss": 0.0327, "num_tokens": 16208303.0, "step": 35 }, { "epoch": 0.29327902240325865, "grad_norm": 0.2827377533678326, "learning_rate": 9.984005441034079e-06, "loss": 0.0316, "num_tokens": 16661734.0, "step": 36 }, { "epoch": 0.3014256619144603, "grad_norm": 0.3277891610784485, "learning_rate": 9.981945022192412e-06, "loss": 0.0363, "num_tokens": 17117668.0, "step": 37 }, { "epoch": 0.3095723014256619, "grad_norm": 0.29741031623668746, "learning_rate": 9.979760073707106e-06, "loss": 0.0322, "num_tokens": 17568922.0, "step": 38 }, { "epoch": 0.31771894093686354, "grad_norm": 0.28575184625841077, "learning_rate": 9.977450656286371e-06, "loss": 0.0317, "num_tokens": 18032936.0, "step": 39 }, { "epoch": 0.3258655804480652, "grad_norm": 0.28459652313484274, "learning_rate": 9.97501683409675e-06, "loss": 0.0334, "num_tokens": 18462483.0, "step": 40 }, { "epoch": 0.3340122199592668, "grad_norm": 0.2853320257498706, "learning_rate": 9.972458674761347e-06, "loss": 0.0325, "num_tokens": 18918154.0, "step": 41 }, { "epoch": 0.3421588594704684, "grad_norm": 0.31245376505929573, "learning_rate": 9.96977624935793e-06, "loss": 0.0356, "num_tokens": 19392456.0, "step": 42 }, { "epoch": 0.35030549898167007, "grad_norm": 0.29339121491288905, "learning_rate": 9.96696963241697e-06, "loss": 0.0358, "num_tokens": 19864410.0, "step": 43 }, { "epoch": 0.35845213849287166, "grad_norm": 0.308001575808903, "learning_rate": 9.964038901919573e-06, "loss": 0.0344, "num_tokens": 20325616.0, "step": 44 }, { "epoch": 0.3665987780040733, "grad_norm": 0.29777121599268264, "learning_rate": 9.9609841392953e-06, "loss": 0.0361, "num_tokens": 20754956.0, "step": 45 }, { "epoch": 0.37474541751527496, "grad_norm": 0.27446985734348617, "learning_rate": 9.95780542941991e-06, "loss": 0.0367, "num_tokens": 21197697.0, "step": 46 }, { "epoch": 0.38289205702647655, "grad_norm": 0.2723208448567585, "learning_rate": 9.954502860613011e-06, "loss": 0.0355, "num_tokens": 21644714.0, "step": 47 }, { "epoch": 0.3910386965376782, "grad_norm": 0.34829072831093, "learning_rate": 9.951076524635593e-06, "loss": 0.0343, "num_tokens": 22094029.0, "step": 48 }, { "epoch": 0.39918533604887985, "grad_norm": 0.2633667374393046, "learning_rate": 9.947526516687484e-06, "loss": 0.0342, "num_tokens": 22577438.0, "step": 49 }, { "epoch": 0.4073319755600815, "grad_norm": 0.2781504189612014, "learning_rate": 9.943852935404706e-06, "loss": 0.0356, "num_tokens": 23046436.0, "step": 50 }, { "epoch": 0.4154786150712831, "grad_norm": 0.29581469873784194, "learning_rate": 9.940055882856734e-06, "loss": 0.038, "num_tokens": 23498243.0, "step": 51 }, { "epoch": 0.42362525458248473, "grad_norm": 0.2656899667965322, "learning_rate": 9.936135464543652e-06, "loss": 0.0347, "num_tokens": 23972330.0, "step": 52 }, { "epoch": 0.4317718940936864, "grad_norm": 0.2543418233162407, "learning_rate": 9.93209178939324e-06, "loss": 0.0341, "num_tokens": 24453685.0, "step": 53 }, { "epoch": 0.439918533604888, "grad_norm": 0.25163009959008703, "learning_rate": 9.927924969757926e-06, "loss": 0.034, "num_tokens": 24926242.0, "step": 54 }, { "epoch": 0.4480651731160896, "grad_norm": 0.2530048416696052, "learning_rate": 9.923635121411683e-06, "loss": 0.0341, "num_tokens": 25365241.0, "step": 55 }, { "epoch": 0.45621181262729127, "grad_norm": 0.2591530319599859, "learning_rate": 9.919222363546797e-06, "loss": 0.0353, "num_tokens": 25833971.0, "step": 56 }, { "epoch": 0.46435845213849286, "grad_norm": 0.23005642120058867, "learning_rate": 9.914686818770567e-06, "loss": 0.0328, "num_tokens": 26279628.0, "step": 57 }, { "epoch": 0.4725050916496945, "grad_norm": 0.2612401425726277, "learning_rate": 9.910028613101888e-06, "loss": 0.0343, "num_tokens": 26734776.0, "step": 58 }, { "epoch": 0.48065173116089616, "grad_norm": 0.25501336518012946, "learning_rate": 9.905247875967764e-06, "loss": 0.035, "num_tokens": 27206001.0, "step": 59 }, { "epoch": 0.48879837067209775, "grad_norm": 0.25907516477795234, "learning_rate": 9.900344740199691e-06, "loss": 0.0342, "num_tokens": 27647448.0, "step": 60 }, { "epoch": 0.4969450101832994, "grad_norm": 0.2627756492187737, "learning_rate": 9.895319342029992e-06, "loss": 0.0352, "num_tokens": 28116087.0, "step": 61 }, { "epoch": 0.505091649694501, "grad_norm": 0.2520744974011735, "learning_rate": 9.890171821088006e-06, "loss": 0.034, "num_tokens": 28556029.0, "step": 62 }, { "epoch": 0.5132382892057027, "grad_norm": 0.25566988242695377, "learning_rate": 9.884902320396228e-06, "loss": 0.0345, "num_tokens": 29003546.0, "step": 63 }, { "epoch": 0.5213849287169042, "grad_norm": 0.26761657061201327, "learning_rate": 9.879510986366321e-06, "loss": 0.0386, "num_tokens": 29464833.0, "step": 64 }, { "epoch": 0.5295315682281059, "grad_norm": 0.25151679573138824, "learning_rate": 9.873997968795066e-06, "loss": 0.0361, "num_tokens": 29908906.0, "step": 65 }, { "epoch": 0.5376782077393075, "grad_norm": 0.25192725491977325, "learning_rate": 9.868363420860176e-06, "loss": 0.0363, "num_tokens": 30339618.0, "step": 66 }, { "epoch": 0.5458248472505092, "grad_norm": 0.2558097074022343, "learning_rate": 9.86260749911606e-06, "loss": 0.0359, "num_tokens": 30798302.0, "step": 67 }, { "epoch": 0.5539714867617108, "grad_norm": 0.23903896250926235, "learning_rate": 9.856730363489465e-06, "loss": 0.0321, "num_tokens": 31270382.0, "step": 68 }, { "epoch": 0.5621181262729125, "grad_norm": 0.23678636099022307, "learning_rate": 9.85073217727503e-06, "loss": 0.0332, "num_tokens": 31743990.0, "step": 69 }, { "epoch": 0.570264765784114, "grad_norm": 0.2718182538363666, "learning_rate": 9.844613107130758e-06, "loss": 0.0377, "num_tokens": 32188589.0, "step": 70 }, { "epoch": 0.5784114052953157, "grad_norm": 0.2447806090135222, "learning_rate": 9.838373323073376e-06, "loss": 0.0335, "num_tokens": 32654341.0, "step": 71 }, { "epoch": 0.5865580448065173, "grad_norm": 0.24791886655928558, "learning_rate": 9.832012998473612e-06, "loss": 0.0357, "num_tokens": 33133443.0, "step": 72 }, { "epoch": 0.594704684317719, "grad_norm": 0.2602111918495323, "learning_rate": 9.825532310051383e-06, "loss": 0.0369, "num_tokens": 33600590.0, "step": 73 }, { "epoch": 0.6028513238289206, "grad_norm": 0.23958876317959238, "learning_rate": 9.818931437870888e-06, "loss": 0.0347, "num_tokens": 34081907.0, "step": 74 }, { "epoch": 0.6109979633401222, "grad_norm": 0.24528240723597736, "learning_rate": 9.812210565335591e-06, "loss": 0.0347, "num_tokens": 34528542.0, "step": 75 }, { "epoch": 0.6191446028513238, "grad_norm": 0.2511113811601625, "learning_rate": 9.805369879183143e-06, "loss": 0.0358, "num_tokens": 34984490.0, "step": 76 }, { "epoch": 0.6272912423625254, "grad_norm": 0.23964291648975655, "learning_rate": 9.798409569480171e-06, "loss": 0.0368, "num_tokens": 35438413.0, "step": 77 }, { "epoch": 0.6354378818737271, "grad_norm": 0.22854430928208863, "learning_rate": 9.791329829617025e-06, "loss": 0.0329, "num_tokens": 35861862.0, "step": 78 }, { "epoch": 0.6435845213849287, "grad_norm": 0.25934229180134305, "learning_rate": 9.784130856302383e-06, "loss": 0.0352, "num_tokens": 36334726.0, "step": 79 }, { "epoch": 0.6517311608961304, "grad_norm": 0.249853867356781, "learning_rate": 9.77681284955779e-06, "loss": 0.0334, "num_tokens": 36806966.0, "step": 80 }, { "epoch": 0.659877800407332, "grad_norm": 0.24228111972158922, "learning_rate": 9.769376012712107e-06, "loss": 0.0355, "num_tokens": 37255978.0, "step": 81 }, { "epoch": 0.6680244399185336, "grad_norm": 0.24656941383849604, "learning_rate": 9.761820552395857e-06, "loss": 0.0372, "num_tokens": 37695349.0, "step": 82 }, { "epoch": 0.6761710794297352, "grad_norm": 0.24557463844035055, "learning_rate": 9.754146678535483e-06, "loss": 0.0364, "num_tokens": 38137196.0, "step": 83 }, { "epoch": 0.6843177189409368, "grad_norm": 0.25045832824836683, "learning_rate": 9.74635460434752e-06, "loss": 0.036, "num_tokens": 38601156.0, "step": 84 }, { "epoch": 0.6924643584521385, "grad_norm": 0.23961222253413397, "learning_rate": 9.738444546332663e-06, "loss": 0.0348, "num_tokens": 39098917.0, "step": 85 }, { "epoch": 0.7006109979633401, "grad_norm": 0.21623543203559747, "learning_rate": 9.73041672426976e-06, "loss": 0.0313, "num_tokens": 39589476.0, "step": 86 }, { "epoch": 0.7087576374745418, "grad_norm": 0.2454384444263673, "learning_rate": 9.722271361209698e-06, "loss": 0.035, "num_tokens": 40040757.0, "step": 87 }, { "epoch": 0.7169042769857433, "grad_norm": 0.2514790044121715, "learning_rate": 9.714008683469212e-06, "loss": 0.035, "num_tokens": 40503981.0, "step": 88 }, { "epoch": 0.725050916496945, "grad_norm": 0.2574428715510541, "learning_rate": 9.705628920624592e-06, "loss": 0.0365, "num_tokens": 40969365.0, "step": 89 }, { "epoch": 0.7331975560081466, "grad_norm": 0.25017040048121353, "learning_rate": 9.69713230550531e-06, "loss": 0.0349, "num_tokens": 41427533.0, "step": 90 }, { "epoch": 0.7413441955193483, "grad_norm": 0.2526246003424556, "learning_rate": 9.68851907418754e-06, "loss": 0.0385, "num_tokens": 41894302.0, "step": 91 }, { "epoch": 0.7494908350305499, "grad_norm": 0.2461082056251613, "learning_rate": 9.679789465987614e-06, "loss": 0.0357, "num_tokens": 42349463.0, "step": 92 }, { "epoch": 0.7576374745417516, "grad_norm": 0.2617726018040813, "learning_rate": 9.67094372345536e-06, "loss": 0.0389, "num_tokens": 42774515.0, "step": 93 }, { "epoch": 0.7657841140529531, "grad_norm": 0.24705231631404728, "learning_rate": 9.661982092367366e-06, "loss": 0.036, "num_tokens": 43230624.0, "step": 94 }, { "epoch": 0.7739307535641547, "grad_norm": 0.235679439367168, "learning_rate": 9.652904821720158e-06, "loss": 0.0365, "num_tokens": 43672523.0, "step": 95 }, { "epoch": 0.7820773930753564, "grad_norm": 0.2510768490849978, "learning_rate": 9.643712163723271e-06, "loss": 0.0377, "num_tokens": 44158995.0, "step": 96 }, { "epoch": 0.790224032586558, "grad_norm": 0.2533074838565773, "learning_rate": 9.63440437379225e-06, "loss": 0.0376, "num_tokens": 44636347.0, "step": 97 }, { "epoch": 0.7983706720977597, "grad_norm": 0.23715260979777855, "learning_rate": 9.624981710541548e-06, "loss": 0.0356, "num_tokens": 45086574.0, "step": 98 }, { "epoch": 0.8065173116089613, "grad_norm": 0.23369067636824356, "learning_rate": 9.615444435777343e-06, "loss": 0.0357, "num_tokens": 45541713.0, "step": 99 }, { "epoch": 0.814663951120163, "grad_norm": 0.22571635640078413, "learning_rate": 9.605792814490263e-06, "loss": 0.0348, "num_tokens": 46007566.0, "step": 100 }, { "epoch": 0.8228105906313645, "grad_norm": 0.23077275204681233, "learning_rate": 9.596027114848025e-06, "loss": 0.0345, "num_tokens": 46477746.0, "step": 101 }, { "epoch": 0.8309572301425662, "grad_norm": 0.22566558819394333, "learning_rate": 9.58614760818798e-06, "loss": 0.0338, "num_tokens": 46929999.0, "step": 102 }, { "epoch": 0.8391038696537678, "grad_norm": 0.21695625400644095, "learning_rate": 9.57615456900958e-06, "loss": 0.0347, "num_tokens": 47395766.0, "step": 103 }, { "epoch": 0.8472505091649695, "grad_norm": 0.2620473147070263, "learning_rate": 9.566048274966745e-06, "loss": 0.0383, "num_tokens": 47845971.0, "step": 104 }, { "epoch": 0.8553971486761711, "grad_norm": 0.2410799135804227, "learning_rate": 9.55582900686015e-06, "loss": 0.0365, "num_tokens": 48287919.0, "step": 105 }, { "epoch": 0.8635437881873728, "grad_norm": 0.2396885428184001, "learning_rate": 9.545497048629427e-06, "loss": 0.0348, "num_tokens": 48749479.0, "step": 106 }, { "epoch": 0.8716904276985743, "grad_norm": 0.24688227687368633, "learning_rate": 9.535052687345273e-06, "loss": 0.0387, "num_tokens": 49192411.0, "step": 107 }, { "epoch": 0.879837067209776, "grad_norm": 0.24294158661413467, "learning_rate": 9.524496213201473e-06, "loss": 0.0378, "num_tokens": 49653484.0, "step": 108 }, { "epoch": 0.8879837067209776, "grad_norm": 0.2405101629778957, "learning_rate": 9.513827919506835e-06, "loss": 0.0363, "num_tokens": 50112406.0, "step": 109 }, { "epoch": 0.8961303462321792, "grad_norm": 0.23181354337095814, "learning_rate": 9.503048102677048e-06, "loss": 0.0349, "num_tokens": 50574830.0, "step": 110 }, { "epoch": 0.9042769857433809, "grad_norm": 0.23382747597194983, "learning_rate": 9.492157062226438e-06, "loss": 0.0341, "num_tokens": 51043765.0, "step": 111 }, { "epoch": 0.9124236252545825, "grad_norm": 0.22729966362083456, "learning_rate": 9.481155100759651e-06, "loss": 0.0345, "num_tokens": 51491061.0, "step": 112 }, { "epoch": 0.9205702647657841, "grad_norm": 0.24513175538240015, "learning_rate": 9.470042523963243e-06, "loss": 0.039, "num_tokens": 51927088.0, "step": 113 }, { "epoch": 0.9287169042769857, "grad_norm": 0.24880865741998745, "learning_rate": 9.458819640597193e-06, "loss": 0.0379, "num_tokens": 52372997.0, "step": 114 }, { "epoch": 0.9368635437881874, "grad_norm": 0.2220343898509789, "learning_rate": 9.447486762486307e-06, "loss": 0.034, "num_tokens": 52812484.0, "step": 115 }, { "epoch": 0.945010183299389, "grad_norm": 0.22431667653715365, "learning_rate": 9.436044204511575e-06, "loss": 0.0346, "num_tokens": 53269746.0, "step": 116 }, { "epoch": 0.9531568228105907, "grad_norm": 0.2278604942336719, "learning_rate": 9.42449228460141e-06, "loss": 0.0364, "num_tokens": 53715464.0, "step": 117 }, { "epoch": 0.9613034623217923, "grad_norm": 0.2233927678176066, "learning_rate": 9.412831323722813e-06, "loss": 0.0354, "num_tokens": 54163779.0, "step": 118 }, { "epoch": 0.9694501018329938, "grad_norm": 0.2258095576824266, "learning_rate": 9.401061645872469e-06, "loss": 0.0356, "num_tokens": 54622927.0, "step": 119 }, { "epoch": 0.9775967413441955, "grad_norm": 0.21843742724066828, "learning_rate": 9.389183578067725e-06, "loss": 0.0332, "num_tokens": 55117094.0, "step": 120 }, { "epoch": 0.9857433808553971, "grad_norm": 0.23443623658924626, "learning_rate": 9.37719745033752e-06, "loss": 0.0372, "num_tokens": 55571058.0, "step": 121 }, { "epoch": 0.9938900203665988, "grad_norm": 0.22544218766750995, "learning_rate": 9.365103595713206e-06, "loss": 0.0347, "num_tokens": 56023909.0, "step": 122 }, { "epoch": 1.0, "grad_norm": 0.23694297938614514, "learning_rate": 9.352902350219298e-06, "loss": 0.0325, "num_tokens": 56069607.0, "step": 123 }, { "epoch": 1.0, "eval_loss": 0.041923802345991135, "eval_num_tokens": 56069607.0, "eval_runtime": 59.8807, "eval_samples_per_second": 40.731, "eval_steps_per_second": 5.093, "step": 123 }, { "epoch": 1.0081466395112015, "grad_norm": 0.18235571925323477, "learning_rate": 9.34059405286414e-06, "loss": 0.0242, "num_tokens": 56508815.0, "step": 124 }, { "epoch": 1.0162932790224033, "grad_norm": 0.19439769536061022, "learning_rate": 9.32817904563048e-06, "loss": 0.0234, "num_tokens": 56965411.0, "step": 125 }, { "epoch": 1.0244399185336048, "grad_norm": 0.1736558218986549, "learning_rate": 9.315657673465978e-06, "loss": 0.0225, "num_tokens": 57414294.0, "step": 126 }, { "epoch": 1.0325865580448066, "grad_norm": 0.19113275019426793, "learning_rate": 9.303030284273606e-06, "loss": 0.0225, "num_tokens": 57877954.0, "step": 127 }, { "epoch": 1.0407331975560081, "grad_norm": 0.19036081030160895, "learning_rate": 9.290297228901994e-06, "loss": 0.022, "num_tokens": 58325030.0, "step": 128 }, { "epoch": 1.0488798370672097, "grad_norm": 0.1984639840701536, "learning_rate": 9.277458861135684e-06, "loss": 0.0219, "num_tokens": 58808552.0, "step": 129 }, { "epoch": 1.0570264765784114, "grad_norm": 0.2043532515942055, "learning_rate": 9.264515537685289e-06, "loss": 0.0217, "num_tokens": 59306149.0, "step": 130 }, { "epoch": 1.065173116089613, "grad_norm": 0.24055798224631966, "learning_rate": 9.251467618177588e-06, "loss": 0.0238, "num_tokens": 59747428.0, "step": 131 }, { "epoch": 1.0733197556008147, "grad_norm": 0.1995629506004207, "learning_rate": 9.238315465145536e-06, "loss": 0.0204, "num_tokens": 60204616.0, "step": 132 }, { "epoch": 1.0814663951120163, "grad_norm": 0.2525900691277178, "learning_rate": 9.225059444018185e-06, "loss": 0.0239, "num_tokens": 60656969.0, "step": 133 }, { "epoch": 1.089613034623218, "grad_norm": 0.2318763327290573, "learning_rate": 9.21169992311054e-06, "loss": 0.0218, "num_tokens": 61138427.0, "step": 134 }, { "epoch": 1.0977596741344195, "grad_norm": 0.24997426536385803, "learning_rate": 9.198237273613311e-06, "loss": 0.0249, "num_tokens": 61577876.0, "step": 135 }, { "epoch": 1.105906313645621, "grad_norm": 0.2271197177471986, "learning_rate": 9.184671869582617e-06, "loss": 0.0229, "num_tokens": 62045028.0, "step": 136 }, { "epoch": 1.1140529531568228, "grad_norm": 0.22400406233634754, "learning_rate": 9.17100408792958e-06, "loss": 0.0207, "num_tokens": 62486192.0, "step": 137 }, { "epoch": 1.1221995926680244, "grad_norm": 0.23845965068678432, "learning_rate": 9.157234308409859e-06, "loss": 0.0225, "num_tokens": 62956027.0, "step": 138 }, { "epoch": 1.1303462321792261, "grad_norm": 0.2106619550266949, "learning_rate": 9.14336291361309e-06, "loss": 0.0213, "num_tokens": 63414690.0, "step": 139 }, { "epoch": 1.1384928716904277, "grad_norm": 0.2187838784331104, "learning_rate": 9.129390288952273e-06, "loss": 0.0228, "num_tokens": 63863726.0, "step": 140 }, { "epoch": 1.1466395112016294, "grad_norm": 0.19858994365963545, "learning_rate": 9.115316822653043e-06, "loss": 0.0203, "num_tokens": 64339457.0, "step": 141 }, { "epoch": 1.154786150712831, "grad_norm": 0.2091640165384244, "learning_rate": 9.101142905742898e-06, "loss": 0.0224, "num_tokens": 64797748.0, "step": 142 }, { "epoch": 1.1629327902240325, "grad_norm": 0.21848028557367125, "learning_rate": 9.086868932040327e-06, "loss": 0.0237, "num_tokens": 65261816.0, "step": 143 }, { "epoch": 1.1710794297352343, "grad_norm": 0.21024580943693202, "learning_rate": 9.072495298143876e-06, "loss": 0.0222, "num_tokens": 65710096.0, "step": 144 }, { "epoch": 1.1792260692464358, "grad_norm": 0.21112539693299767, "learning_rate": 9.058022403421112e-06, "loss": 0.0234, "num_tokens": 66180522.0, "step": 145 }, { "epoch": 1.1873727087576376, "grad_norm": 0.20089428024021022, "learning_rate": 9.043450649997546e-06, "loss": 0.0221, "num_tokens": 66643220.0, "step": 146 }, { "epoch": 1.195519348268839, "grad_norm": 0.21269472349968574, "learning_rate": 9.028780442745452e-06, "loss": 0.0236, "num_tokens": 67103696.0, "step": 147 }, { "epoch": 1.2036659877800409, "grad_norm": 0.19919608780198533, "learning_rate": 9.014012189272612e-06, "loss": 0.0215, "num_tokens": 67578752.0, "step": 148 }, { "epoch": 1.2118126272912424, "grad_norm": 0.19516667942695456, "learning_rate": 8.999146299911001e-06, "loss": 0.0226, "num_tokens": 68024730.0, "step": 149 }, { "epoch": 1.219959266802444, "grad_norm": 0.21431471881204775, "learning_rate": 8.984183187705376e-06, "loss": 0.0237, "num_tokens": 68513599.0, "step": 150 }, { "epoch": 1.2281059063136457, "grad_norm": 0.19529826679401555, "learning_rate": 8.969123268401807e-06, "loss": 0.0207, "num_tokens": 68988237.0, "step": 151 }, { "epoch": 1.2362525458248472, "grad_norm": 0.2014146714986417, "learning_rate": 8.953966960436125e-06, "loss": 0.0231, "num_tokens": 69430574.0, "step": 152 }, { "epoch": 1.2443991853360488, "grad_norm": 0.21239498172005217, "learning_rate": 8.938714684922294e-06, "loss": 0.0233, "num_tokens": 69884264.0, "step": 153 }, { "epoch": 1.2525458248472505, "grad_norm": 0.213004792751643, "learning_rate": 8.923366865640708e-06, "loss": 0.0237, "num_tokens": 70361322.0, "step": 154 }, { "epoch": 1.260692464358452, "grad_norm": 0.21940926870270266, "learning_rate": 8.90792392902642e-06, "loss": 0.0224, "num_tokens": 70825081.0, "step": 155 }, { "epoch": 1.2688391038696538, "grad_norm": 0.21496750501528322, "learning_rate": 8.892386304157297e-06, "loss": 0.0221, "num_tokens": 71283936.0, "step": 156 }, { "epoch": 1.2769857433808554, "grad_norm": 0.21230254367904663, "learning_rate": 8.876754422742084e-06, "loss": 0.0246, "num_tokens": 71732305.0, "step": 157 }, { "epoch": 1.2851323828920571, "grad_norm": 0.2188832765541447, "learning_rate": 8.861028719108431e-06, "loss": 0.0224, "num_tokens": 72199220.0, "step": 158 }, { "epoch": 1.2932790224032586, "grad_norm": 0.215744449219536, "learning_rate": 8.845209630190804e-06, "loss": 0.0232, "num_tokens": 72686777.0, "step": 159 }, { "epoch": 1.3014256619144602, "grad_norm": 0.2027419921199597, "learning_rate": 8.829297595518357e-06, "loss": 0.0217, "num_tokens": 73141281.0, "step": 160 }, { "epoch": 1.309572301425662, "grad_norm": 0.1999503892860215, "learning_rate": 8.81329305720272e-06, "loss": 0.0236, "num_tokens": 73601661.0, "step": 161 }, { "epoch": 1.3177189409368635, "grad_norm": 0.17428881801329021, "learning_rate": 8.797196459925707e-06, "loss": 0.0205, "num_tokens": 74058631.0, "step": 162 }, { "epoch": 1.3258655804480652, "grad_norm": 0.18566703149612335, "learning_rate": 8.78100825092697e-06, "loss": 0.0218, "num_tokens": 74506287.0, "step": 163 }, { "epoch": 1.3340122199592668, "grad_norm": 0.21095321978269194, "learning_rate": 8.764728879991563e-06, "loss": 0.0233, "num_tokens": 74961649.0, "step": 164 }, { "epoch": 1.3421588594704685, "grad_norm": 0.19297708933381486, "learning_rate": 8.748358799437454e-06, "loss": 0.0218, "num_tokens": 75451492.0, "step": 165 }, { "epoch": 1.35030549898167, "grad_norm": 0.21567434563638074, "learning_rate": 8.731898464102955e-06, "loss": 0.0233, "num_tokens": 75906898.0, "step": 166 }, { "epoch": 1.3584521384928716, "grad_norm": 0.20493170237350014, "learning_rate": 8.715348331334079e-06, "loss": 0.0225, "num_tokens": 76352518.0, "step": 167 }, { "epoch": 1.3665987780040734, "grad_norm": 0.20033609244286213, "learning_rate": 8.698708860971837e-06, "loss": 0.0217, "num_tokens": 76833416.0, "step": 168 }, { "epoch": 1.374745417515275, "grad_norm": 0.19488013729722037, "learning_rate": 8.681980515339464e-06, "loss": 0.0228, "num_tokens": 77274089.0, "step": 169 }, { "epoch": 1.3828920570264764, "grad_norm": 0.23159956916525645, "learning_rate": 8.66516375922957e-06, "loss": 0.026, "num_tokens": 77722945.0, "step": 170 }, { "epoch": 1.3910386965376782, "grad_norm": 0.20950731710653633, "learning_rate": 8.648259059891222e-06, "loss": 0.0238, "num_tokens": 78165800.0, "step": 171 }, { "epoch": 1.39918533604888, "grad_norm": 0.20384280771126798, "learning_rate": 8.631266887016973e-06, "loss": 0.0234, "num_tokens": 78606210.0, "step": 172 }, { "epoch": 1.4073319755600815, "grad_norm": 0.2174107340618266, "learning_rate": 8.614187712729801e-06, "loss": 0.025, "num_tokens": 79049600.0, "step": 173 }, { "epoch": 1.415478615071283, "grad_norm": 0.21419711356536544, "learning_rate": 8.597022011569993e-06, "loss": 0.0236, "num_tokens": 79519022.0, "step": 174 }, { "epoch": 1.4236252545824848, "grad_norm": 0.19839711735747953, "learning_rate": 8.579770260481967e-06, "loss": 0.0233, "num_tokens": 79988589.0, "step": 175 }, { "epoch": 1.4317718940936863, "grad_norm": 0.20080459279151233, "learning_rate": 8.56243293880101e-06, "loss": 0.0222, "num_tokens": 80468185.0, "step": 176 }, { "epoch": 1.4399185336048879, "grad_norm": 0.1934313717220866, "learning_rate": 8.545010528239969e-06, "loss": 0.0236, "num_tokens": 80915153.0, "step": 177 }, { "epoch": 1.4480651731160896, "grad_norm": 0.20353533737845392, "learning_rate": 8.527503512875862e-06, "loss": 0.023, "num_tokens": 81406072.0, "step": 178 }, { "epoch": 1.4562118126272914, "grad_norm": 0.1913760568401795, "learning_rate": 8.509912379136429e-06, "loss": 0.0213, "num_tokens": 81861174.0, "step": 179 }, { "epoch": 1.464358452138493, "grad_norm": 0.18760725003184955, "learning_rate": 8.492237615786613e-06, "loss": 0.0232, "num_tokens": 82291515.0, "step": 180 }, { "epoch": 1.4725050916496945, "grad_norm": 0.2073497810013695, "learning_rate": 8.474479713914985e-06, "loss": 0.0241, "num_tokens": 82746207.0, "step": 181 }, { "epoch": 1.4806517311608962, "grad_norm": 0.20252547578412558, "learning_rate": 8.456639166920104e-06, "loss": 0.023, "num_tokens": 83217896.0, "step": 182 }, { "epoch": 1.4887983706720977, "grad_norm": 0.19100843841767107, "learning_rate": 8.438716470496793e-06, "loss": 0.0234, "num_tokens": 83673415.0, "step": 183 }, { "epoch": 1.4969450101832993, "grad_norm": 0.18768759409970084, "learning_rate": 8.42071212262238e-06, "loss": 0.0217, "num_tokens": 84165622.0, "step": 184 }, { "epoch": 1.505091649694501, "grad_norm": 0.21146877851004245, "learning_rate": 8.402626623542853e-06, "loss": 0.0236, "num_tokens": 84623691.0, "step": 185 }, { "epoch": 1.5132382892057028, "grad_norm": 0.209209778931465, "learning_rate": 8.384460475758967e-06, "loss": 0.0244, "num_tokens": 85066604.0, "step": 186 }, { "epoch": 1.5213849287169041, "grad_norm": 0.20794230796465518, "learning_rate": 8.36621418401228e-06, "loss": 0.0245, "num_tokens": 85500800.0, "step": 187 }, { "epoch": 1.5295315682281059, "grad_norm": 0.19401787928805586, "learning_rate": 8.347888255271126e-06, "loss": 0.0227, "num_tokens": 85950718.0, "step": 188 }, { "epoch": 1.5376782077393076, "grad_norm": 0.19358587269712685, "learning_rate": 8.329483198716536e-06, "loss": 0.0216, "num_tokens": 86425214.0, "step": 189 }, { "epoch": 1.5458248472505092, "grad_norm": 0.19988901116993596, "learning_rate": 8.310999525728083e-06, "loss": 0.0237, "num_tokens": 86872612.0, "step": 190 }, { "epoch": 1.5539714867617107, "grad_norm": 0.21347868715899784, "learning_rate": 8.292437749869676e-06, "loss": 0.0237, "num_tokens": 87321247.0, "step": 191 }, { "epoch": 1.5621181262729125, "grad_norm": 0.21370368402938023, "learning_rate": 8.273798386875292e-06, "loss": 0.0247, "num_tokens": 87762936.0, "step": 192 }, { "epoch": 1.570264765784114, "grad_norm": 0.20394116229065584, "learning_rate": 8.255081954634646e-06, "loss": 0.0224, "num_tokens": 88233384.0, "step": 193 }, { "epoch": 1.5784114052953155, "grad_norm": 0.21271701085924696, "learning_rate": 8.236288973178806e-06, "loss": 0.024, "num_tokens": 88702888.0, "step": 194 }, { "epoch": 1.5865580448065173, "grad_norm": 0.20525261813526166, "learning_rate": 8.217419964665728e-06, "loss": 0.0228, "num_tokens": 89157902.0, "step": 195 }, { "epoch": 1.594704684317719, "grad_norm": 0.20518578666067122, "learning_rate": 8.198475453365772e-06, "loss": 0.0239, "num_tokens": 89596892.0, "step": 196 }, { "epoch": 1.6028513238289206, "grad_norm": 0.20424504177429212, "learning_rate": 8.179455965647117e-06, "loss": 0.024, "num_tokens": 90043689.0, "step": 197 }, { "epoch": 1.6109979633401221, "grad_norm": 0.21550795243608867, "learning_rate": 8.16036202996114e-06, "loss": 0.0255, "num_tokens": 90493255.0, "step": 198 }, { "epoch": 1.6191446028513239, "grad_norm": 0.21505565048112654, "learning_rate": 8.141194176827738e-06, "loss": 0.0246, "num_tokens": 90933700.0, "step": 199 }, { "epoch": 1.6272912423625254, "grad_norm": 0.18057787414765422, "learning_rate": 8.12195293882058e-06, "loss": 0.0205, "num_tokens": 91402906.0, "step": 200 }, { "epoch": 1.635437881873727, "grad_norm": 0.2128757560225609, "learning_rate": 8.102638850552323e-06, "loss": 0.0236, "num_tokens": 91854715.0, "step": 201 }, { "epoch": 1.6435845213849287, "grad_norm": 0.1977918829414763, "learning_rate": 8.083252448659742e-06, "loss": 0.022, "num_tokens": 92317914.0, "step": 202 }, { "epoch": 1.6517311608961305, "grad_norm": 0.22533293918121253, "learning_rate": 8.063794271788826e-06, "loss": 0.0256, "num_tokens": 92775730.0, "step": 203 }, { "epoch": 1.659877800407332, "grad_norm": 0.21714220334981602, "learning_rate": 8.044264860579816e-06, "loss": 0.0255, "num_tokens": 93222261.0, "step": 204 }, { "epoch": 1.6680244399185336, "grad_norm": 0.2061132581627763, "learning_rate": 8.02466475765218e-06, "loss": 0.0229, "num_tokens": 93713195.0, "step": 205 }, { "epoch": 1.6761710794297353, "grad_norm": 0.21189634915409705, "learning_rate": 8.004994507589532e-06, "loss": 0.0244, "num_tokens": 94167787.0, "step": 206 }, { "epoch": 1.6843177189409368, "grad_norm": 0.2049250916068622, "learning_rate": 7.985254656924512e-06, "loss": 0.0227, "num_tokens": 94634140.0, "step": 207 }, { "epoch": 1.6924643584521384, "grad_norm": 0.21854416704059987, "learning_rate": 7.965445754123592e-06, "loss": 0.0252, "num_tokens": 95093967.0, "step": 208 }, { "epoch": 1.7006109979633401, "grad_norm": 0.20098034036974133, "learning_rate": 7.945568349571834e-06, "loss": 0.0233, "num_tokens": 95578447.0, "step": 209 }, { "epoch": 1.708757637474542, "grad_norm": 0.19707920391781453, "learning_rate": 7.925622995557609e-06, "loss": 0.0234, "num_tokens": 96028708.0, "step": 210 }, { "epoch": 1.7169042769857432, "grad_norm": 0.183646916505621, "learning_rate": 7.905610246257243e-06, "loss": 0.0219, "num_tokens": 96490579.0, "step": 211 }, { "epoch": 1.725050916496945, "grad_norm": 0.21800938479643353, "learning_rate": 7.885530657719623e-06, "loss": 0.0245, "num_tokens": 96939215.0, "step": 212 }, { "epoch": 1.7331975560081467, "grad_norm": 0.17805921063304794, "learning_rate": 7.865384787850742e-06, "loss": 0.0207, "num_tokens": 97416826.0, "step": 213 }, { "epoch": 1.7413441955193483, "grad_norm": 0.20335070394293855, "learning_rate": 7.845173196398213e-06, "loss": 0.023, "num_tokens": 97870409.0, "step": 214 }, { "epoch": 1.7494908350305498, "grad_norm": 0.2014363721260783, "learning_rate": 7.824896444935692e-06, "loss": 0.023, "num_tokens": 98303923.0, "step": 215 }, { "epoch": 1.7576374745417516, "grad_norm": 0.19767917831916373, "learning_rate": 7.804555096847298e-06, "loss": 0.0206, "num_tokens": 98792735.0, "step": 216 }, { "epoch": 1.765784114052953, "grad_norm": 0.18927709030960627, "learning_rate": 7.784149717311947e-06, "loss": 0.0228, "num_tokens": 99283099.0, "step": 217 }, { "epoch": 1.7739307535641546, "grad_norm": 0.19540533688345146, "learning_rate": 7.763680873287648e-06, "loss": 0.0224, "num_tokens": 99728623.0, "step": 218 }, { "epoch": 1.7820773930753564, "grad_norm": 0.2021434762578394, "learning_rate": 7.743149133495763e-06, "loss": 0.0226, "num_tokens": 100217105.0, "step": 219 }, { "epoch": 1.7902240325865582, "grad_norm": 0.20319556075451253, "learning_rate": 7.722555068405186e-06, "loss": 0.024, "num_tokens": 100658986.0, "step": 220 }, { "epoch": 1.7983706720977597, "grad_norm": 0.2037408366987311, "learning_rate": 7.70189925021651e-06, "loss": 0.0243, "num_tokens": 101137134.0, "step": 221 }, { "epoch": 1.8065173116089612, "grad_norm": 0.21058268386430223, "learning_rate": 7.681182252846115e-06, "loss": 0.0241, "num_tokens": 101594654.0, "step": 222 }, { "epoch": 1.814663951120163, "grad_norm": 0.20499883443387898, "learning_rate": 7.660404651910236e-06, "loss": 0.0263, "num_tokens": 102027887.0, "step": 223 }, { "epoch": 1.8228105906313645, "grad_norm": 0.2084551925346071, "learning_rate": 7.639567024708953e-06, "loss": 0.0234, "num_tokens": 102479243.0, "step": 224 }, { "epoch": 1.830957230142566, "grad_norm": 0.21438521035457928, "learning_rate": 7.6186699502101676e-06, "loss": 0.0226, "num_tokens": 102944020.0, "step": 225 }, { "epoch": 1.8391038696537678, "grad_norm": 0.20743883238353383, "learning_rate": 7.597714009033505e-06, "loss": 0.0243, "num_tokens": 103377204.0, "step": 226 }, { "epoch": 1.8472505091649696, "grad_norm": 0.19590114337198036, "learning_rate": 7.5766997834341836e-06, "loss": 0.0229, "num_tokens": 103836520.0, "step": 227 }, { "epoch": 1.8553971486761711, "grad_norm": 0.2072497473244054, "learning_rate": 7.555627857286843e-06, "loss": 0.0247, "num_tokens": 104285481.0, "step": 228 }, { "epoch": 1.8635437881873727, "grad_norm": 0.18899125629327573, "learning_rate": 7.534498816069315e-06, "loss": 0.0213, "num_tokens": 104746152.0, "step": 229 }, { "epoch": 1.8716904276985744, "grad_norm": 0.21687392806104466, "learning_rate": 7.513313246846357e-06, "loss": 0.0232, "num_tokens": 105207211.0, "step": 230 }, { "epoch": 1.879837067209776, "grad_norm": 0.20114168053955322, "learning_rate": 7.492071738253343e-06, "loss": 0.0243, "num_tokens": 105657445.0, "step": 231 }, { "epoch": 1.8879837067209775, "grad_norm": 0.31880562870408674, "learning_rate": 7.470774880479909e-06, "loss": 0.0216, "num_tokens": 106145000.0, "step": 232 }, { "epoch": 1.8961303462321792, "grad_norm": 0.17709039062644658, "learning_rate": 7.449423265253551e-06, "loss": 0.0195, "num_tokens": 106619177.0, "step": 233 }, { "epoch": 1.904276985743381, "grad_norm": 0.1941234160393901, "learning_rate": 7.428017485823189e-06, "loss": 0.0221, "num_tokens": 107100389.0, "step": 234 }, { "epoch": 1.9124236252545825, "grad_norm": 0.21047496416728861, "learning_rate": 7.406558136942677e-06, "loss": 0.0253, "num_tokens": 107531535.0, "step": 235 }, { "epoch": 1.920570264765784, "grad_norm": 0.1811130030622756, "learning_rate": 7.3850458148542835e-06, "loss": 0.0218, "num_tokens": 108000369.0, "step": 236 }, { "epoch": 1.9287169042769858, "grad_norm": 0.18791035767087905, "learning_rate": 7.363481117272125e-06, "loss": 0.0217, "num_tokens": 108465611.0, "step": 237 }, { "epoch": 1.9368635437881874, "grad_norm": 0.174382304685201, "learning_rate": 7.341864643365557e-06, "loss": 0.0214, "num_tokens": 108923767.0, "step": 238 }, { "epoch": 1.945010183299389, "grad_norm": 0.1996921946422325, "learning_rate": 7.320196993742522e-06, "loss": 0.023, "num_tokens": 109367680.0, "step": 239 }, { "epoch": 1.9531568228105907, "grad_norm": 0.21146568012414002, "learning_rate": 7.29847877043287e-06, "loss": 0.0231, "num_tokens": 109818455.0, "step": 240 }, { "epoch": 1.9613034623217924, "grad_norm": 0.20624057045002148, "learning_rate": 7.2767105768716295e-06, "loss": 0.024, "num_tokens": 110268348.0, "step": 241 }, { "epoch": 1.9694501018329937, "grad_norm": 0.17960428006685406, "learning_rate": 7.254893017882233e-06, "loss": 0.0222, "num_tokens": 110696800.0, "step": 242 }, { "epoch": 1.9775967413441955, "grad_norm": 0.18718314902352962, "learning_rate": 7.233026699659723e-06, "loss": 0.0226, "num_tokens": 111154475.0, "step": 243 }, { "epoch": 1.9857433808553973, "grad_norm": 0.18787650373147796, "learning_rate": 7.211112229753901e-06, "loss": 0.0213, "num_tokens": 111620815.0, "step": 244 }, { "epoch": 1.9938900203665988, "grad_norm": 0.19693361518983973, "learning_rate": 7.189150217052455e-06, "loss": 0.0216, "num_tokens": 112092986.0, "step": 245 }, { "epoch": 2.0, "grad_norm": 0.2093105825813619, "learning_rate": 7.1671412717640295e-06, "loss": 0.0201, "num_tokens": 112131036.0, "step": 246 }, { "epoch": 2.0, "eval_loss": 0.04159076511859894, "eval_num_tokens": 112131036.0, "eval_runtime": 57.7607, "eval_samples_per_second": 42.226, "eval_steps_per_second": 5.28, "step": 246 }, { "epoch": 2.0081466395112018, "grad_norm": 0.12416538079579213, "learning_rate": 7.145086005401287e-06, "loss": 0.0126, "num_tokens": 112602682.0, "step": 247 }, { "epoch": 2.016293279022403, "grad_norm": 0.15057303383190754, "learning_rate": 7.122985030763901e-06, "loss": 0.0145, "num_tokens": 113073432.0, "step": 248 }, { "epoch": 2.024439918533605, "grad_norm": 0.14759632900226355, "learning_rate": 7.10083896192154e-06, "loss": 0.0124, "num_tokens": 113577827.0, "step": 249 }, { "epoch": 2.0325865580448066, "grad_norm": 0.14133368502923574, "learning_rate": 7.078648414196805e-06, "loss": 0.0128, "num_tokens": 114048831.0, "step": 250 }, { "epoch": 2.0407331975560083, "grad_norm": 0.15715348160815634, "learning_rate": 7.056414004148128e-06, "loss": 0.0136, "num_tokens": 114548364.0, "step": 251 }, { "epoch": 2.0488798370672097, "grad_norm": 0.17716027065421572, "learning_rate": 7.034136349552647e-06, "loss": 0.016, "num_tokens": 114999500.0, "step": 252 }, { "epoch": 2.0570264765784114, "grad_norm": 0.1589991111261928, "learning_rate": 7.011816069389034e-06, "loss": 0.0145, "num_tokens": 115456071.0, "step": 253 }, { "epoch": 2.065173116089613, "grad_norm": 0.1543372807006171, "learning_rate": 6.989453783820304e-06, "loss": 0.0134, "num_tokens": 115926758.0, "step": 254 }, { "epoch": 2.0733197556008145, "grad_norm": 0.1691364992847739, "learning_rate": 6.9670501141765825e-06, "loss": 0.014, "num_tokens": 116385952.0, "step": 255 }, { "epoch": 2.0814663951120163, "grad_norm": 0.16602983431455004, "learning_rate": 6.944605682937834e-06, "loss": 0.0137, "num_tokens": 116820035.0, "step": 256 }, { "epoch": 2.089613034623218, "grad_norm": 0.18962015294617535, "learning_rate": 6.92212111371658e-06, "loss": 0.0143, "num_tokens": 117297850.0, "step": 257 }, { "epoch": 2.0977596741344193, "grad_norm": 0.17122221487492462, "learning_rate": 6.8995970312405615e-06, "loss": 0.0126, "num_tokens": 117759960.0, "step": 258 }, { "epoch": 2.105906313645621, "grad_norm": 0.17247701616442646, "learning_rate": 6.877034061335384e-06, "loss": 0.0139, "num_tokens": 118229929.0, "step": 259 }, { "epoch": 2.114052953156823, "grad_norm": 0.18706937438179935, "learning_rate": 6.854432830907135e-06, "loss": 0.0151, "num_tokens": 118689637.0, "step": 260 }, { "epoch": 2.1221995926680246, "grad_norm": 0.1756410658036281, "learning_rate": 6.831793967924953e-06, "loss": 0.0136, "num_tokens": 119159530.0, "step": 261 }, { "epoch": 2.130346232179226, "grad_norm": 0.17325809294266983, "learning_rate": 6.8091181014035935e-06, "loss": 0.014, "num_tokens": 119598302.0, "step": 262 }, { "epoch": 2.1384928716904277, "grad_norm": 0.1831164025049776, "learning_rate": 6.7864058613859395e-06, "loss": 0.0138, "num_tokens": 120108425.0, "step": 263 }, { "epoch": 2.1466395112016294, "grad_norm": 0.18048260933108903, "learning_rate": 6.763657878925508e-06, "loss": 0.015, "num_tokens": 120578186.0, "step": 264 }, { "epoch": 2.1547861507128308, "grad_norm": 0.16327229194519086, "learning_rate": 6.740874786068906e-06, "loss": 0.0126, "num_tokens": 121055989.0, "step": 265 }, { "epoch": 2.1629327902240325, "grad_norm": 0.16958620370157418, "learning_rate": 6.718057215838274e-06, "loss": 0.0144, "num_tokens": 121502528.0, "step": 266 }, { "epoch": 2.1710794297352343, "grad_norm": 0.1792898292473741, "learning_rate": 6.695205802213699e-06, "loss": 0.0136, "num_tokens": 121956627.0, "step": 267 }, { "epoch": 2.179226069246436, "grad_norm": 0.15481947737459167, "learning_rate": 6.672321180115595e-06, "loss": 0.0125, "num_tokens": 122426773.0, "step": 268 }, { "epoch": 2.1873727087576373, "grad_norm": 0.16707758315087737, "learning_rate": 6.6494039853870676e-06, "loss": 0.0132, "num_tokens": 122875336.0, "step": 269 }, { "epoch": 2.195519348268839, "grad_norm": 0.16476693800658634, "learning_rate": 6.6264548547762395e-06, "loss": 0.0138, "num_tokens": 123320079.0, "step": 270 }, { "epoch": 2.203665987780041, "grad_norm": 0.16291392396662507, "learning_rate": 6.603474425918573e-06, "loss": 0.0136, "num_tokens": 123791870.0, "step": 271 }, { "epoch": 2.211812627291242, "grad_norm": 0.1703687751088918, "learning_rate": 6.580463337319128e-06, "loss": 0.0133, "num_tokens": 124260736.0, "step": 272 }, { "epoch": 2.219959266802444, "grad_norm": 0.17901374374090187, "learning_rate": 6.557422228334852e-06, "loss": 0.0159, "num_tokens": 124681807.0, "step": 273 }, { "epoch": 2.2281059063136457, "grad_norm": 0.16798711219930113, "learning_rate": 6.534351739156797e-06, "loss": 0.0142, "num_tokens": 125127429.0, "step": 274 }, { "epoch": 2.2362525458248474, "grad_norm": 0.16305596345258705, "learning_rate": 6.5112525107923296e-06, "loss": 0.0135, "num_tokens": 125567336.0, "step": 275 }, { "epoch": 2.2443991853360488, "grad_norm": 0.17643316822000632, "learning_rate": 6.488125185047334e-06, "loss": 0.0147, "num_tokens": 126021900.0, "step": 276 }, { "epoch": 2.2525458248472505, "grad_norm": 0.167997975045288, "learning_rate": 6.464970404508369e-06, "loss": 0.0139, "num_tokens": 126491133.0, "step": 277 }, { "epoch": 2.2606924643584523, "grad_norm": 0.1808990629197575, "learning_rate": 6.4417888125248195e-06, "loss": 0.0153, "num_tokens": 126949660.0, "step": 278 }, { "epoch": 2.2688391038696536, "grad_norm": 0.18179273918150798, "learning_rate": 6.418581053191017e-06, "loss": 0.0155, "num_tokens": 127395046.0, "step": 279 }, { "epoch": 2.2769857433808554, "grad_norm": 0.16186916571289603, "learning_rate": 6.39534777132835e-06, "loss": 0.0141, "num_tokens": 127879266.0, "step": 280 }, { "epoch": 2.285132382892057, "grad_norm": 0.1687611769820901, "learning_rate": 6.3720896124673356e-06, "loss": 0.0142, "num_tokens": 128345971.0, "step": 281 }, { "epoch": 2.293279022403259, "grad_norm": 0.18415607421229815, "learning_rate": 6.348807222829704e-06, "loss": 0.0155, "num_tokens": 128804402.0, "step": 282 }, { "epoch": 2.30142566191446, "grad_norm": 0.16514691991418554, "learning_rate": 6.325501249310416e-06, "loss": 0.0146, "num_tokens": 129261613.0, "step": 283 }, { "epoch": 2.309572301425662, "grad_norm": 0.16769380960540944, "learning_rate": 6.302172339459717e-06, "loss": 0.0136, "num_tokens": 129748258.0, "step": 284 }, { "epoch": 2.3177189409368637, "grad_norm": 0.17542238238137692, "learning_rate": 6.278821141465126e-06, "loss": 0.0147, "num_tokens": 130203139.0, "step": 285 }, { "epoch": 2.325865580448065, "grad_norm": 0.1703028823912319, "learning_rate": 6.255448304133435e-06, "loss": 0.0144, "num_tokens": 130680052.0, "step": 286 }, { "epoch": 2.3340122199592668, "grad_norm": 0.15875518919149162, "learning_rate": 6.232054476872674e-06, "loss": 0.013, "num_tokens": 131145142.0, "step": 287 }, { "epoch": 2.3421588594704685, "grad_norm": 0.1559999046320083, "learning_rate": 6.208640309674081e-06, "loss": 0.0138, "num_tokens": 131606714.0, "step": 288 }, { "epoch": 2.35030549898167, "grad_norm": 0.16638792870478772, "learning_rate": 6.185206453094026e-06, "loss": 0.0133, "num_tokens": 132070874.0, "step": 289 }, { "epoch": 2.3584521384928716, "grad_norm": 0.16556273278032177, "learning_rate": 6.161753558235945e-06, "loss": 0.0144, "num_tokens": 132523899.0, "step": 290 }, { "epoch": 2.3665987780040734, "grad_norm": 0.1627153835397699, "learning_rate": 6.138282276732251e-06, "loss": 0.0141, "num_tokens": 132984150.0, "step": 291 }, { "epoch": 2.374745417515275, "grad_norm": 0.17420180567604815, "learning_rate": 6.1147932607262215e-06, "loss": 0.0153, "num_tokens": 133423004.0, "step": 292 }, { "epoch": 2.3828920570264764, "grad_norm": 0.1715872000392912, "learning_rate": 6.091287162853883e-06, "loss": 0.0143, "num_tokens": 133885515.0, "step": 293 }, { "epoch": 2.391038696537678, "grad_norm": 0.15875189010502294, "learning_rate": 6.067764636225881e-06, "loss": 0.0144, "num_tokens": 134334800.0, "step": 294 }, { "epoch": 2.39918533604888, "grad_norm": 0.14222500494759646, "learning_rate": 6.0442263344093224e-06, "loss": 0.0128, "num_tokens": 134794203.0, "step": 295 }, { "epoch": 2.4073319755600817, "grad_norm": 0.16587875726539164, "learning_rate": 6.020672911409626e-06, "loss": 0.014, "num_tokens": 135246488.0, "step": 296 }, { "epoch": 2.415478615071283, "grad_norm": 0.16945883731715217, "learning_rate": 5.997105021652355e-06, "loss": 0.0142, "num_tokens": 135734483.0, "step": 297 }, { "epoch": 2.423625254582485, "grad_norm": 0.17856893575225632, "learning_rate": 5.97352331996502e-06, "loss": 0.0149, "num_tokens": 136180989.0, "step": 298 }, { "epoch": 2.4317718940936865, "grad_norm": 0.16901738376392064, "learning_rate": 5.949928461558894e-06, "loss": 0.0145, "num_tokens": 136633463.0, "step": 299 }, { "epoch": 2.439918533604888, "grad_norm": 0.19440740998217734, "learning_rate": 5.926321102010808e-06, "loss": 0.0153, "num_tokens": 137065466.0, "step": 300 }, { "epoch": 2.4480651731160896, "grad_norm": 0.17396280168075312, "learning_rate": 5.902701897244932e-06, "loss": 0.014, "num_tokens": 137519052.0, "step": 301 }, { "epoch": 2.4562118126272914, "grad_norm": 0.1962070967726784, "learning_rate": 5.879071503514555e-06, "loss": 0.0167, "num_tokens": 137969737.0, "step": 302 }, { "epoch": 2.4643584521384927, "grad_norm": 0.17287895679065615, "learning_rate": 5.855430577383842e-06, "loss": 0.0148, "num_tokens": 138433151.0, "step": 303 }, { "epoch": 2.4725050916496945, "grad_norm": 0.19400622443946244, "learning_rate": 5.831779775709606e-06, "loss": 0.0148, "num_tokens": 138875359.0, "step": 304 }, { "epoch": 2.480651731160896, "grad_norm": 0.16969104274852342, "learning_rate": 5.808119755623045e-06, "loss": 0.0141, "num_tokens": 139333435.0, "step": 305 }, { "epoch": 2.4887983706720975, "grad_norm": 0.17975044746142824, "learning_rate": 5.784451174511486e-06, "loss": 0.0155, "num_tokens": 139787251.0, "step": 306 }, { "epoch": 2.4969450101832993, "grad_norm": 0.18637909822915394, "learning_rate": 5.760774690000128e-06, "loss": 0.014, "num_tokens": 140263010.0, "step": 307 }, { "epoch": 2.505091649694501, "grad_norm": 0.1755752695664621, "learning_rate": 5.7370909599337585e-06, "loss": 0.0143, "num_tokens": 140730852.0, "step": 308 }, { "epoch": 2.513238289205703, "grad_norm": 0.17738520787824683, "learning_rate": 5.713400642358483e-06, "loss": 0.015, "num_tokens": 141160459.0, "step": 309 }, { "epoch": 2.521384928716904, "grad_norm": 0.17114933786372763, "learning_rate": 5.689704395503438e-06, "loss": 0.0137, "num_tokens": 141652980.0, "step": 310 }, { "epoch": 2.529531568228106, "grad_norm": 0.1702830061303869, "learning_rate": 5.666002877762506e-06, "loss": 0.0153, "num_tokens": 142092423.0, "step": 311 }, { "epoch": 2.5376782077393076, "grad_norm": 0.17360071510124675, "learning_rate": 5.642296747676016e-06, "loss": 0.0145, "num_tokens": 142533489.0, "step": 312 }, { "epoch": 2.5458248472505094, "grad_norm": 0.1607500590426996, "learning_rate": 5.618586663912452e-06, "loss": 0.0133, "num_tokens": 142991787.0, "step": 313 }, { "epoch": 2.5539714867617107, "grad_norm": 0.16048833714516317, "learning_rate": 5.594873285250151e-06, "loss": 0.0136, "num_tokens": 143468508.0, "step": 314 }, { "epoch": 2.5621181262729125, "grad_norm": 0.1838591156346174, "learning_rate": 5.571157270558995e-06, "loss": 0.0163, "num_tokens": 143916886.0, "step": 315 }, { "epoch": 2.5702647657841142, "grad_norm": 0.162005395980572, "learning_rate": 5.5474392787821096e-06, "loss": 0.0135, "num_tokens": 144388134.0, "step": 316 }, { "epoch": 2.5784114052953155, "grad_norm": 0.14863787001529957, "learning_rate": 5.52371996891755e-06, "loss": 0.0132, "num_tokens": 144871370.0, "step": 317 }, { "epoch": 2.5865580448065173, "grad_norm": 0.16754180761222826, "learning_rate": 5.500000000000001e-06, "loss": 0.0146, "num_tokens": 145320563.0, "step": 318 }, { "epoch": 2.594704684317719, "grad_norm": 0.18005757817722826, "learning_rate": 5.476280031082451e-06, "loss": 0.016, "num_tokens": 145758817.0, "step": 319 }, { "epoch": 2.6028513238289204, "grad_norm": 0.17034049730069928, "learning_rate": 5.452560721217892e-06, "loss": 0.0155, "num_tokens": 146189214.0, "step": 320 }, { "epoch": 2.610997963340122, "grad_norm": 0.15413011304140098, "learning_rate": 5.428842729441008e-06, "loss": 0.0144, "num_tokens": 146640888.0, "step": 321 }, { "epoch": 2.619144602851324, "grad_norm": 0.17242345415805765, "learning_rate": 5.405126714749852e-06, "loss": 0.0144, "num_tokens": 147089993.0, "step": 322 }, { "epoch": 2.627291242362525, "grad_norm": 0.15793761105384327, "learning_rate": 5.38141333608755e-06, "loss": 0.0137, "num_tokens": 147549085.0, "step": 323 }, { "epoch": 2.635437881873727, "grad_norm": 0.15260230173501832, "learning_rate": 5.357703252323985e-06, "loss": 0.0127, "num_tokens": 148018238.0, "step": 324 }, { "epoch": 2.6435845213849287, "grad_norm": 0.17616115019719872, "learning_rate": 5.333997122237497e-06, "loss": 0.0142, "num_tokens": 148467378.0, "step": 325 }, { "epoch": 2.6517311608961305, "grad_norm": 0.16869830739625263, "learning_rate": 5.310295604496563e-06, "loss": 0.0145, "num_tokens": 148924273.0, "step": 326 }, { "epoch": 2.6598778004073322, "grad_norm": 0.1516947132562575, "learning_rate": 5.286599357641519e-06, "loss": 0.0132, "num_tokens": 149394678.0, "step": 327 }, { "epoch": 2.6680244399185336, "grad_norm": 0.1644528806031863, "learning_rate": 5.262909040066243e-06, "loss": 0.0138, "num_tokens": 149841850.0, "step": 328 }, { "epoch": 2.6761710794297353, "grad_norm": 0.1958369178369615, "learning_rate": 5.239225309999875e-06, "loss": 0.0156, "num_tokens": 150282571.0, "step": 329 }, { "epoch": 2.684317718940937, "grad_norm": 0.18244134325631398, "learning_rate": 5.215548825488514e-06, "loss": 0.0148, "num_tokens": 150723879.0, "step": 330 }, { "epoch": 2.6924643584521384, "grad_norm": 0.16252807203895167, "learning_rate": 5.191880244376957e-06, "loss": 0.015, "num_tokens": 151164471.0, "step": 331 }, { "epoch": 2.70061099796334, "grad_norm": 0.16907582749071554, "learning_rate": 5.168220224290395e-06, "loss": 0.013, "num_tokens": 151627236.0, "step": 332 }, { "epoch": 2.708757637474542, "grad_norm": 0.1521247341456988, "learning_rate": 5.144569422616159e-06, "loss": 0.0128, "num_tokens": 152112152.0, "step": 333 }, { "epoch": 2.716904276985743, "grad_norm": 0.15854466142189536, "learning_rate": 5.120928496485448e-06, "loss": 0.013, "num_tokens": 152585932.0, "step": 334 }, { "epoch": 2.725050916496945, "grad_norm": 0.17382518355000084, "learning_rate": 5.097298102755069e-06, "loss": 0.0139, "num_tokens": 153055065.0, "step": 335 }, { "epoch": 2.7331975560081467, "grad_norm": 0.17769717946639274, "learning_rate": 5.073678897989194e-06, "loss": 0.0146, "num_tokens": 153518977.0, "step": 336 }, { "epoch": 2.741344195519348, "grad_norm": 0.16803081279999066, "learning_rate": 5.050071538441107e-06, "loss": 0.0138, "num_tokens": 153976769.0, "step": 337 }, { "epoch": 2.74949083503055, "grad_norm": 0.17280566271506004, "learning_rate": 5.026476680034983e-06, "loss": 0.0154, "num_tokens": 154408635.0, "step": 338 }, { "epoch": 2.7576374745417516, "grad_norm": 0.169314934778943, "learning_rate": 5.002894978347646e-06, "loss": 0.0145, "num_tokens": 154856201.0, "step": 339 }, { "epoch": 2.765784114052953, "grad_norm": 0.18005778973651862, "learning_rate": 4.979327088590375e-06, "loss": 0.0144, "num_tokens": 155310653.0, "step": 340 }, { "epoch": 2.7739307535641546, "grad_norm": 0.16841593789310932, "learning_rate": 4.95577366559068e-06, "loss": 0.0136, "num_tokens": 155788563.0, "step": 341 }, { "epoch": 2.7820773930753564, "grad_norm": 0.17699144806638442, "learning_rate": 4.932235363774121e-06, "loss": 0.015, "num_tokens": 156228468.0, "step": 342 }, { "epoch": 2.790224032586558, "grad_norm": 0.15791462498013234, "learning_rate": 4.908712837146118e-06, "loss": 0.014, "num_tokens": 156664176.0, "step": 343 }, { "epoch": 2.79837067209776, "grad_norm": 0.15227035101116576, "learning_rate": 4.88520673927378e-06, "loss": 0.0128, "num_tokens": 157134252.0, "step": 344 }, { "epoch": 2.8065173116089612, "grad_norm": 0.1594189404919013, "learning_rate": 4.861717723267752e-06, "loss": 0.0136, "num_tokens": 157599805.0, "step": 345 }, { "epoch": 2.814663951120163, "grad_norm": 0.15995395220658057, "learning_rate": 4.838246441764056e-06, "loss": 0.0134, "num_tokens": 158053673.0, "step": 346 }, { "epoch": 2.8228105906313647, "grad_norm": 0.18202172640131933, "learning_rate": 4.814793546905977e-06, "loss": 0.0157, "num_tokens": 158485241.0, "step": 347 }, { "epoch": 2.830957230142566, "grad_norm": 0.15339369785350124, "learning_rate": 4.791359690325921e-06, "loss": 0.0123, "num_tokens": 158947625.0, "step": 348 }, { "epoch": 2.839103869653768, "grad_norm": 0.16788836990713416, "learning_rate": 4.767945523127327e-06, "loss": 0.0137, "num_tokens": 159423146.0, "step": 349 }, { "epoch": 2.8472505091649696, "grad_norm": 0.16165924770039114, "learning_rate": 4.744551695866567e-06, "loss": 0.0148, "num_tokens": 159861129.0, "step": 350 }, { "epoch": 2.855397148676171, "grad_norm": 0.1774028782674121, "learning_rate": 4.721178858534876e-06, "loss": 0.0148, "num_tokens": 160329576.0, "step": 351 }, { "epoch": 2.8635437881873727, "grad_norm": 0.16285854302808034, "learning_rate": 4.697827660540285e-06, "loss": 0.014, "num_tokens": 160797840.0, "step": 352 }, { "epoch": 2.8716904276985744, "grad_norm": 0.18877633260447374, "learning_rate": 4.674498750689585e-06, "loss": 0.0147, "num_tokens": 161243065.0, "step": 353 }, { "epoch": 2.8798370672097757, "grad_norm": 0.1601336012550065, "learning_rate": 4.651192777170298e-06, "loss": 0.0143, "num_tokens": 161699619.0, "step": 354 }, { "epoch": 2.8879837067209775, "grad_norm": 0.16259855324262715, "learning_rate": 4.627910387532663e-06, "loss": 0.014, "num_tokens": 162166184.0, "step": 355 }, { "epoch": 2.8961303462321792, "grad_norm": 0.17610385289208558, "learning_rate": 4.604652228671653e-06, "loss": 0.0147, "num_tokens": 162610492.0, "step": 356 }, { "epoch": 2.904276985743381, "grad_norm": 0.1838783740248808, "learning_rate": 4.581418946808983e-06, "loss": 0.0153, "num_tokens": 163056383.0, "step": 357 }, { "epoch": 2.9124236252545828, "grad_norm": 0.15216837256215965, "learning_rate": 4.558211187475181e-06, "loss": 0.0123, "num_tokens": 163543282.0, "step": 358 }, { "epoch": 2.920570264765784, "grad_norm": 0.15127415963746377, "learning_rate": 4.535029595491632e-06, "loss": 0.0125, "num_tokens": 163999105.0, "step": 359 }, { "epoch": 2.928716904276986, "grad_norm": 0.17498160266417795, "learning_rate": 4.511874814952668e-06, "loss": 0.0136, "num_tokens": 164458000.0, "step": 360 }, { "epoch": 2.9368635437881876, "grad_norm": 0.16423843849992176, "learning_rate": 4.488747489207672e-06, "loss": 0.0133, "num_tokens": 164928642.0, "step": 361 }, { "epoch": 2.945010183299389, "grad_norm": 0.15573818980541582, "learning_rate": 4.4656482608432054e-06, "loss": 0.0123, "num_tokens": 165408976.0, "step": 362 }, { "epoch": 2.9531568228105907, "grad_norm": 0.17543688765013044, "learning_rate": 4.442577771665147e-06, "loss": 0.014, "num_tokens": 165886616.0, "step": 363 }, { "epoch": 2.9613034623217924, "grad_norm": 0.16511980145949437, "learning_rate": 4.419536662680873e-06, "loss": 0.0127, "num_tokens": 166343018.0, "step": 364 }, { "epoch": 2.9694501018329937, "grad_norm": 0.1587473778216488, "learning_rate": 4.39652557408143e-06, "loss": 0.0124, "num_tokens": 166801376.0, "step": 365 }, { "epoch": 2.9775967413441955, "grad_norm": 0.17009893552558653, "learning_rate": 4.373545145223761e-06, "loss": 0.0145, "num_tokens": 167280403.0, "step": 366 }, { "epoch": 2.9857433808553973, "grad_norm": 0.15750436171587542, "learning_rate": 4.350596014612935e-06, "loss": 0.0134, "num_tokens": 167726691.0, "step": 367 }, { "epoch": 2.9938900203665986, "grad_norm": 0.17197192413672613, "learning_rate": 4.327678819884405e-06, "loss": 0.0143, "num_tokens": 168173644.0, "step": 368 }, { "epoch": 3.0, "grad_norm": 0.19850738254699854, "learning_rate": 4.304794197786304e-06, "loss": 0.0141, "num_tokens": 168216390.0, "step": 369 }, { "epoch": 3.0, "eval_loss": 0.041459400206804276, "eval_num_tokens": 168216390.0, "eval_runtime": 58.1871, "eval_samples_per_second": 41.917, "eval_steps_per_second": 5.242, "step": 369 }, { "epoch": 3.0081466395112018, "grad_norm": 0.12585174258784562, "learning_rate": 4.281942784161728e-06, "loss": 0.0099, "num_tokens": 168660422.0, "step": 370 }, { "epoch": 3.016293279022403, "grad_norm": 0.11765413035696883, "learning_rate": 4.2591252139310945e-06, "loss": 0.0082, "num_tokens": 169121635.0, "step": 371 }, { "epoch": 3.024439918533605, "grad_norm": 0.11700242994990097, "learning_rate": 4.2363421210744925e-06, "loss": 0.0083, "num_tokens": 169588292.0, "step": 372 }, { "epoch": 3.0325865580448066, "grad_norm": 0.13410847188727293, "learning_rate": 4.213594138614062e-06, "loss": 0.0097, "num_tokens": 170048576.0, "step": 373 }, { "epoch": 3.0407331975560083, "grad_norm": 0.11184500956394558, "learning_rate": 4.190881898596409e-06, "loss": 0.0079, "num_tokens": 170553649.0, "step": 374 }, { "epoch": 3.0488798370672097, "grad_norm": 0.12083327220094565, "learning_rate": 4.168206032075048e-06, "loss": 0.0086, "num_tokens": 171011806.0, "step": 375 }, { "epoch": 3.0570264765784114, "grad_norm": 0.13145187085930216, "learning_rate": 4.1455671690928666e-06, "loss": 0.009, "num_tokens": 171488462.0, "step": 376 }, { "epoch": 3.065173116089613, "grad_norm": 0.13334793710473314, "learning_rate": 4.122965938664616e-06, "loss": 0.0086, "num_tokens": 171943130.0, "step": 377 }, { "epoch": 3.0733197556008145, "grad_norm": 0.1332625062123775, "learning_rate": 4.100402968759441e-06, "loss": 0.0093, "num_tokens": 172384061.0, "step": 378 }, { "epoch": 3.0814663951120163, "grad_norm": 0.13147800386811567, "learning_rate": 4.077878886283422e-06, "loss": 0.0085, "num_tokens": 172832702.0, "step": 379 }, { "epoch": 3.089613034623218, "grad_norm": 0.1411078689570707, "learning_rate": 4.055394317062168e-06, "loss": 0.0104, "num_tokens": 173290817.0, "step": 380 }, { "epoch": 3.0977596741344193, "grad_norm": 0.1284905098348191, "learning_rate": 4.03294988582342e-06, "loss": 0.0079, "num_tokens": 173766754.0, "step": 381 }, { "epoch": 3.105906313645621, "grad_norm": 0.13291783263584392, "learning_rate": 4.010546216179697e-06, "loss": 0.008, "num_tokens": 174227586.0, "step": 382 }, { "epoch": 3.114052953156823, "grad_norm": 0.13439803780962148, "learning_rate": 3.988183930610967e-06, "loss": 0.0084, "num_tokens": 174684443.0, "step": 383 }, { "epoch": 3.1221995926680246, "grad_norm": 0.1318097744846226, "learning_rate": 3.965863650447355e-06, "loss": 0.0081, "num_tokens": 175153040.0, "step": 384 }, { "epoch": 3.130346232179226, "grad_norm": 0.14505278918262016, "learning_rate": 3.943585995851872e-06, "loss": 0.0088, "num_tokens": 175616900.0, "step": 385 }, { "epoch": 3.1384928716904277, "grad_norm": 0.143736668078946, "learning_rate": 3.9213515858031984e-06, "loss": 0.0085, "num_tokens": 176098251.0, "step": 386 }, { "epoch": 3.1466395112016294, "grad_norm": 0.13749127082571724, "learning_rate": 3.8991610380784626e-06, "loss": 0.0076, "num_tokens": 176570672.0, "step": 387 }, { "epoch": 3.1547861507128308, "grad_norm": 0.15661494242610496, "learning_rate": 3.877014969236102e-06, "loss": 0.0101, "num_tokens": 177008465.0, "step": 388 }, { "epoch": 3.1629327902240325, "grad_norm": 0.15062683514898298, "learning_rate": 3.854913994598715e-06, "loss": 0.0089, "num_tokens": 177466175.0, "step": 389 }, { "epoch": 3.1710794297352343, "grad_norm": 0.1391922011105707, "learning_rate": 3.832858728235971e-06, "loss": 0.0093, "num_tokens": 177917874.0, "step": 390 }, { "epoch": 3.179226069246436, "grad_norm": 0.1552031660404893, "learning_rate": 3.8108497829475465e-06, "loss": 0.0105, "num_tokens": 178367628.0, "step": 391 }, { "epoch": 3.1873727087576373, "grad_norm": 0.13811754646428342, "learning_rate": 3.7888877702460992e-06, "loss": 0.0091, "num_tokens": 178825445.0, "step": 392 }, { "epoch": 3.195519348268839, "grad_norm": 0.12162345237220032, "learning_rate": 3.7669733003402775e-06, "loss": 0.0073, "num_tokens": 179301109.0, "step": 393 }, { "epoch": 3.203665987780041, "grad_norm": 0.13707719742366498, "learning_rate": 3.7451069821177677e-06, "loss": 0.0092, "num_tokens": 179757593.0, "step": 394 }, { "epoch": 3.211812627291242, "grad_norm": 0.13095735092161556, "learning_rate": 3.7232894231283724e-06, "loss": 0.0092, "num_tokens": 180213993.0, "step": 395 }, { "epoch": 3.219959266802444, "grad_norm": 0.13262472070811615, "learning_rate": 3.701521229567131e-06, "loss": 0.0085, "num_tokens": 180668901.0, "step": 396 }, { "epoch": 3.2281059063136457, "grad_norm": 0.13971045948367564, "learning_rate": 3.6798030062574807e-06, "loss": 0.0088, "num_tokens": 181137029.0, "step": 397 }, { "epoch": 3.2362525458248474, "grad_norm": 0.15719898296312626, "learning_rate": 3.6581353566344447e-06, "loss": 0.0091, "num_tokens": 181583795.0, "step": 398 }, { "epoch": 3.2443991853360488, "grad_norm": 0.13349745981088976, "learning_rate": 3.6365188827278752e-06, "loss": 0.0083, "num_tokens": 182040738.0, "step": 399 }, { "epoch": 3.2525458248472505, "grad_norm": 0.1507228385771512, "learning_rate": 3.6149541851457183e-06, "loss": 0.0093, "num_tokens": 182494412.0, "step": 400 }, { "epoch": 3.2606924643584523, "grad_norm": 0.13598098409095466, "learning_rate": 3.593441863057325e-06, "loss": 0.0092, "num_tokens": 182943146.0, "step": 401 }, { "epoch": 3.2688391038696536, "grad_norm": 0.13606743657097284, "learning_rate": 3.5719825141768128e-06, "loss": 0.0092, "num_tokens": 183393591.0, "step": 402 }, { "epoch": 3.2769857433808554, "grad_norm": 0.14156987679154379, "learning_rate": 3.5505767347464504e-06, "loss": 0.009, "num_tokens": 183862449.0, "step": 403 }, { "epoch": 3.285132382892057, "grad_norm": 0.13512553050700174, "learning_rate": 3.5292251195200932e-06, "loss": 0.0093, "num_tokens": 184305229.0, "step": 404 }, { "epoch": 3.293279022403259, "grad_norm": 0.11472791583197466, "learning_rate": 3.5079282617466594e-06, "loss": 0.0078, "num_tokens": 184802522.0, "step": 405 }, { "epoch": 3.30142566191446, "grad_norm": 0.12789474002800086, "learning_rate": 3.486686753153645e-06, "loss": 0.0083, "num_tokens": 185274960.0, "step": 406 }, { "epoch": 3.309572301425662, "grad_norm": 0.1275610588019882, "learning_rate": 3.4655011839306866e-06, "loss": 0.009, "num_tokens": 185709382.0, "step": 407 }, { "epoch": 3.3177189409368637, "grad_norm": 0.1404980269677411, "learning_rate": 3.4443721427131593e-06, "loss": 0.0095, "num_tokens": 186161144.0, "step": 408 }, { "epoch": 3.325865580448065, "grad_norm": 0.13529566839707055, "learning_rate": 3.423300216565819e-06, "loss": 0.0086, "num_tokens": 186619778.0, "step": 409 }, { "epoch": 3.3340122199592668, "grad_norm": 0.1387178170918977, "learning_rate": 3.4022859909664957e-06, "loss": 0.0098, "num_tokens": 187041856.0, "step": 410 }, { "epoch": 3.3421588594704685, "grad_norm": 0.13789162045155967, "learning_rate": 3.3813300497898326e-06, "loss": 0.0083, "num_tokens": 187505631.0, "step": 411 }, { "epoch": 3.35030549898167, "grad_norm": 0.137718313724877, "learning_rate": 3.3604329752910468e-06, "loss": 0.0095, "num_tokens": 187962839.0, "step": 412 }, { "epoch": 3.3584521384928716, "grad_norm": 0.13226236747300735, "learning_rate": 3.339595348089767e-06, "loss": 0.0095, "num_tokens": 188406846.0, "step": 413 }, { "epoch": 3.3665987780040734, "grad_norm": 0.13283015288873243, "learning_rate": 3.3188177471538864e-06, "loss": 0.0088, "num_tokens": 188859539.0, "step": 414 }, { "epoch": 3.374745417515275, "grad_norm": 0.13902664596528255, "learning_rate": 3.2981007497834922e-06, "loss": 0.0085, "num_tokens": 189323101.0, "step": 415 }, { "epoch": 3.3828920570264764, "grad_norm": 0.13509517554370873, "learning_rate": 3.2774449315948147e-06, "loss": 0.0089, "num_tokens": 189823493.0, "step": 416 }, { "epoch": 3.391038696537678, "grad_norm": 0.1366523338854662, "learning_rate": 3.2568508665042383e-06, "loss": 0.0084, "num_tokens": 190301541.0, "step": 417 }, { "epoch": 3.39918533604888, "grad_norm": 0.125577137562613, "learning_rate": 3.2363191267123517e-06, "loss": 0.0072, "num_tokens": 190798114.0, "step": 418 }, { "epoch": 3.4073319755600817, "grad_norm": 0.14591111241424826, "learning_rate": 3.215850282688055e-06, "loss": 0.0098, "num_tokens": 191261005.0, "step": 419 }, { "epoch": 3.415478615071283, "grad_norm": 0.12604467726858234, "learning_rate": 3.195444903152703e-06, "loss": 0.008, "num_tokens": 191709305.0, "step": 420 }, { "epoch": 3.423625254582485, "grad_norm": 0.13382954324399682, "learning_rate": 3.1751035550643107e-06, "loss": 0.0084, "num_tokens": 192209220.0, "step": 421 }, { "epoch": 3.4317718940936865, "grad_norm": 0.13698395980312603, "learning_rate": 3.1548268036017904e-06, "loss": 0.0091, "num_tokens": 192639412.0, "step": 422 }, { "epoch": 3.439918533604888, "grad_norm": 0.13829425626998468, "learning_rate": 3.134615212149258e-06, "loss": 0.0092, "num_tokens": 193098241.0, "step": 423 }, { "epoch": 3.4480651731160896, "grad_norm": 0.11711892810797479, "learning_rate": 3.114469342280379e-06, "loss": 0.0084, "num_tokens": 193574245.0, "step": 424 }, { "epoch": 3.4562118126272914, "grad_norm": 0.1309214084812048, "learning_rate": 3.094389753742758e-06, "loss": 0.0088, "num_tokens": 194017166.0, "step": 425 }, { "epoch": 3.4643584521384927, "grad_norm": 0.14133229462166405, "learning_rate": 3.0743770044423936e-06, "loss": 0.0093, "num_tokens": 194461022.0, "step": 426 }, { "epoch": 3.4725050916496945, "grad_norm": 0.12915594606644895, "learning_rate": 3.0544316504281677e-06, "loss": 0.0084, "num_tokens": 194921886.0, "step": 427 }, { "epoch": 3.480651731160896, "grad_norm": 0.13019588847393995, "learning_rate": 3.03455424587641e-06, "loss": 0.0082, "num_tokens": 195394552.0, "step": 428 }, { "epoch": 3.4887983706720975, "grad_norm": 0.12493252602627915, "learning_rate": 3.014745343075488e-06, "loss": 0.009, "num_tokens": 195853843.0, "step": 429 }, { "epoch": 3.4969450101832993, "grad_norm": 0.13292973796735513, "learning_rate": 2.995005492410469e-06, "loss": 0.0085, "num_tokens": 196316073.0, "step": 430 }, { "epoch": 3.505091649694501, "grad_norm": 0.15361936626468706, "learning_rate": 2.975335242347822e-06, "loss": 0.0097, "num_tokens": 196747650.0, "step": 431 }, { "epoch": 3.513238289205703, "grad_norm": 0.12126261520512835, "learning_rate": 2.9557351394201855e-06, "loss": 0.0078, "num_tokens": 197222644.0, "step": 432 }, { "epoch": 3.521384928716904, "grad_norm": 0.14364063312304898, "learning_rate": 2.9362057282111754e-06, "loss": 0.0084, "num_tokens": 197703977.0, "step": 433 }, { "epoch": 3.529531568228106, "grad_norm": 0.1285606277274214, "learning_rate": 2.9167475513402592e-06, "loss": 0.0085, "num_tokens": 198159184.0, "step": 434 }, { "epoch": 3.5376782077393076, "grad_norm": 0.12784246623295054, "learning_rate": 2.897361149447679e-06, "loss": 0.0086, "num_tokens": 198611287.0, "step": 435 }, { "epoch": 3.5458248472505094, "grad_norm": 0.1297694309800873, "learning_rate": 2.878047061179422e-06, "loss": 0.0082, "num_tokens": 199069757.0, "step": 436 }, { "epoch": 3.5539714867617107, "grad_norm": 0.1292114725276358, "learning_rate": 2.858805823172264e-06, "loss": 0.0088, "num_tokens": 199540737.0, "step": 437 }, { "epoch": 3.5621181262729125, "grad_norm": 0.12887249746822058, "learning_rate": 2.839637970038861e-06, "loss": 0.009, "num_tokens": 199982367.0, "step": 438 }, { "epoch": 3.5702647657841142, "grad_norm": 0.1280981502556342, "learning_rate": 2.8205440343528856e-06, "loss": 0.0089, "num_tokens": 200427445.0, "step": 439 }, { "epoch": 3.5784114052953155, "grad_norm": 0.13979378072527007, "learning_rate": 2.8015245466342287e-06, "loss": 0.0089, "num_tokens": 200889454.0, "step": 440 }, { "epoch": 3.5865580448065173, "grad_norm": 0.14089395360902868, "learning_rate": 2.7825800353342734e-06, "loss": 0.0089, "num_tokens": 201331340.0, "step": 441 }, { "epoch": 3.594704684317719, "grad_norm": 0.1380485690052255, "learning_rate": 2.763711026821196e-06, "loss": 0.0087, "num_tokens": 201788908.0, "step": 442 }, { "epoch": 3.6028513238289204, "grad_norm": 0.13663809301177426, "learning_rate": 2.7449180453653544e-06, "loss": 0.009, "num_tokens": 202225257.0, "step": 443 }, { "epoch": 3.610997963340122, "grad_norm": 0.1473692732003636, "learning_rate": 2.72620161312471e-06, "loss": 0.0085, "num_tokens": 202692568.0, "step": 444 }, { "epoch": 3.619144602851324, "grad_norm": 0.1257774235275037, "learning_rate": 2.7075622501303255e-06, "loss": 0.0093, "num_tokens": 203149741.0, "step": 445 }, { "epoch": 3.627291242362525, "grad_norm": 0.13888313118631118, "learning_rate": 2.689000474271918e-06, "loss": 0.008, "num_tokens": 203602311.0, "step": 446 }, { "epoch": 3.635437881873727, "grad_norm": 0.15749316142966002, "learning_rate": 2.670516801283464e-06, "loss": 0.0108, "num_tokens": 204036522.0, "step": 447 }, { "epoch": 3.6435845213849287, "grad_norm": 0.12907115857092855, "learning_rate": 2.652111744728876e-06, "loss": 0.0084, "num_tokens": 204486691.0, "step": 448 }, { "epoch": 3.6517311608961305, "grad_norm": 0.13596062968350994, "learning_rate": 2.6337858159877226e-06, "loss": 0.0081, "num_tokens": 204952023.0, "step": 449 }, { "epoch": 3.6598778004073322, "grad_norm": 0.13346166766765533, "learning_rate": 2.615539524241036e-06, "loss": 0.0081, "num_tokens": 205402274.0, "step": 450 }, { "epoch": 3.6680244399185336, "grad_norm": 0.12166137102621093, "learning_rate": 2.5973733764571486e-06, "loss": 0.0077, "num_tokens": 205859233.0, "step": 451 }, { "epoch": 3.6761710794297353, "grad_norm": 0.13150089757352357, "learning_rate": 2.5792878773776225e-06, "loss": 0.0086, "num_tokens": 206314665.0, "step": 452 }, { "epoch": 3.684317718940937, "grad_norm": 0.137357932504932, "learning_rate": 2.561283529503208e-06, "loss": 0.0094, "num_tokens": 206766146.0, "step": 453 }, { "epoch": 3.6924643584521384, "grad_norm": 0.13479268397128444, "learning_rate": 2.5433608330798974e-06, "loss": 0.0094, "num_tokens": 207200864.0, "step": 454 }, { "epoch": 3.70061099796334, "grad_norm": 0.11930317957334262, "learning_rate": 2.5255202860850157e-06, "loss": 0.0081, "num_tokens": 207685884.0, "step": 455 }, { "epoch": 3.708757637474542, "grad_norm": 0.13838840583020326, "learning_rate": 2.5077623842133895e-06, "loss": 0.0086, "num_tokens": 208130253.0, "step": 456 }, { "epoch": 3.716904276985743, "grad_norm": 0.11517621045103824, "learning_rate": 2.490087620863573e-06, "loss": 0.0069, "num_tokens": 208602611.0, "step": 457 }, { "epoch": 3.725050916496945, "grad_norm": 0.13060665615710568, "learning_rate": 2.4724964871241387e-06, "loss": 0.0091, "num_tokens": 209056174.0, "step": 458 }, { "epoch": 3.7331975560081467, "grad_norm": 0.12008096972228131, "learning_rate": 2.454989471760031e-06, "loss": 0.0081, "num_tokens": 209524636.0, "step": 459 }, { "epoch": 3.741344195519348, "grad_norm": 0.12402851148890304, "learning_rate": 2.437567061198991e-06, "loss": 0.0079, "num_tokens": 209994196.0, "step": 460 }, { "epoch": 3.74949083503055, "grad_norm": 0.1386116097689955, "learning_rate": 2.4202297395180353e-06, "loss": 0.0086, "num_tokens": 210466756.0, "step": 461 }, { "epoch": 3.7576374745417516, "grad_norm": 0.13027448435015335, "learning_rate": 2.4029779884300084e-06, "loss": 0.0075, "num_tokens": 210950806.0, "step": 462 }, { "epoch": 3.765784114052953, "grad_norm": 0.13406681887661104, "learning_rate": 2.3858122872702004e-06, "loss": 0.0085, "num_tokens": 211404708.0, "step": 463 }, { "epoch": 3.7739307535641546, "grad_norm": 0.12564076418855794, "learning_rate": 2.3687331129830276e-06, "loss": 0.0078, "num_tokens": 211866245.0, "step": 464 }, { "epoch": 3.7820773930753564, "grad_norm": 0.14390639481706682, "learning_rate": 2.3517409401087787e-06, "loss": 0.01, "num_tokens": 212305458.0, "step": 465 }, { "epoch": 3.790224032586558, "grad_norm": 0.12121296971812623, "learning_rate": 2.3348362407704313e-06, "loss": 0.0083, "num_tokens": 212752369.0, "step": 466 }, { "epoch": 3.79837067209776, "grad_norm": 0.13138683401901344, "learning_rate": 2.3180194846605367e-06, "loss": 0.0082, "num_tokens": 213222393.0, "step": 467 }, { "epoch": 3.8065173116089612, "grad_norm": 0.12175266765217344, "learning_rate": 2.301291139028164e-06, "loss": 0.0079, "num_tokens": 213681819.0, "step": 468 }, { "epoch": 3.814663951120163, "grad_norm": 0.1371049624563703, "learning_rate": 2.284651668665923e-06, "loss": 0.0086, "num_tokens": 214137525.0, "step": 469 }, { "epoch": 3.8228105906313647, "grad_norm": 0.12055815969853237, "learning_rate": 2.268101535897046e-06, "loss": 0.0083, "num_tokens": 214589391.0, "step": 470 }, { "epoch": 3.830957230142566, "grad_norm": 0.14290097179006628, "learning_rate": 2.2516412005625465e-06, "loss": 0.0097, "num_tokens": 215032404.0, "step": 471 }, { "epoch": 3.839103869653768, "grad_norm": 0.12498594646510086, "learning_rate": 2.235271120008439e-06, "loss": 0.008, "num_tokens": 215507249.0, "step": 472 }, { "epoch": 3.8472505091649696, "grad_norm": 0.12951287934361264, "learning_rate": 2.218991749073032e-06, "loss": 0.0081, "num_tokens": 215963900.0, "step": 473 }, { "epoch": 3.855397148676171, "grad_norm": 0.1340360089239412, "learning_rate": 2.2028035400742946e-06, "loss": 0.0086, "num_tokens": 216413372.0, "step": 474 }, { "epoch": 3.8635437881873727, "grad_norm": 0.1428815749513004, "learning_rate": 2.1867069427972814e-06, "loss": 0.0091, "num_tokens": 216874891.0, "step": 475 }, { "epoch": 3.8716904276985744, "grad_norm": 0.15258894927804814, "learning_rate": 2.1707024044816433e-06, "loss": 0.01, "num_tokens": 217335057.0, "step": 476 }, { "epoch": 3.8798370672097757, "grad_norm": 0.11854572238956909, "learning_rate": 2.1547903698091975e-06, "loss": 0.0083, "num_tokens": 217810990.0, "step": 477 }, { "epoch": 3.8879837067209775, "grad_norm": 0.12413149337289436, "learning_rate": 2.13897128089157e-06, "loss": 0.0078, "num_tokens": 218271262.0, "step": 478 }, { "epoch": 3.8961303462321792, "grad_norm": 0.13054286386457706, "learning_rate": 2.1232455772579164e-06, "loss": 0.0088, "num_tokens": 218733996.0, "step": 479 }, { "epoch": 3.904276985743381, "grad_norm": 0.14174809290893123, "learning_rate": 2.107613695842705e-06, "loss": 0.0083, "num_tokens": 219193703.0, "step": 480 }, { "epoch": 3.9124236252545828, "grad_norm": 0.13172558951478341, "learning_rate": 2.09207607097358e-06, "loss": 0.0091, "num_tokens": 219637585.0, "step": 481 }, { "epoch": 3.920570264765784, "grad_norm": 0.12825966468997463, "learning_rate": 2.0766331343592935e-06, "loss": 0.0087, "num_tokens": 220100782.0, "step": 482 }, { "epoch": 3.928716904276986, "grad_norm": 0.11619016881857674, "learning_rate": 2.0612853150777083e-06, "loss": 0.0074, "num_tokens": 220548817.0, "step": 483 }, { "epoch": 3.9368635437881876, "grad_norm": 0.12824773954267013, "learning_rate": 2.0460330395638754e-06, "loss": 0.0089, "num_tokens": 220986452.0, "step": 484 }, { "epoch": 3.945010183299389, "grad_norm": 0.12829553056162407, "learning_rate": 2.030876731598194e-06, "loss": 0.0083, "num_tokens": 221480796.0, "step": 485 }, { "epoch": 3.9531568228105907, "grad_norm": 0.12618210454698364, "learning_rate": 2.0158168122946254e-06, "loss": 0.0082, "num_tokens": 221927605.0, "step": 486 }, { "epoch": 3.9613034623217924, "grad_norm": 0.1351329578005386, "learning_rate": 2.000853700089001e-06, "loss": 0.0089, "num_tokens": 222378433.0, "step": 487 }, { "epoch": 3.9694501018329937, "grad_norm": 0.12217015553170964, "learning_rate": 1.9859878107273884e-06, "loss": 0.0083, "num_tokens": 222829736.0, "step": 488 }, { "epoch": 3.9775967413441955, "grad_norm": 0.132799994524403, "learning_rate": 1.971219557254548e-06, "loss": 0.0095, "num_tokens": 223285731.0, "step": 489 }, { "epoch": 3.9857433808553973, "grad_norm": 0.12499217681603624, "learning_rate": 1.956549350002454e-06, "loss": 0.0069, "num_tokens": 223757013.0, "step": 490 }, { "epoch": 3.9938900203665986, "grad_norm": 0.1140878012499302, "learning_rate": 1.9419775965788897e-06, "loss": 0.0073, "num_tokens": 224255873.0, "step": 491 }, { "epoch": 4.0, "grad_norm": 0.16568246569300987, "learning_rate": 1.9275047018561265e-06, "loss": 0.0087, "num_tokens": 224299419.0, "step": 492 }, { "epoch": 4.0, "eval_loss": 0.04265177622437477, "eval_num_tokens": 224299419.0, "eval_runtime": 57.842, "eval_samples_per_second": 42.167, "eval_steps_per_second": 5.273, "step": 492 }, { "epoch": 4.008146639511201, "grad_norm": 0.10953124483033916, "learning_rate": 1.913131067959673e-06, "loss": 0.0064, "num_tokens": 224740301.0, "step": 493 }, { "epoch": 4.0162932790224035, "grad_norm": 0.09661175616513212, "learning_rate": 1.8988570942571039e-06, "loss": 0.0064, "num_tokens": 225184668.0, "step": 494 }, { "epoch": 4.024439918533605, "grad_norm": 0.09108581236399259, "learning_rate": 1.8846831773469587e-06, "loss": 0.0054, "num_tokens": 225644004.0, "step": 495 }, { "epoch": 4.032586558044806, "grad_norm": 0.08286087324083283, "learning_rate": 1.8706097110477298e-06, "loss": 0.0049, "num_tokens": 226150717.0, "step": 496 }, { "epoch": 4.040733197556008, "grad_norm": 0.0987575217688521, "learning_rate": 1.8566370863869122e-06, "loss": 0.0058, "num_tokens": 226596638.0, "step": 497 }, { "epoch": 4.04887983706721, "grad_norm": 0.10313893410726134, "learning_rate": 1.8427656915901428e-06, "loss": 0.0063, "num_tokens": 227070697.0, "step": 498 }, { "epoch": 4.057026476578411, "grad_norm": 0.08409803434100602, "learning_rate": 1.8289959120704204e-06, "loss": 0.0045, "num_tokens": 227563263.0, "step": 499 }, { "epoch": 4.065173116089613, "grad_norm": 0.10477823554325051, "learning_rate": 1.8153281304173842e-06, "loss": 0.0059, "num_tokens": 228039640.0, "step": 500 }, { "epoch": 4.0733197556008145, "grad_norm": 0.09829199382018614, "learning_rate": 1.801762726386691e-06, "loss": 0.0056, "num_tokens": 228524467.0, "step": 501 }, { "epoch": 4.081466395112017, "grad_norm": 0.09331418832378849, "learning_rate": 1.7883000768894627e-06, "loss": 0.0047, "num_tokens": 228994748.0, "step": 502 }, { "epoch": 4.089613034623218, "grad_norm": 0.09581281591436303, "learning_rate": 1.7749405559818162e-06, "loss": 0.0053, "num_tokens": 229450908.0, "step": 503 }, { "epoch": 4.097759674134419, "grad_norm": 0.10440722069541235, "learning_rate": 1.7616845348544657e-06, "loss": 0.0065, "num_tokens": 229910862.0, "step": 504 }, { "epoch": 4.1059063136456215, "grad_norm": 0.09700791936550231, "learning_rate": 1.7485323818224126e-06, "loss": 0.0051, "num_tokens": 230369276.0, "step": 505 }, { "epoch": 4.114052953156823, "grad_norm": 0.10857998726786411, "learning_rate": 1.7354844623147116e-06, "loss": 0.0059, "num_tokens": 230827234.0, "step": 506 }, { "epoch": 4.122199592668024, "grad_norm": 0.10148060818665218, "learning_rate": 1.722541138864316e-06, "loss": 0.0056, "num_tokens": 231311328.0, "step": 507 }, { "epoch": 4.130346232179226, "grad_norm": 0.09790332100802439, "learning_rate": 1.7097027710980059e-06, "loss": 0.0054, "num_tokens": 231774619.0, "step": 508 }, { "epoch": 4.138492871690428, "grad_norm": 0.11303318804836798, "learning_rate": 1.6969697157263968e-06, "loss": 0.0063, "num_tokens": 232234778.0, "step": 509 }, { "epoch": 4.146639511201629, "grad_norm": 0.11928599820659892, "learning_rate": 1.6843423265340241e-06, "loss": 0.0066, "num_tokens": 232685797.0, "step": 510 }, { "epoch": 4.154786150712831, "grad_norm": 0.10102982747758138, "learning_rate": 1.6718209543695198e-06, "loss": 0.0053, "num_tokens": 233131575.0, "step": 511 }, { "epoch": 4.1629327902240325, "grad_norm": 0.10103824755044703, "learning_rate": 1.6594059471358603e-06, "loss": 0.005, "num_tokens": 233592206.0, "step": 512 }, { "epoch": 4.171079429735234, "grad_norm": 0.09473975634726714, "learning_rate": 1.6470976497807028e-06, "loss": 0.0049, "num_tokens": 234073717.0, "step": 513 }, { "epoch": 4.179226069246436, "grad_norm": 0.10317754634136525, "learning_rate": 1.6348964042867963e-06, "loss": 0.0053, "num_tokens": 234525493.0, "step": 514 }, { "epoch": 4.187372708757637, "grad_norm": 0.10883171792223603, "learning_rate": 1.6228025496624816e-06, "loss": 0.0062, "num_tokens": 234975032.0, "step": 515 }, { "epoch": 4.195519348268839, "grad_norm": 0.10753806313999263, "learning_rate": 1.6108164219322759e-06, "loss": 0.0053, "num_tokens": 235438383.0, "step": 516 }, { "epoch": 4.203665987780041, "grad_norm": 0.1064236411620558, "learning_rate": 1.598938354127532e-06, "loss": 0.0059, "num_tokens": 235879893.0, "step": 517 }, { "epoch": 4.211812627291242, "grad_norm": 0.12022958396721184, "learning_rate": 1.5871686762771876e-06, "loss": 0.0059, "num_tokens": 236349201.0, "step": 518 }, { "epoch": 4.219959266802444, "grad_norm": 0.11124601452546444, "learning_rate": 1.5755077153985927e-06, "loss": 0.0057, "num_tokens": 236800777.0, "step": 519 }, { "epoch": 4.228105906313646, "grad_norm": 0.09236406495488149, "learning_rate": 1.5639557954884263e-06, "loss": 0.0043, "num_tokens": 237293264.0, "step": 520 }, { "epoch": 4.236252545824847, "grad_norm": 0.1273600213134377, "learning_rate": 1.552513237513694e-06, "loss": 0.0073, "num_tokens": 237724964.0, "step": 521 }, { "epoch": 4.244399185336049, "grad_norm": 0.11769756147992531, "learning_rate": 1.541180359402809e-06, "loss": 0.0059, "num_tokens": 238180300.0, "step": 522 }, { "epoch": 4.2525458248472505, "grad_norm": 0.1217965830011373, "learning_rate": 1.5299574760367564e-06, "loss": 0.0061, "num_tokens": 238636931.0, "step": 523 }, { "epoch": 4.260692464358452, "grad_norm": 0.11472619272622367, "learning_rate": 1.5188448992403504e-06, "loss": 0.0058, "num_tokens": 239086905.0, "step": 524 }, { "epoch": 4.268839103869654, "grad_norm": 0.11529863540901476, "learning_rate": 1.5078429377735626e-06, "loss": 0.0063, "num_tokens": 239550473.0, "step": 525 }, { "epoch": 4.276985743380855, "grad_norm": 0.1156481007594638, "learning_rate": 1.4969518973229526e-06, "loss": 0.0059, "num_tokens": 239995374.0, "step": 526 }, { "epoch": 4.285132382892057, "grad_norm": 0.10698685435134675, "learning_rate": 1.4861720804931665e-06, "loss": 0.0058, "num_tokens": 240466754.0, "step": 527 }, { "epoch": 4.293279022403259, "grad_norm": 0.11289580051998427, "learning_rate": 1.4755037867985285e-06, "loss": 0.006, "num_tokens": 240906071.0, "step": 528 }, { "epoch": 4.30142566191446, "grad_norm": 0.11536050235837439, "learning_rate": 1.4649473126547273e-06, "loss": 0.0054, "num_tokens": 241355455.0, "step": 529 }, { "epoch": 4.3095723014256615, "grad_norm": 0.11636487088267386, "learning_rate": 1.4545029513705735e-06, "loss": 0.0058, "num_tokens": 241836525.0, "step": 530 }, { "epoch": 4.317718940936864, "grad_norm": 0.09846554835421734, "learning_rate": 1.4441709931398513e-06, "loss": 0.0051, "num_tokens": 242307462.0, "step": 531 }, { "epoch": 4.325865580448065, "grad_norm": 0.1120813571543054, "learning_rate": 1.4339517250332565e-06, "loss": 0.0061, "num_tokens": 242741978.0, "step": 532 }, { "epoch": 4.334012219959266, "grad_norm": 0.1113819187138935, "learning_rate": 1.4238454309904205e-06, "loss": 0.0055, "num_tokens": 243192201.0, "step": 533 }, { "epoch": 4.3421588594704685, "grad_norm": 0.11764577339647353, "learning_rate": 1.4138523918120201e-06, "loss": 0.0065, "num_tokens": 243636087.0, "step": 534 }, { "epoch": 4.35030549898167, "grad_norm": 0.11164487804753273, "learning_rate": 1.4039728851519764e-06, "loss": 0.0055, "num_tokens": 244110581.0, "step": 535 }, { "epoch": 4.358452138492872, "grad_norm": 0.09698712924798691, "learning_rate": 1.3942071855097381e-06, "loss": 0.0049, "num_tokens": 244572435.0, "step": 536 }, { "epoch": 4.366598778004073, "grad_norm": 0.1104930978310767, "learning_rate": 1.3845555642226583e-06, "loss": 0.0056, "num_tokens": 245032371.0, "step": 537 }, { "epoch": 4.374745417515275, "grad_norm": 0.11183186111310507, "learning_rate": 1.375018289458453e-06, "loss": 0.0055, "num_tokens": 245488372.0, "step": 538 }, { "epoch": 4.382892057026477, "grad_norm": 0.11660880918067139, "learning_rate": 1.3655956262077502e-06, "loss": 0.0063, "num_tokens": 245947576.0, "step": 539 }, { "epoch": 4.391038696537678, "grad_norm": 0.11734436147080707, "learning_rate": 1.3562878362767296e-06, "loss": 0.006, "num_tokens": 246410789.0, "step": 540 }, { "epoch": 4.3991853360488795, "grad_norm": 0.11187947506861028, "learning_rate": 1.3470951782798432e-06, "loss": 0.0053, "num_tokens": 246885080.0, "step": 541 }, { "epoch": 4.407331975560082, "grad_norm": 0.10682796561668163, "learning_rate": 1.338017907632635e-06, "loss": 0.0054, "num_tokens": 247344383.0, "step": 542 }, { "epoch": 4.415478615071283, "grad_norm": 0.11487602768278418, "learning_rate": 1.329056276544642e-06, "loss": 0.0054, "num_tokens": 247825702.0, "step": 543 }, { "epoch": 4.423625254582484, "grad_norm": 0.10954303849780199, "learning_rate": 1.320210534012388e-06, "loss": 0.0059, "num_tokens": 248301334.0, "step": 544 }, { "epoch": 4.4317718940936865, "grad_norm": 0.1065560110571518, "learning_rate": 1.311480925812461e-06, "loss": 0.0057, "num_tokens": 248770660.0, "step": 545 }, { "epoch": 4.439918533604888, "grad_norm": 0.12112306787916738, "learning_rate": 1.3028676944946916e-06, "loss": 0.0067, "num_tokens": 249197698.0, "step": 546 }, { "epoch": 4.44806517311609, "grad_norm": 0.10503970639083068, "learning_rate": 1.2943710793754082e-06, "loss": 0.0049, "num_tokens": 249659509.0, "step": 547 }, { "epoch": 4.456211812627291, "grad_norm": 0.11924557580218739, "learning_rate": 1.2859913165307886e-06, "loss": 0.0063, "num_tokens": 250110156.0, "step": 548 }, { "epoch": 4.464358452138493, "grad_norm": 0.11442982117714874, "learning_rate": 1.277728638790303e-06, "loss": 0.0063, "num_tokens": 250550111.0, "step": 549 }, { "epoch": 4.472505091649695, "grad_norm": 0.12110395302590302, "learning_rate": 1.2695832757302412e-06, "loss": 0.0065, "num_tokens": 251002357.0, "step": 550 }, { "epoch": 4.480651731160896, "grad_norm": 0.11664662464057247, "learning_rate": 1.2615554536673377e-06, "loss": 0.0062, "num_tokens": 251458462.0, "step": 551 }, { "epoch": 4.4887983706720975, "grad_norm": 0.11645513165539287, "learning_rate": 1.253645395652481e-06, "loss": 0.0061, "num_tokens": 251902226.0, "step": 552 }, { "epoch": 4.4969450101833, "grad_norm": 0.11363438791067745, "learning_rate": 1.2458533214645175e-06, "loss": 0.0056, "num_tokens": 252346885.0, "step": 553 }, { "epoch": 4.505091649694501, "grad_norm": 0.12965647026273558, "learning_rate": 1.2381794476041447e-06, "loss": 0.0064, "num_tokens": 252804103.0, "step": 554 }, { "epoch": 4.513238289205702, "grad_norm": 0.10419635456766704, "learning_rate": 1.2306239872878946e-06, "loss": 0.0059, "num_tokens": 253273586.0, "step": 555 }, { "epoch": 4.521384928716905, "grad_norm": 0.11350584533770305, "learning_rate": 1.2231871504422117e-06, "loss": 0.0059, "num_tokens": 253725593.0, "step": 556 }, { "epoch": 4.529531568228106, "grad_norm": 0.13468868599441702, "learning_rate": 1.215869143697619e-06, "loss": 0.0073, "num_tokens": 254156458.0, "step": 557 }, { "epoch": 4.537678207739307, "grad_norm": 0.10259852383741634, "learning_rate": 1.2086701703829755e-06, "loss": 0.0054, "num_tokens": 254617846.0, "step": 558 }, { "epoch": 4.545824847250509, "grad_norm": 0.11651453346375099, "learning_rate": 1.2015904305198286e-06, "loss": 0.0063, "num_tokens": 255052922.0, "step": 559 }, { "epoch": 4.553971486761711, "grad_norm": 0.10384453182105129, "learning_rate": 1.1946301208168593e-06, "loss": 0.0051, "num_tokens": 255534554.0, "step": 560 }, { "epoch": 4.562118126272912, "grad_norm": 0.12828945094057975, "learning_rate": 1.1877894346644085e-06, "loss": 0.007, "num_tokens": 255986625.0, "step": 561 }, { "epoch": 4.570264765784114, "grad_norm": 0.10166841643303247, "learning_rate": 1.1810685621291135e-06, "loss": 0.0055, "num_tokens": 256440817.0, "step": 562 }, { "epoch": 4.5784114052953155, "grad_norm": 0.12163643122042941, "learning_rate": 1.174467689948618e-06, "loss": 0.007, "num_tokens": 256883913.0, "step": 563 }, { "epoch": 4.586558044806518, "grad_norm": 0.11612572338384212, "learning_rate": 1.1679870015263908e-06, "loss": 0.0061, "num_tokens": 257340848.0, "step": 564 }, { "epoch": 4.594704684317719, "grad_norm": 0.09659828775248515, "learning_rate": 1.1616266769266263e-06, "loss": 0.0052, "num_tokens": 257795593.0, "step": 565 }, { "epoch": 4.60285132382892, "grad_norm": 0.10140831312358678, "learning_rate": 1.1553868928692422e-06, "loss": 0.0048, "num_tokens": 258288534.0, "step": 566 }, { "epoch": 4.610997963340123, "grad_norm": 0.11217052895153468, "learning_rate": 1.1492678227249695e-06, "loss": 0.0059, "num_tokens": 258741097.0, "step": 567 }, { "epoch": 4.619144602851324, "grad_norm": 0.1126933577651828, "learning_rate": 1.143269636510536e-06, "loss": 0.0061, "num_tokens": 259193501.0, "step": 568 }, { "epoch": 4.627291242362525, "grad_norm": 0.11797745986694334, "learning_rate": 1.1373925008839403e-06, "loss": 0.0063, "num_tokens": 259649197.0, "step": 569 }, { "epoch": 4.635437881873727, "grad_norm": 0.11303980738140469, "learning_rate": 1.1316365791398251e-06, "loss": 0.0061, "num_tokens": 260088831.0, "step": 570 }, { "epoch": 4.643584521384929, "grad_norm": 0.10873603489504344, "learning_rate": 1.1260020312049356e-06, "loss": 0.006, "num_tokens": 260555536.0, "step": 571 }, { "epoch": 4.65173116089613, "grad_norm": 0.0920006832828397, "learning_rate": 1.1204890136336784e-06, "loss": 0.0052, "num_tokens": 261048454.0, "step": 572 }, { "epoch": 4.659877800407332, "grad_norm": 0.1255806723199747, "learning_rate": 1.1150976796037736e-06, "loss": 0.0068, "num_tokens": 261480295.0, "step": 573 }, { "epoch": 4.6680244399185336, "grad_norm": 0.11533169004614044, "learning_rate": 1.1098281789119948e-06, "loss": 0.0057, "num_tokens": 261942589.0, "step": 574 }, { "epoch": 4.676171079429735, "grad_norm": 0.10129996781843084, "learning_rate": 1.104680657970009e-06, "loss": 0.0057, "num_tokens": 262393944.0, "step": 575 }, { "epoch": 4.684317718940937, "grad_norm": 0.11015833267592207, "learning_rate": 1.0996552598003088e-06, "loss": 0.0059, "num_tokens": 262882312.0, "step": 576 }, { "epoch": 4.692464358452138, "grad_norm": 0.10314595226042249, "learning_rate": 1.094752124032238e-06, "loss": 0.0055, "num_tokens": 263336673.0, "step": 577 }, { "epoch": 4.70061099796334, "grad_norm": 0.11664841890610124, "learning_rate": 1.0899713868981123e-06, "loss": 0.0064, "num_tokens": 263792010.0, "step": 578 }, { "epoch": 4.708757637474542, "grad_norm": 0.09972137290365708, "learning_rate": 1.0853131812294355e-06, "loss": 0.0051, "num_tokens": 264237484.0, "step": 579 }, { "epoch": 4.716904276985743, "grad_norm": 0.10268566206680875, "learning_rate": 1.0807776364532044e-06, "loss": 0.0056, "num_tokens": 264713321.0, "step": 580 }, { "epoch": 4.725050916496945, "grad_norm": 0.10619035337589804, "learning_rate": 1.0763648785883186e-06, "loss": 0.0058, "num_tokens": 265183724.0, "step": 581 }, { "epoch": 4.733197556008147, "grad_norm": 0.10541962557747203, "learning_rate": 1.0720750302420745e-06, "loss": 0.0057, "num_tokens": 265627643.0, "step": 582 }, { "epoch": 4.741344195519348, "grad_norm": 0.11506033498658928, "learning_rate": 1.0679082106067618e-06, "loss": 0.0067, "num_tokens": 266084878.0, "step": 583 }, { "epoch": 4.74949083503055, "grad_norm": 0.11142067796057883, "learning_rate": 1.0638645354563488e-06, "loss": 0.0056, "num_tokens": 266578362.0, "step": 584 }, { "epoch": 4.757637474541752, "grad_norm": 0.12323031771225379, "learning_rate": 1.0599441171432685e-06, "loss": 0.0071, "num_tokens": 267005793.0, "step": 585 }, { "epoch": 4.765784114052953, "grad_norm": 0.10911498957082988, "learning_rate": 1.0561470645952939e-06, "loss": 0.0059, "num_tokens": 267445983.0, "step": 586 }, { "epoch": 4.773930753564155, "grad_norm": 0.10589151493278187, "learning_rate": 1.0524734833125155e-06, "loss": 0.006, "num_tokens": 267934787.0, "step": 587 }, { "epoch": 4.782077393075356, "grad_norm": 0.0961251286065213, "learning_rate": 1.0489234753644075e-06, "loss": 0.0047, "num_tokens": 268404039.0, "step": 588 }, { "epoch": 4.790224032586558, "grad_norm": 0.11570808115862555, "learning_rate": 1.0454971393869895e-06, "loss": 0.0061, "num_tokens": 268871776.0, "step": 589 }, { "epoch": 4.79837067209776, "grad_norm": 0.11996049644781787, "learning_rate": 1.0421945705800913e-06, "loss": 0.006, "num_tokens": 269329939.0, "step": 590 }, { "epoch": 4.806517311608961, "grad_norm": 0.11015784556640101, "learning_rate": 1.0390158607047029e-06, "loss": 0.0059, "num_tokens": 269796155.0, "step": 591 }, { "epoch": 4.814663951120163, "grad_norm": 0.10516381427732067, "learning_rate": 1.0359610980804286e-06, "loss": 0.0051, "num_tokens": 270260800.0, "step": 592 }, { "epoch": 4.822810590631365, "grad_norm": 0.11057933848917369, "learning_rate": 1.0330303675830306e-06, "loss": 0.0054, "num_tokens": 270718037.0, "step": 593 }, { "epoch": 4.830957230142566, "grad_norm": 0.12034159438309625, "learning_rate": 1.0302237506420722e-06, "loss": 0.0063, "num_tokens": 271163129.0, "step": 594 }, { "epoch": 4.839103869653767, "grad_norm": 0.1298369000893159, "learning_rate": 1.0275413252386545e-06, "loss": 0.0077, "num_tokens": 271586088.0, "step": 595 }, { "epoch": 4.84725050916497, "grad_norm": 0.11485648605447368, "learning_rate": 1.0249831659032494e-06, "loss": 0.0067, "num_tokens": 272031287.0, "step": 596 }, { "epoch": 4.855397148676171, "grad_norm": 0.11585325382556429, "learning_rate": 1.0225493437136302e-06, "loss": 0.0067, "num_tokens": 272474742.0, "step": 597 }, { "epoch": 4.863543788187373, "grad_norm": 0.1239008691750004, "learning_rate": 1.020239926292895e-06, "loss": 0.0067, "num_tokens": 272932607.0, "step": 598 }, { "epoch": 4.871690427698574, "grad_norm": 0.10254555243859467, "learning_rate": 1.018054977807589e-06, "loss": 0.0057, "num_tokens": 273415530.0, "step": 599 }, { "epoch": 4.879837067209776, "grad_norm": 0.10546701888018833, "learning_rate": 1.0159945589659223e-06, "loss": 0.0056, "num_tokens": 273885366.0, "step": 600 }, { "epoch": 4.887983706720978, "grad_norm": 0.12031804835663963, "learning_rate": 1.0140587270160806e-06, "loss": 0.0066, "num_tokens": 274335421.0, "step": 601 }, { "epoch": 4.896130346232179, "grad_norm": 0.11002140545903802, "learning_rate": 1.0122475357446372e-06, "loss": 0.0061, "num_tokens": 274789915.0, "step": 602 }, { "epoch": 4.904276985743381, "grad_norm": 0.10524124599370216, "learning_rate": 1.0105610354750566e-06, "loss": 0.0055, "num_tokens": 275269107.0, "step": 603 }, { "epoch": 4.912423625254583, "grad_norm": 0.1279839524575316, "learning_rate": 1.0089992730662983e-06, "loss": 0.007, "num_tokens": 275714557.0, "step": 604 }, { "epoch": 4.920570264765784, "grad_norm": 0.11521764229191792, "learning_rate": 1.0075622919115133e-06, "loss": 0.0059, "num_tokens": 276134943.0, "step": 605 }, { "epoch": 4.928716904276985, "grad_norm": 0.10797809839128278, "learning_rate": 1.0062501319368376e-06, "loss": 0.005, "num_tokens": 276628333.0, "step": 606 }, { "epoch": 4.936863543788188, "grad_norm": 0.103832639157195, "learning_rate": 1.0050628296002864e-06, "loss": 0.0055, "num_tokens": 277092549.0, "step": 607 }, { "epoch": 4.945010183299389, "grad_norm": 0.10579829877653406, "learning_rate": 1.0040004178907364e-06, "loss": 0.0059, "num_tokens": 277564414.0, "step": 608 }, { "epoch": 4.953156822810591, "grad_norm": 0.1129191145686251, "learning_rate": 1.0030629263270133e-06, "loss": 0.0057, "num_tokens": 278043267.0, "step": 609 }, { "epoch": 4.961303462321792, "grad_norm": 0.11428351556872687, "learning_rate": 1.0022503809570692e-06, "loss": 0.0058, "num_tokens": 278500208.0, "step": 610 }, { "epoch": 4.969450101832994, "grad_norm": 0.12454757064462266, "learning_rate": 1.0015628043572607e-06, "loss": 0.0067, "num_tokens": 278958350.0, "step": 611 }, { "epoch": 4.977596741344195, "grad_norm": 0.11985284209865818, "learning_rate": 1.0010002156317187e-06, "loss": 0.0055, "num_tokens": 279426149.0, "step": 612 }, { "epoch": 4.985743380855397, "grad_norm": 0.11653540817309618, "learning_rate": 1.0005626304118208e-06, "loss": 0.0062, "num_tokens": 279874989.0, "step": 613 }, { "epoch": 4.993890020366599, "grad_norm": 0.12398585004184347, "learning_rate": 1.0002500608557558e-06, "loss": 0.0064, "num_tokens": 280320581.0, "step": 614 }, { "epoch": 5.0, "grad_norm": 0.12467063210289439, "learning_rate": 1.0000625156481842e-06, "loss": 0.0065, "num_tokens": 280366492.0, "step": 615 }, { "epoch": 5.0, "eval_loss": 0.04499583691358566, "eval_num_tokens": 280366492.0, "eval_runtime": 57.8338, "eval_samples_per_second": 42.173, "eval_steps_per_second": 5.274, "step": 615 }, { "epoch": 5.0, "step": 615, "total_flos": 9.471448716243108e+17, "train_loss": 0.017112477973285245, "train_runtime": 9496.6509, "train_samples_per_second": 8.264, "train_steps_per_second": 0.065 } ], "logging_steps": 1, "max_steps": 615, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.471448716243108e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }