diff --git "a/checkpoint-6250/trainer_state.json" "b/checkpoint-6250/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-6250/trainer_state.json" @@ -0,0 +1,4408 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 6250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 0.4957960546016693, + "learning_rate": 2.666666666666667e-06, + "loss": 1.4728, + "step": 10 + }, + { + "epoch": 0.0032, + "grad_norm": 0.41995954513549805, + "learning_rate": 5.333333333333334e-06, + "loss": 1.4831, + "step": 20 + }, + { + "epoch": 0.0048, + "grad_norm": 0.2989521324634552, + "learning_rate": 8.000000000000001e-06, + "loss": 1.4327, + "step": 30 + }, + { + "epoch": 0.0064, + "grad_norm": 0.4052715003490448, + "learning_rate": 1.0666666666666667e-05, + "loss": 1.2872, + "step": 40 + }, + { + "epoch": 0.008, + "grad_norm": 0.3986363708972931, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.1546, + "step": 50 + }, + { + "epoch": 0.0096, + "grad_norm": 0.40488049387931824, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.0257, + "step": 60 + }, + { + "epoch": 0.0112, + "grad_norm": 0.21044793725013733, + "learning_rate": 1.866666666666667e-05, + "loss": 0.885, + "step": 70 + }, + { + "epoch": 0.0128, + "grad_norm": 0.15906786918640137, + "learning_rate": 2.1333333333333335e-05, + "loss": 0.7928, + "step": 80 + }, + { + "epoch": 0.0144, + "grad_norm": 0.15704213082790375, + "learning_rate": 2.4e-05, + "loss": 0.7632, + "step": 90 + }, + { + "epoch": 0.016, + "grad_norm": 0.15451550483703613, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.7402, + "step": 100 + }, + { + "epoch": 0.0176, + "grad_norm": 0.1893616020679474, + "learning_rate": 2.9333333333333336e-05, + "loss": 0.7142, + "step": 110 + }, + { + "epoch": 0.0192, + "grad_norm": 0.20109032094478607, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.7315, + "step": 120 + }, + { + "epoch": 0.0208, + "grad_norm": 0.17760278284549713, + "learning_rate": 3.466666666666667e-05, + "loss": 0.6835, + "step": 130 + }, + { + "epoch": 0.0224, + "grad_norm": 0.19728775322437286, + "learning_rate": 3.733333333333334e-05, + "loss": 0.6926, + "step": 140 + }, + { + "epoch": 0.024, + "grad_norm": 0.2489059716463089, + "learning_rate": 4e-05, + "loss": 0.6659, + "step": 150 + }, + { + "epoch": 0.0256, + "grad_norm": 0.3001479506492615, + "learning_rate": 4.266666666666667e-05, + "loss": 0.6724, + "step": 160 + }, + { + "epoch": 0.0272, + "grad_norm": 0.2427113652229309, + "learning_rate": 4.5333333333333335e-05, + "loss": 0.6299, + "step": 170 + }, + { + "epoch": 0.0288, + "grad_norm": 0.19930540025234222, + "learning_rate": 4.8e-05, + "loss": 0.6405, + "step": 180 + }, + { + "epoch": 0.0304, + "grad_norm": 0.19050472974777222, + "learning_rate": 5.0666666666666674e-05, + "loss": 0.6148, + "step": 190 + }, + { + "epoch": 0.032, + "grad_norm": 0.1770695149898529, + "learning_rate": 5.333333333333333e-05, + "loss": 0.6242, + "step": 200 + }, + { + "epoch": 0.0336, + "grad_norm": 0.18207508325576782, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.6062, + "step": 210 + }, + { + "epoch": 0.0352, + "grad_norm": 0.18143683671951294, + "learning_rate": 5.866666666666667e-05, + "loss": 0.607, + "step": 220 + }, + { + "epoch": 0.0368, + "grad_norm": 0.1947951763868332, + "learning_rate": 6.133333333333334e-05, + "loss": 0.6089, + "step": 230 + }, + { + "epoch": 
0.0384, + "grad_norm": 0.1820327192544937, + "learning_rate": 6.400000000000001e-05, + "loss": 0.6079, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 0.18461793661117554, + "learning_rate": 6.666666666666667e-05, + "loss": 0.6105, + "step": 250 + }, + { + "epoch": 0.0416, + "grad_norm": 0.18145723640918732, + "learning_rate": 6.933333333333334e-05, + "loss": 0.5889, + "step": 260 + }, + { + "epoch": 0.0432, + "grad_norm": 0.19138211011886597, + "learning_rate": 7.2e-05, + "loss": 0.5756, + "step": 270 + }, + { + "epoch": 0.0448, + "grad_norm": 0.17120584845542908, + "learning_rate": 7.466666666666667e-05, + "loss": 0.6095, + "step": 280 + }, + { + "epoch": 0.0464, + "grad_norm": 0.18954581022262573, + "learning_rate": 7.733333333333333e-05, + "loss": 0.6059, + "step": 290 + }, + { + "epoch": 0.048, + "grad_norm": 0.15436199307441711, + "learning_rate": 8e-05, + "loss": 0.5702, + "step": 300 + }, + { + "epoch": 0.0496, + "grad_norm": 0.23982836306095123, + "learning_rate": 8.266666666666667e-05, + "loss": 0.5984, + "step": 310 + }, + { + "epoch": 0.0512, + "grad_norm": 0.19414694607257843, + "learning_rate": 8.533333333333334e-05, + "loss": 0.6011, + "step": 320 + }, + { + "epoch": 0.0528, + "grad_norm": 0.1918945014476776, + "learning_rate": 8.800000000000001e-05, + "loss": 0.5889, + "step": 330 + }, + { + "epoch": 0.0544, + "grad_norm": 0.19442060589790344, + "learning_rate": 9.066666666666667e-05, + "loss": 0.5618, + "step": 340 + }, + { + "epoch": 0.056, + "grad_norm": 0.19109170138835907, + "learning_rate": 9.333333333333334e-05, + "loss": 0.5879, + "step": 350 + }, + { + "epoch": 0.0576, + "grad_norm": 0.21671929955482483, + "learning_rate": 9.6e-05, + "loss": 0.574, + "step": 360 + }, + { + "epoch": 0.0592, + "grad_norm": 0.19711115956306458, + "learning_rate": 9.866666666666668e-05, + "loss": 0.5877, + "step": 370 + }, + { + "epoch": 0.0608, + "grad_norm": 0.18061968684196472, + "learning_rate": 9.999995804186196e-05, + "loss": 0.5587, + "step": 380 + }, + { + "epoch": 0.0624, + "grad_norm": 0.17720717191696167, + "learning_rate": 9.999962237718015e-05, + "loss": 0.558, + "step": 390 + }, + { + "epoch": 0.064, + "grad_norm": 0.18406185507774353, + "learning_rate": 9.999895105006994e-05, + "loss": 0.5699, + "step": 400 + }, + { + "epoch": 0.0656, + "grad_norm": 0.17033794522285461, + "learning_rate": 9.999794406503817e-05, + "loss": 0.5588, + "step": 410 + }, + { + "epoch": 0.0672, + "grad_norm": 0.1636849194765091, + "learning_rate": 9.9996601428845e-05, + "loss": 0.5625, + "step": 420 + }, + { + "epoch": 0.0688, + "grad_norm": 0.1742946058511734, + "learning_rate": 9.999492315050396e-05, + "loss": 0.5705, + "step": 430 + }, + { + "epoch": 0.0704, + "grad_norm": 0.20321185886859894, + "learning_rate": 9.999290924128185e-05, + "loss": 0.5612, + "step": 440 + }, + { + "epoch": 0.072, + "grad_norm": 0.17660562694072723, + "learning_rate": 9.999055971469864e-05, + "loss": 0.5519, + "step": 450 + }, + { + "epoch": 0.0736, + "grad_norm": 0.17371931672096252, + "learning_rate": 9.998787458652739e-05, + "loss": 0.5603, + "step": 460 + }, + { + "epoch": 0.0752, + "grad_norm": 0.15404820442199707, + "learning_rate": 9.998485387479418e-05, + "loss": 0.5506, + "step": 470 + }, + { + "epoch": 0.0768, + "grad_norm": 0.19996191561222076, + "learning_rate": 9.998149759977795e-05, + "loss": 0.5545, + "step": 480 + }, + { + "epoch": 0.0784, + "grad_norm": 0.16106033325195312, + "learning_rate": 9.997780578401039e-05, + "loss": 0.5645, + "step": 490 + }, + { + "epoch": 0.08, + "grad_norm": 
0.15715229511260986, + "learning_rate": 9.997377845227576e-05, + "loss": 0.5452, + "step": 500 + }, + { + "epoch": 0.0816, + "grad_norm": 0.1737920641899109, + "learning_rate": 9.99694156316107e-05, + "loss": 0.5434, + "step": 510 + }, + { + "epoch": 0.0832, + "grad_norm": 0.1768925040960312, + "learning_rate": 9.996471735130422e-05, + "loss": 0.5734, + "step": 520 + }, + { + "epoch": 0.0848, + "grad_norm": 0.1645045131444931, + "learning_rate": 9.995968364289718e-05, + "loss": 0.5482, + "step": 530 + }, + { + "epoch": 0.0864, + "grad_norm": 0.15744324028491974, + "learning_rate": 9.995431454018245e-05, + "loss": 0.5298, + "step": 540 + }, + { + "epoch": 0.088, + "grad_norm": 0.18136905133724213, + "learning_rate": 9.99486100792044e-05, + "loss": 0.5436, + "step": 550 + }, + { + "epoch": 0.0896, + "grad_norm": 0.18272417783737183, + "learning_rate": 9.994257029825876e-05, + "loss": 0.55, + "step": 560 + }, + { + "epoch": 0.0912, + "grad_norm": 0.17868682742118835, + "learning_rate": 9.993619523789241e-05, + "loss": 0.5523, + "step": 570 + }, + { + "epoch": 0.0928, + "grad_norm": 0.16811689734458923, + "learning_rate": 9.992948494090302e-05, + "loss": 0.5528, + "step": 580 + }, + { + "epoch": 0.0944, + "grad_norm": 0.16601411998271942, + "learning_rate": 9.992243945233885e-05, + "loss": 0.5425, + "step": 590 + }, + { + "epoch": 0.096, + "grad_norm": 0.17169275879859924, + "learning_rate": 9.991505881949837e-05, + "loss": 0.5632, + "step": 600 + }, + { + "epoch": 0.0976, + "grad_norm": 0.15906384587287903, + "learning_rate": 9.990734309192994e-05, + "loss": 0.5365, + "step": 610 + }, + { + "epoch": 0.0992, + "grad_norm": 0.1605994552373886, + "learning_rate": 9.989929232143159e-05, + "loss": 0.5653, + "step": 620 + }, + { + "epoch": 0.1008, + "grad_norm": 0.1630123406648636, + "learning_rate": 9.989090656205052e-05, + "loss": 0.5401, + "step": 630 + }, + { + "epoch": 0.1024, + "grad_norm": 0.18528063595294952, + "learning_rate": 9.988218587008286e-05, + "loss": 0.5407, + "step": 640 + }, + { + "epoch": 0.104, + "grad_norm": 0.17673324048519135, + "learning_rate": 9.987313030407323e-05, + "loss": 0.5443, + "step": 650 + }, + { + "epoch": 0.1056, + "grad_norm": 0.18259309232234955, + "learning_rate": 9.986373992481435e-05, + "loss": 0.5338, + "step": 660 + }, + { + "epoch": 0.1072, + "grad_norm": 0.1655215322971344, + "learning_rate": 9.985401479534664e-05, + "loss": 0.5349, + "step": 670 + }, + { + "epoch": 0.1088, + "grad_norm": 0.2126241773366928, + "learning_rate": 9.98439549809578e-05, + "loss": 0.5412, + "step": 680 + }, + { + "epoch": 0.1104, + "grad_norm": 0.18532635271549225, + "learning_rate": 9.983356054918238e-05, + "loss": 0.5289, + "step": 690 + }, + { + "epoch": 0.112, + "grad_norm": 0.16749216616153717, + "learning_rate": 9.982283156980132e-05, + "loss": 0.5329, + "step": 700 + }, + { + "epoch": 0.1136, + "grad_norm": 0.17735940217971802, + "learning_rate": 9.981176811484148e-05, + "loss": 0.5658, + "step": 710 + }, + { + "epoch": 0.1152, + "grad_norm": 0.17970822751522064, + "learning_rate": 9.98003702585751e-05, + "loss": 0.5378, + "step": 720 + }, + { + "epoch": 0.1168, + "grad_norm": 0.17801500856876373, + "learning_rate": 9.978863807751944e-05, + "loss": 0.5434, + "step": 730 + }, + { + "epoch": 0.1184, + "grad_norm": 0.17626236379146576, + "learning_rate": 9.977657165043612e-05, + "loss": 0.5267, + "step": 740 + }, + { + "epoch": 0.12, + "grad_norm": 0.1799333393573761, + "learning_rate": 9.97641710583307e-05, + "loss": 0.538, + "step": 750 + }, + { + "epoch": 0.1216, + 
"grad_norm": 0.19428451359272003, + "learning_rate": 9.975143638445205e-05, + "loss": 0.5356, + "step": 760 + }, + { + "epoch": 0.1232, + "grad_norm": 0.16534724831581116, + "learning_rate": 9.973836771429184e-05, + "loss": 0.5339, + "step": 770 + }, + { + "epoch": 0.1248, + "grad_norm": 0.16027531027793884, + "learning_rate": 9.972496513558398e-05, + "loss": 0.5296, + "step": 780 + }, + { + "epoch": 0.1264, + "grad_norm": 0.18316860496997833, + "learning_rate": 9.971122873830398e-05, + "loss": 0.5524, + "step": 790 + }, + { + "epoch": 0.128, + "grad_norm": 0.1743122637271881, + "learning_rate": 9.96971586146684e-05, + "loss": 0.5679, + "step": 800 + }, + { + "epoch": 0.1296, + "grad_norm": 0.1652500182390213, + "learning_rate": 9.968275485913417e-05, + "loss": 0.5201, + "step": 810 + }, + { + "epoch": 0.1312, + "grad_norm": 0.18169784545898438, + "learning_rate": 9.966801756839803e-05, + "loss": 0.5565, + "step": 820 + }, + { + "epoch": 0.1328, + "grad_norm": 0.17276670038700104, + "learning_rate": 9.96529468413958e-05, + "loss": 0.5422, + "step": 830 + }, + { + "epoch": 0.1344, + "grad_norm": 0.1608079969882965, + "learning_rate": 9.96375427793018e-05, + "loss": 0.5325, + "step": 840 + }, + { + "epoch": 0.136, + "grad_norm": 0.15167801082134247, + "learning_rate": 9.962180548552812e-05, + "loss": 0.5355, + "step": 850 + }, + { + "epoch": 0.1376, + "grad_norm": 0.18775774538516998, + "learning_rate": 9.96057350657239e-05, + "loss": 0.5488, + "step": 860 + }, + { + "epoch": 0.1392, + "grad_norm": 0.17709018290042877, + "learning_rate": 9.958933162777469e-05, + "loss": 0.5287, + "step": 870 + }, + { + "epoch": 0.1408, + "grad_norm": 0.18322448432445526, + "learning_rate": 9.957259528180165e-05, + "loss": 0.5279, + "step": 880 + }, + { + "epoch": 0.1424, + "grad_norm": 0.1560993790626526, + "learning_rate": 9.955552614016093e-05, + "loss": 0.5354, + "step": 890 + }, + { + "epoch": 0.144, + "grad_norm": 0.15778392553329468, + "learning_rate": 9.953812431744276e-05, + "loss": 0.522, + "step": 900 + }, + { + "epoch": 0.1456, + "grad_norm": 0.16897855699062347, + "learning_rate": 9.952038993047076e-05, + "loss": 0.5556, + "step": 910 + }, + { + "epoch": 0.1472, + "grad_norm": 0.1845034956932068, + "learning_rate": 9.95023230983012e-05, + "loss": 0.5314, + "step": 920 + }, + { + "epoch": 0.1488, + "grad_norm": 0.15034043788909912, + "learning_rate": 9.948392394222215e-05, + "loss": 0.5241, + "step": 930 + }, + { + "epoch": 0.1504, + "grad_norm": 0.16776028275489807, + "learning_rate": 9.946519258575263e-05, + "loss": 0.5469, + "step": 940 + }, + { + "epoch": 0.152, + "grad_norm": 0.16213341057300568, + "learning_rate": 9.944612915464183e-05, + "loss": 0.515, + "step": 950 + }, + { + "epoch": 0.1536, + "grad_norm": 0.17051611840724945, + "learning_rate": 9.942673377686828e-05, + "loss": 0.5373, + "step": 960 + }, + { + "epoch": 0.1552, + "grad_norm": 0.1494649201631546, + "learning_rate": 9.940700658263897e-05, + "loss": 0.5215, + "step": 970 + }, + { + "epoch": 0.1568, + "grad_norm": 0.17321337759494781, + "learning_rate": 9.938694770438844e-05, + "loss": 0.5384, + "step": 980 + }, + { + "epoch": 0.1584, + "grad_norm": 0.1464836597442627, + "learning_rate": 9.936655727677794e-05, + "loss": 0.5269, + "step": 990 + }, + { + "epoch": 0.16, + "grad_norm": 0.1534987837076187, + "learning_rate": 9.934583543669453e-05, + "loss": 0.5153, + "step": 1000 + }, + { + "epoch": 0.1616, + "grad_norm": 0.1458473801612854, + "learning_rate": 9.932478232325013e-05, + "loss": 0.5337, + "step": 1010 + }, + { + 
"epoch": 0.1632, + "grad_norm": 0.15559281408786774, + "learning_rate": 9.930339807778055e-05, + "loss": 0.524, + "step": 1020 + }, + { + "epoch": 0.1648, + "grad_norm": 0.16684836149215698, + "learning_rate": 9.928168284384467e-05, + "loss": 0.5277, + "step": 1030 + }, + { + "epoch": 0.1664, + "grad_norm": 0.1587754637002945, + "learning_rate": 9.925963676722335e-05, + "loss": 0.5331, + "step": 1040 + }, + { + "epoch": 0.168, + "grad_norm": 0.17447280883789062, + "learning_rate": 9.923725999591847e-05, + "loss": 0.5165, + "step": 1050 + }, + { + "epoch": 0.1696, + "grad_norm": 0.14917245507240295, + "learning_rate": 9.921455268015201e-05, + "loss": 0.5342, + "step": 1060 + }, + { + "epoch": 0.1712, + "grad_norm": 0.19015340507030487, + "learning_rate": 9.919151497236498e-05, + "loss": 0.5513, + "step": 1070 + }, + { + "epoch": 0.1728, + "grad_norm": 0.15490785241127014, + "learning_rate": 9.916814702721642e-05, + "loss": 0.5275, + "step": 1080 + }, + { + "epoch": 0.1744, + "grad_norm": 0.15064150094985962, + "learning_rate": 9.914444900158233e-05, + "loss": 0.5277, + "step": 1090 + }, + { + "epoch": 0.176, + "grad_norm": 0.16470789909362793, + "learning_rate": 9.912042105455463e-05, + "loss": 0.5201, + "step": 1100 + }, + { + "epoch": 0.1776, + "grad_norm": 0.14729981124401093, + "learning_rate": 9.909606334744013e-05, + "loss": 0.5357, + "step": 1110 + }, + { + "epoch": 0.1792, + "grad_norm": 0.15715880692005157, + "learning_rate": 9.90713760437594e-05, + "loss": 0.5218, + "step": 1120 + }, + { + "epoch": 0.1808, + "grad_norm": 0.15312618017196655, + "learning_rate": 9.904635930924573e-05, + "loss": 0.5137, + "step": 1130 + }, + { + "epoch": 0.1824, + "grad_norm": 0.17118622362613678, + "learning_rate": 9.90210133118439e-05, + "loss": 0.5338, + "step": 1140 + }, + { + "epoch": 0.184, + "grad_norm": 0.1367834508419037, + "learning_rate": 9.899533822170922e-05, + "loss": 0.5133, + "step": 1150 + }, + { + "epoch": 0.1856, + "grad_norm": 0.16843043267726898, + "learning_rate": 9.896933421120622e-05, + "loss": 0.5302, + "step": 1160 + }, + { + "epoch": 0.1872, + "grad_norm": 0.1890210509300232, + "learning_rate": 9.894300145490762e-05, + "loss": 0.5241, + "step": 1170 + }, + { + "epoch": 0.1888, + "grad_norm": 0.15399862825870514, + "learning_rate": 9.89163401295931e-05, + "loss": 0.53, + "step": 1180 + }, + { + "epoch": 0.1904, + "grad_norm": 0.17530465126037598, + "learning_rate": 9.88893504142481e-05, + "loss": 0.5389, + "step": 1190 + }, + { + "epoch": 0.192, + "grad_norm": 0.15098971128463745, + "learning_rate": 9.886203249006265e-05, + "loss": 0.4988, + "step": 1200 + }, + { + "epoch": 0.1936, + "grad_norm": 0.17054300010204315, + "learning_rate": 9.883438654043018e-05, + "loss": 0.5357, + "step": 1210 + }, + { + "epoch": 0.1952, + "grad_norm": 0.15649595856666565, + "learning_rate": 9.88064127509462e-05, + "loss": 0.5235, + "step": 1220 + }, + { + "epoch": 0.1968, + "grad_norm": 0.16764892637729645, + "learning_rate": 9.877811130940713e-05, + "loss": 0.4986, + "step": 1230 + }, + { + "epoch": 0.1984, + "grad_norm": 0.16912437975406647, + "learning_rate": 9.874948240580904e-05, + "loss": 0.5139, + "step": 1240 + }, + { + "epoch": 0.2, + "grad_norm": 0.15642336010932922, + "learning_rate": 9.872052623234632e-05, + "loss": 0.5311, + "step": 1250 + }, + { + "epoch": 0.2016, + "grad_norm": 0.17885343730449677, + "learning_rate": 9.869124298341039e-05, + "loss": 0.5309, + "step": 1260 + }, + { + "epoch": 0.2032, + "grad_norm": 0.16021275520324707, + "learning_rate": 9.866163285558851e-05, + 
"loss": 0.5346, + "step": 1270 + }, + { + "epoch": 0.2048, + "grad_norm": 0.15563759207725525, + "learning_rate": 9.863169604766231e-05, + "loss": 0.5467, + "step": 1280 + }, + { + "epoch": 0.2064, + "grad_norm": 0.1600761115550995, + "learning_rate": 9.860143276060655e-05, + "loss": 0.5266, + "step": 1290 + }, + { + "epoch": 0.208, + "grad_norm": 0.1830061674118042, + "learning_rate": 9.857084319758772e-05, + "loss": 0.5236, + "step": 1300 + }, + { + "epoch": 0.2096, + "grad_norm": 0.1682935357093811, + "learning_rate": 9.853992756396272e-05, + "loss": 0.5187, + "step": 1310 + }, + { + "epoch": 0.2112, + "grad_norm": 0.15699943900108337, + "learning_rate": 9.850868606727745e-05, + "loss": 0.5005, + "step": 1320 + }, + { + "epoch": 0.2128, + "grad_norm": 0.1576133370399475, + "learning_rate": 9.847711891726544e-05, + "loss": 0.5169, + "step": 1330 + }, + { + "epoch": 0.2144, + "grad_norm": 0.16691842675209045, + "learning_rate": 9.844522632584637e-05, + "loss": 0.5123, + "step": 1340 + }, + { + "epoch": 0.216, + "grad_norm": 0.1486142873764038, + "learning_rate": 9.84130085071248e-05, + "loss": 0.5026, + "step": 1350 + }, + { + "epoch": 0.2176, + "grad_norm": 0.16121621429920197, + "learning_rate": 9.838046567738857e-05, + "loss": 0.5164, + "step": 1360 + }, + { + "epoch": 0.2192, + "grad_norm": 0.14498215913772583, + "learning_rate": 9.834759805510743e-05, + "loss": 0.5052, + "step": 1370 + }, + { + "epoch": 0.2208, + "grad_norm": 0.15662848949432373, + "learning_rate": 9.831440586093157e-05, + "loss": 0.5128, + "step": 1380 + }, + { + "epoch": 0.2224, + "grad_norm": 0.15886035561561584, + "learning_rate": 9.828088931769013e-05, + "loss": 0.5242, + "step": 1390 + }, + { + "epoch": 0.224, + "grad_norm": 0.1685263067483902, + "learning_rate": 9.824704865038968e-05, + "loss": 0.5352, + "step": 1400 + }, + { + "epoch": 0.2256, + "grad_norm": 0.161627858877182, + "learning_rate": 9.821288408621275e-05, + "loss": 0.5135, + "step": 1410 + }, + { + "epoch": 0.2272, + "grad_norm": 0.21686546504497528, + "learning_rate": 9.817839585451629e-05, + "loss": 0.5244, + "step": 1420 + }, + { + "epoch": 0.2288, + "grad_norm": 0.1565544456243515, + "learning_rate": 9.814358418683013e-05, + "loss": 0.5096, + "step": 1430 + }, + { + "epoch": 0.2304, + "grad_norm": 0.15174688398838043, + "learning_rate": 9.810844931685541e-05, + "loss": 0.5122, + "step": 1440 + }, + { + "epoch": 0.232, + "grad_norm": 0.1546812653541565, + "learning_rate": 9.8072991480463e-05, + "loss": 0.5047, + "step": 1450 + }, + { + "epoch": 0.2336, + "grad_norm": 0.16348446905612946, + "learning_rate": 9.8037210915692e-05, + "loss": 0.5204, + "step": 1460 + }, + { + "epoch": 0.2352, + "grad_norm": 0.15648159384727478, + "learning_rate": 9.800110786274803e-05, + "loss": 0.5277, + "step": 1470 + }, + { + "epoch": 0.2368, + "grad_norm": 0.18967467546463013, + "learning_rate": 9.796468256400171e-05, + "loss": 0.5126, + "step": 1480 + }, + { + "epoch": 0.2384, + "grad_norm": 0.15906493365764618, + "learning_rate": 9.792793526398694e-05, + "loss": 0.5047, + "step": 1490 + }, + { + "epoch": 0.24, + "grad_norm": 0.16331595182418823, + "learning_rate": 9.789086620939936e-05, + "loss": 0.5079, + "step": 1500 + }, + { + "epoch": 0.2416, + "grad_norm": 0.1666116565465927, + "learning_rate": 9.785347564909463e-05, + "loss": 0.5052, + "step": 1510 + }, + { + "epoch": 0.2432, + "grad_norm": 0.22205305099487305, + "learning_rate": 9.781576383408677e-05, + "loss": 0.5295, + "step": 1520 + }, + { + "epoch": 0.2448, + "grad_norm": 0.13745824992656708, + 
"learning_rate": 9.777773101754649e-05, + "loss": 0.4919, + "step": 1530 + }, + { + "epoch": 0.2464, + "grad_norm": 0.15058793127536774, + "learning_rate": 9.773937745479942e-05, + "loss": 0.5049, + "step": 1540 + }, + { + "epoch": 0.248, + "grad_norm": 0.1442198008298874, + "learning_rate": 9.770070340332456e-05, + "loss": 0.519, + "step": 1550 + }, + { + "epoch": 0.2496, + "grad_norm": 0.17187006771564484, + "learning_rate": 9.766170912275239e-05, + "loss": 0.4993, + "step": 1560 + }, + { + "epoch": 0.2512, + "grad_norm": 0.1542050540447235, + "learning_rate": 9.762239487486315e-05, + "loss": 0.5083, + "step": 1570 + }, + { + "epoch": 0.2528, + "grad_norm": 0.1757446527481079, + "learning_rate": 9.758276092358518e-05, + "loss": 0.5061, + "step": 1580 + }, + { + "epoch": 0.2544, + "grad_norm": 0.13537611067295074, + "learning_rate": 9.754280753499305e-05, + "loss": 0.4835, + "step": 1590 + }, + { + "epoch": 0.256, + "grad_norm": 0.1829158365726471, + "learning_rate": 9.75025349773058e-05, + "loss": 0.5307, + "step": 1600 + }, + { + "epoch": 0.2576, + "grad_norm": 0.138696551322937, + "learning_rate": 9.746194352088518e-05, + "loss": 0.5099, + "step": 1610 + }, + { + "epoch": 0.2592, + "grad_norm": 0.15225930511951447, + "learning_rate": 9.742103343823376e-05, + "loss": 0.5183, + "step": 1620 + }, + { + "epoch": 0.2608, + "grad_norm": 0.16234968602657318, + "learning_rate": 9.737980500399322e-05, + "loss": 0.5285, + "step": 1630 + }, + { + "epoch": 0.2624, + "grad_norm": 0.15553738176822662, + "learning_rate": 9.733825849494231e-05, + "loss": 0.5172, + "step": 1640 + }, + { + "epoch": 0.264, + "grad_norm": 0.15891076624393463, + "learning_rate": 9.729639418999523e-05, + "loss": 0.5014, + "step": 1650 + }, + { + "epoch": 0.2656, + "grad_norm": 0.1538977175951004, + "learning_rate": 9.725421237019957e-05, + "loss": 0.5134, + "step": 1660 + }, + { + "epoch": 0.2672, + "grad_norm": 0.16845689713954926, + "learning_rate": 9.721171331873451e-05, + "loss": 0.4972, + "step": 1670 + }, + { + "epoch": 0.2688, + "grad_norm": 0.16934283077716827, + "learning_rate": 9.716889732090889e-05, + "loss": 0.5225, + "step": 1680 + }, + { + "epoch": 0.2704, + "grad_norm": 0.17025482654571533, + "learning_rate": 9.712576466415935e-05, + "loss": 0.5136, + "step": 1690 + }, + { + "epoch": 0.272, + "grad_norm": 0.17082297801971436, + "learning_rate": 9.708231563804828e-05, + "loss": 0.5298, + "step": 1700 + }, + { + "epoch": 0.2736, + "grad_norm": 0.15026861429214478, + "learning_rate": 9.703855053426202e-05, + "loss": 0.5231, + "step": 1710 + }, + { + "epoch": 0.2752, + "grad_norm": 0.15596430003643036, + "learning_rate": 9.699446964660881e-05, + "loss": 0.5091, + "step": 1720 + }, + { + "epoch": 0.2768, + "grad_norm": 0.15791009366512299, + "learning_rate": 9.695007327101684e-05, + "loss": 0.5091, + "step": 1730 + }, + { + "epoch": 0.2784, + "grad_norm": 0.14188261330127716, + "learning_rate": 9.690536170553226e-05, + "loss": 0.5088, + "step": 1740 + }, + { + "epoch": 0.28, + "grad_norm": 0.14109936356544495, + "learning_rate": 9.686033525031719e-05, + "loss": 0.503, + "step": 1750 + }, + { + "epoch": 0.2816, + "grad_norm": 0.15977415442466736, + "learning_rate": 9.68149942076477e-05, + "loss": 0.5225, + "step": 1760 + }, + { + "epoch": 0.2832, + "grad_norm": 0.1453007310628891, + "learning_rate": 9.676933888191177e-05, + "loss": 0.5087, + "step": 1770 + }, + { + "epoch": 0.2848, + "grad_norm": 0.1460779756307602, + "learning_rate": 9.67233695796073e-05, + "loss": 0.5142, + "step": 1780 + }, + { + "epoch": 
0.2864, + "grad_norm": 0.1568945050239563, + "learning_rate": 9.667708660933994e-05, + "loss": 0.5152, + "step": 1790 + }, + { + "epoch": 0.288, + "grad_norm": 0.14836131036281586, + "learning_rate": 9.663049028182111e-05, + "loss": 0.5049, + "step": 1800 + }, + { + "epoch": 0.2896, + "grad_norm": 0.1540466994047165, + "learning_rate": 9.658358090986594e-05, + "loss": 0.4946, + "step": 1810 + }, + { + "epoch": 0.2912, + "grad_norm": 0.13895122706890106, + "learning_rate": 9.653635880839106e-05, + "loss": 0.5085, + "step": 1820 + }, + { + "epoch": 0.2928, + "grad_norm": 0.14438685774803162, + "learning_rate": 9.648882429441257e-05, + "loss": 0.5019, + "step": 1830 + }, + { + "epoch": 0.2944, + "grad_norm": 0.14534763991832733, + "learning_rate": 9.64409776870439e-05, + "loss": 0.4752, + "step": 1840 + }, + { + "epoch": 0.296, + "grad_norm": 0.14334291219711304, + "learning_rate": 9.639281930749362e-05, + "loss": 0.5005, + "step": 1850 + }, + { + "epoch": 0.2976, + "grad_norm": 0.17404726147651672, + "learning_rate": 9.634434947906336e-05, + "loss": 0.5156, + "step": 1860 + }, + { + "epoch": 0.2992, + "grad_norm": 0.15235920250415802, + "learning_rate": 9.62955685271456e-05, + "loss": 0.4978, + "step": 1870 + }, + { + "epoch": 0.3008, + "grad_norm": 0.1535569131374359, + "learning_rate": 9.624647677922142e-05, + "loss": 0.4959, + "step": 1880 + }, + { + "epoch": 0.3024, + "grad_norm": 0.16104567050933838, + "learning_rate": 9.619707456485848e-05, + "loss": 0.4937, + "step": 1890 + }, + { + "epoch": 0.304, + "grad_norm": 0.1405969262123108, + "learning_rate": 9.61473622157086e-05, + "loss": 0.5055, + "step": 1900 + }, + { + "epoch": 0.3056, + "grad_norm": 0.1405443549156189, + "learning_rate": 9.609734006550562e-05, + "loss": 0.4941, + "step": 1910 + }, + { + "epoch": 0.3072, + "grad_norm": 0.13433068990707397, + "learning_rate": 9.604700845006326e-05, + "loss": 0.5069, + "step": 1920 + }, + { + "epoch": 0.3088, + "grad_norm": 0.15099148452281952, + "learning_rate": 9.599636770727269e-05, + "loss": 0.4921, + "step": 1930 + }, + { + "epoch": 0.3104, + "grad_norm": 0.1635752171278, + "learning_rate": 9.594541817710037e-05, + "loss": 0.5141, + "step": 1940 + }, + { + "epoch": 0.312, + "grad_norm": 0.15128713846206665, + "learning_rate": 9.589416020158578e-05, + "loss": 0.4916, + "step": 1950 + }, + { + "epoch": 0.3136, + "grad_norm": 0.15562082827091217, + "learning_rate": 9.584259412483897e-05, + "loss": 0.5025, + "step": 1960 + }, + { + "epoch": 0.3152, + "grad_norm": 0.14861498773097992, + "learning_rate": 9.579072029303854e-05, + "loss": 0.5157, + "step": 1970 + }, + { + "epoch": 0.3168, + "grad_norm": 0.1256442368030548, + "learning_rate": 9.5738539054429e-05, + "loss": 0.5016, + "step": 1980 + }, + { + "epoch": 0.3184, + "grad_norm": 0.1592012196779251, + "learning_rate": 9.56860507593186e-05, + "loss": 0.5198, + "step": 1990 + }, + { + "epoch": 0.32, + "grad_norm": 0.14706481993198395, + "learning_rate": 9.563325576007701e-05, + "loss": 0.5246, + "step": 2000 + }, + { + "epoch": 0.3216, + "grad_norm": 0.1479787975549698, + "learning_rate": 9.558015441113285e-05, + "loss": 0.5193, + "step": 2010 + }, + { + "epoch": 0.3232, + "grad_norm": 0.12703679502010345, + "learning_rate": 9.552674706897136e-05, + "loss": 0.5126, + "step": 2020 + }, + { + "epoch": 0.3248, + "grad_norm": 0.14176440238952637, + "learning_rate": 9.547303409213202e-05, + "loss": 0.5136, + "step": 2030 + }, + { + "epoch": 0.3264, + "grad_norm": 0.16507187485694885, + "learning_rate": 9.541901584120612e-05, + "loss": 0.4896, 
+ "step": 2040 + }, + { + "epoch": 0.328, + "grad_norm": 0.16316381096839905, + "learning_rate": 9.536469267883433e-05, + "loss": 0.5051, + "step": 2050 + }, + { + "epoch": 0.3296, + "grad_norm": 0.16869358718395233, + "learning_rate": 9.531006496970429e-05, + "loss": 0.509, + "step": 2060 + }, + { + "epoch": 0.3312, + "grad_norm": 0.1428690254688263, + "learning_rate": 9.525513308054819e-05, + "loss": 0.5056, + "step": 2070 + }, + { + "epoch": 0.3328, + "grad_norm": 0.13872046768665314, + "learning_rate": 9.519989738014022e-05, + "loss": 0.5127, + "step": 2080 + }, + { + "epoch": 0.3344, + "grad_norm": 0.15747520327568054, + "learning_rate": 9.514435823929417e-05, + "loss": 0.5162, + "step": 2090 + }, + { + "epoch": 0.336, + "grad_norm": 0.16116927564144135, + "learning_rate": 9.508851603086093e-05, + "loss": 0.4873, + "step": 2100 + }, + { + "epoch": 0.3376, + "grad_norm": 0.15870094299316406, + "learning_rate": 9.503237112972594e-05, + "loss": 0.4746, + "step": 2110 + }, + { + "epoch": 0.3392, + "grad_norm": 0.15173602104187012, + "learning_rate": 9.497592391280673e-05, + "loss": 0.5209, + "step": 2120 + }, + { + "epoch": 0.3408, + "grad_norm": 0.17726799845695496, + "learning_rate": 9.491917475905035e-05, + "loss": 0.4928, + "step": 2130 + }, + { + "epoch": 0.3424, + "grad_norm": 0.16077756881713867, + "learning_rate": 9.486212404943084e-05, + "loss": 0.5218, + "step": 2140 + }, + { + "epoch": 0.344, + "grad_norm": 0.17364545166492462, + "learning_rate": 9.480477216694673e-05, + "loss": 0.497, + "step": 2150 + }, + { + "epoch": 0.3456, + "grad_norm": 0.14399157464504242, + "learning_rate": 9.474711949661835e-05, + "loss": 0.4914, + "step": 2160 + }, + { + "epoch": 0.3472, + "grad_norm": 0.14652396738529205, + "learning_rate": 9.468916642548533e-05, + "loss": 0.5049, + "step": 2170 + }, + { + "epoch": 0.3488, + "grad_norm": 0.1399794965982437, + "learning_rate": 9.463091334260396e-05, + "loss": 0.4883, + "step": 2180 + }, + { + "epoch": 0.3504, + "grad_norm": 0.15792648494243622, + "learning_rate": 9.457236063904464e-05, + "loss": 0.4825, + "step": 2190 + }, + { + "epoch": 0.352, + "grad_norm": 0.13943254947662354, + "learning_rate": 9.45135087078892e-05, + "loss": 0.4954, + "step": 2200 + }, + { + "epoch": 0.3536, + "grad_norm": 0.14800620079040527, + "learning_rate": 9.445435794422825e-05, + "loss": 0.5001, + "step": 2210 + }, + { + "epoch": 0.3552, + "grad_norm": 0.16801360249519348, + "learning_rate": 9.439490874515858e-05, + "loss": 0.4966, + "step": 2220 + }, + { + "epoch": 0.3568, + "grad_norm": 0.15416771173477173, + "learning_rate": 9.433516150978044e-05, + "loss": 0.49, + "step": 2230 + }, + { + "epoch": 0.3584, + "grad_norm": 0.13983893394470215, + "learning_rate": 9.427511663919491e-05, + "loss": 0.5013, + "step": 2240 + }, + { + "epoch": 0.36, + "grad_norm": 0.1356741338968277, + "learning_rate": 9.421477453650118e-05, + "loss": 0.4938, + "step": 2250 + }, + { + "epoch": 0.3616, + "grad_norm": 0.16618719696998596, + "learning_rate": 9.415413560679385e-05, + "loss": 0.5059, + "step": 2260 + }, + { + "epoch": 0.3632, + "grad_norm": 0.15583930909633636, + "learning_rate": 9.409320025716017e-05, + "loss": 0.4952, + "step": 2270 + }, + { + "epoch": 0.3648, + "grad_norm": 0.13846057653427124, + "learning_rate": 9.403196889667742e-05, + "loss": 0.4967, + "step": 2280 + }, + { + "epoch": 0.3664, + "grad_norm": 0.16764026880264282, + "learning_rate": 9.397044193641e-05, + "loss": 0.4816, + "step": 2290 + }, + { + "epoch": 0.368, + "grad_norm": 0.1505175083875656, + "learning_rate": 
9.390861978940686e-05, + "loss": 0.4929, + "step": 2300 + }, + { + "epoch": 0.3696, + "grad_norm": 0.14527346193790436, + "learning_rate": 9.384650287069856e-05, + "loss": 0.496, + "step": 2310 + }, + { + "epoch": 0.3712, + "grad_norm": 0.1403575986623764, + "learning_rate": 9.378409159729454e-05, + "loss": 0.5155, + "step": 2320 + }, + { + "epoch": 0.3728, + "grad_norm": 0.1594780683517456, + "learning_rate": 9.372138638818035e-05, + "loss": 0.5098, + "step": 2330 + }, + { + "epoch": 0.3744, + "grad_norm": 0.14727714657783508, + "learning_rate": 9.365838766431488e-05, + "loss": 0.5039, + "step": 2340 + }, + { + "epoch": 0.376, + "grad_norm": 0.16067099571228027, + "learning_rate": 9.359509584862736e-05, + "loss": 0.4879, + "step": 2350 + }, + { + "epoch": 0.3776, + "grad_norm": 0.15750008821487427, + "learning_rate": 9.353151136601471e-05, + "loss": 0.4924, + "step": 2360 + }, + { + "epoch": 0.3792, + "grad_norm": 0.14568068087100983, + "learning_rate": 9.34676346433386e-05, + "loss": 0.5096, + "step": 2370 + }, + { + "epoch": 0.3808, + "grad_norm": 0.1653110682964325, + "learning_rate": 9.340346610942258e-05, + "loss": 0.4888, + "step": 2380 + }, + { + "epoch": 0.3824, + "grad_norm": 0.15464653074741364, + "learning_rate": 9.333900619504923e-05, + "loss": 0.4816, + "step": 2390 + }, + { + "epoch": 0.384, + "grad_norm": 0.1369209885597229, + "learning_rate": 9.327425533295724e-05, + "loss": 0.4838, + "step": 2400 + }, + { + "epoch": 0.3856, + "grad_norm": 0.17630405724048615, + "learning_rate": 9.32092139578385e-05, + "loss": 0.5075, + "step": 2410 + }, + { + "epoch": 0.3872, + "grad_norm": 0.13938814401626587, + "learning_rate": 9.314388250633526e-05, + "loss": 0.5009, + "step": 2420 + }, + { + "epoch": 0.3888, + "grad_norm": 0.15210223197937012, + "learning_rate": 9.30782614170371e-05, + "loss": 0.494, + "step": 2430 + }, + { + "epoch": 0.3904, + "grad_norm": 0.16148371994495392, + "learning_rate": 9.301235113047802e-05, + "loss": 0.4902, + "step": 2440 + }, + { + "epoch": 0.392, + "grad_norm": 0.14204657077789307, + "learning_rate": 9.294615208913348e-05, + "loss": 0.4984, + "step": 2450 + }, + { + "epoch": 0.3936, + "grad_norm": 0.16954472661018372, + "learning_rate": 9.287966473741751e-05, + "loss": 0.5018, + "step": 2460 + }, + { + "epoch": 0.3952, + "grad_norm": 0.15013043582439423, + "learning_rate": 9.281288952167956e-05, + "loss": 0.5118, + "step": 2470 + }, + { + "epoch": 0.3968, + "grad_norm": 0.1555643230676651, + "learning_rate": 9.274582689020164e-05, + "loss": 0.4891, + "step": 2480 + }, + { + "epoch": 0.3984, + "grad_norm": 0.14124083518981934, + "learning_rate": 9.267847729319528e-05, + "loss": 0.4852, + "step": 2490 + }, + { + "epoch": 0.4, + "grad_norm": 0.14627353847026825, + "learning_rate": 9.261084118279847e-05, + "loss": 0.4767, + "step": 2500 + }, + { + "epoch": 0.4016, + "grad_norm": 0.162032350897789, + "learning_rate": 9.254291901307266e-05, + "loss": 0.5018, + "step": 2510 + }, + { + "epoch": 0.4032, + "grad_norm": 0.1579425036907196, + "learning_rate": 9.24747112399997e-05, + "loss": 0.4997, + "step": 2520 + }, + { + "epoch": 0.4048, + "grad_norm": 0.14606937766075134, + "learning_rate": 9.24062183214788e-05, + "loss": 0.486, + "step": 2530 + }, + { + "epoch": 0.4064, + "grad_norm": 0.17731498181819916, + "learning_rate": 9.23374407173234e-05, + "loss": 0.4883, + "step": 2540 + }, + { + "epoch": 0.408, + "grad_norm": 0.160239577293396, + "learning_rate": 9.226837888925813e-05, + "loss": 0.491, + "step": 2550 + }, + { + "epoch": 0.4096, + "grad_norm": 
0.15299195051193237, + "learning_rate": 9.219903330091575e-05, + "loss": 0.4969, + "step": 2560 + }, + { + "epoch": 0.4112, + "grad_norm": 0.1531609743833542, + "learning_rate": 9.212940441783391e-05, + "loss": 0.4873, + "step": 2570 + }, + { + "epoch": 0.4128, + "grad_norm": 0.1488921195268631, + "learning_rate": 9.205949270745217e-05, + "loss": 0.5022, + "step": 2580 + }, + { + "epoch": 0.4144, + "grad_norm": 0.15338288247585297, + "learning_rate": 9.198929863910874e-05, + "loss": 0.4984, + "step": 2590 + }, + { + "epoch": 0.416, + "grad_norm": 0.14224661886692047, + "learning_rate": 9.191882268403743e-05, + "loss": 0.4825, + "step": 2600 + }, + { + "epoch": 0.4176, + "grad_norm": 0.1488322913646698, + "learning_rate": 9.184806531536437e-05, + "loss": 0.4944, + "step": 2610 + }, + { + "epoch": 0.4192, + "grad_norm": 0.16006579995155334, + "learning_rate": 9.177702700810501e-05, + "loss": 0.502, + "step": 2620 + }, + { + "epoch": 0.4208, + "grad_norm": 0.16268739104270935, + "learning_rate": 9.170570823916074e-05, + "loss": 0.4932, + "step": 2630 + }, + { + "epoch": 0.4224, + "grad_norm": 0.1719304919242859, + "learning_rate": 9.16341094873158e-05, + "loss": 0.4969, + "step": 2640 + }, + { + "epoch": 0.424, + "grad_norm": 0.1372731626033783, + "learning_rate": 9.156223123323405e-05, + "loss": 0.4969, + "step": 2650 + }, + { + "epoch": 0.4256, + "grad_norm": 0.15454748272895813, + "learning_rate": 9.149007395945569e-05, + "loss": 0.4991, + "step": 2660 + }, + { + "epoch": 0.4272, + "grad_norm": 0.17319807410240173, + "learning_rate": 9.141763815039412e-05, + "loss": 0.5064, + "step": 2670 + }, + { + "epoch": 0.4288, + "grad_norm": 0.15817826986312866, + "learning_rate": 9.134492429233261e-05, + "loss": 0.4986, + "step": 2680 + }, + { + "epoch": 0.4304, + "grad_norm": 0.14886827766895294, + "learning_rate": 9.127193287342102e-05, + "loss": 0.5196, + "step": 2690 + }, + { + "epoch": 0.432, + "grad_norm": 0.1564101129770279, + "learning_rate": 9.119866438367263e-05, + "loss": 0.4983, + "step": 2700 + }, + { + "epoch": 0.4336, + "grad_norm": 0.15275469422340393, + "learning_rate": 9.112511931496071e-05, + "loss": 0.495, + "step": 2710 + }, + { + "epoch": 0.4352, + "grad_norm": 0.14362412691116333, + "learning_rate": 9.10512981610153e-05, + "loss": 0.5152, + "step": 2720 + }, + { + "epoch": 0.4368, + "grad_norm": 0.16199463605880737, + "learning_rate": 9.097720141741994e-05, + "loss": 0.4921, + "step": 2730 + }, + { + "epoch": 0.4384, + "grad_norm": 0.17098335921764374, + "learning_rate": 9.090282958160823e-05, + "loss": 0.5078, + "step": 2740 + }, + { + "epoch": 0.44, + "grad_norm": 0.1529405117034912, + "learning_rate": 9.082818315286055e-05, + "loss": 0.479, + "step": 2750 + }, + { + "epoch": 0.4416, + "grad_norm": 0.15313425660133362, + "learning_rate": 9.075326263230073e-05, + "loss": 0.5036, + "step": 2760 + }, + { + "epoch": 0.4432, + "grad_norm": 0.1497267782688141, + "learning_rate": 9.067806852289263e-05, + "loss": 0.4959, + "step": 2770 + }, + { + "epoch": 0.4448, + "grad_norm": 0.15258878469467163, + "learning_rate": 9.060260132943682e-05, + "loss": 0.5022, + "step": 2780 + }, + { + "epoch": 0.4464, + "grad_norm": 0.15484480559825897, + "learning_rate": 9.052686155856716e-05, + "loss": 0.487, + "step": 2790 + }, + { + "epoch": 0.448, + "grad_norm": 0.134060338139534, + "learning_rate": 9.045084971874738e-05, + "loss": 0.4715, + "step": 2800 + }, + { + "epoch": 0.4496, + "grad_norm": 0.1444830596446991, + "learning_rate": 9.037456632026773e-05, + "loss": 0.4914, + "step": 2810 + }, + 
{ + "epoch": 0.4512, + "grad_norm": 0.14150072634220123, + "learning_rate": 9.029801187524147e-05, + "loss": 0.5039, + "step": 2820 + }, + { + "epoch": 0.4528, + "grad_norm": 0.1608240157365799, + "learning_rate": 9.022118689760152e-05, + "loss": 0.4912, + "step": 2830 + }, + { + "epoch": 0.4544, + "grad_norm": 0.14713039994239807, + "learning_rate": 9.014409190309695e-05, + "loss": 0.4813, + "step": 2840 + }, + { + "epoch": 0.456, + "grad_norm": 0.13848505914211273, + "learning_rate": 9.006672740928952e-05, + "loss": 0.4911, + "step": 2850 + }, + { + "epoch": 0.4576, + "grad_norm": 0.1491885781288147, + "learning_rate": 8.998909393555021e-05, + "loss": 0.4747, + "step": 2860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.15007364749908447, + "learning_rate": 8.99111920030558e-05, + "loss": 0.4982, + "step": 2870 + }, + { + "epoch": 0.4608, + "grad_norm": 0.14453564584255219, + "learning_rate": 8.983302213478523e-05, + "loss": 0.4994, + "step": 2880 + }, + { + "epoch": 0.4624, + "grad_norm": 0.152187317609787, + "learning_rate": 8.975458485551629e-05, + "loss": 0.4854, + "step": 2890 + }, + { + "epoch": 0.464, + "grad_norm": 0.15166030824184418, + "learning_rate": 8.967588069182185e-05, + "loss": 0.5095, + "step": 2900 + }, + { + "epoch": 0.4656, + "grad_norm": 0.14143046736717224, + "learning_rate": 8.959691017206653e-05, + "loss": 0.4863, + "step": 2910 + }, + { + "epoch": 0.4672, + "grad_norm": 0.15788449347019196, + "learning_rate": 8.951767382640307e-05, + "loss": 0.4829, + "step": 2920 + }, + { + "epoch": 0.4688, + "grad_norm": 0.15448391437530518, + "learning_rate": 8.943817218676877e-05, + "loss": 0.4968, + "step": 2930 + }, + { + "epoch": 0.4704, + "grad_norm": 0.14346468448638916, + "learning_rate": 8.935840578688191e-05, + "loss": 0.5016, + "step": 2940 + }, + { + "epoch": 0.472, + "grad_norm": 0.14543242752552032, + "learning_rate": 8.927837516223824e-05, + "loss": 0.4854, + "step": 2950 + }, + { + "epoch": 0.4736, + "grad_norm": 0.15722206234931946, + "learning_rate": 8.919808085010726e-05, + "loss": 0.5265, + "step": 2960 + }, + { + "epoch": 0.4752, + "grad_norm": 0.136716827750206, + "learning_rate": 8.911752338952875e-05, + "loss": 0.4795, + "step": 2970 + }, + { + "epoch": 0.4768, + "grad_norm": 0.1450624167919159, + "learning_rate": 8.903670332130901e-05, + "loss": 0.5143, + "step": 2980 + }, + { + "epoch": 0.4784, + "grad_norm": 0.17104503512382507, + "learning_rate": 8.895562118801738e-05, + "loss": 0.5128, + "step": 2990 + }, + { + "epoch": 0.48, + "grad_norm": 0.16369347274303436, + "learning_rate": 8.887427753398248e-05, + "loss": 0.5031, + "step": 3000 + }, + { + "epoch": 0.4816, + "grad_norm": 0.17435304820537567, + "learning_rate": 8.879267290528859e-05, + "loss": 0.509, + "step": 3010 + }, + { + "epoch": 0.4832, + "grad_norm": 0.15486499667167664, + "learning_rate": 8.871080784977199e-05, + "loss": 0.4923, + "step": 3020 + }, + { + "epoch": 0.4848, + "grad_norm": 0.13806629180908203, + "learning_rate": 8.862868291701735e-05, + "loss": 0.4808, + "step": 3030 + }, + { + "epoch": 0.4864, + "grad_norm": 0.13489525020122528, + "learning_rate": 8.854629865835387e-05, + "loss": 0.4899, + "step": 3040 + }, + { + "epoch": 0.488, + "grad_norm": 0.13588005304336548, + "learning_rate": 8.846365562685177e-05, + "loss": 0.4947, + "step": 3050 + }, + { + "epoch": 0.4896, + "grad_norm": 0.14742115139961243, + "learning_rate": 8.838075437731843e-05, + "loss": 0.496, + "step": 3060 + }, + { + "epoch": 0.4912, + "grad_norm": 0.13850219547748566, + "learning_rate": 
8.829759546629475e-05, + "loss": 0.5047, + "step": 3070 + }, + { + "epoch": 0.4928, + "grad_norm": 0.1488957554101944, + "learning_rate": 8.82141794520514e-05, + "loss": 0.491, + "step": 3080 + }, + { + "epoch": 0.4944, + "grad_norm": 0.15855398774147034, + "learning_rate": 8.813050689458502e-05, + "loss": 0.4955, + "step": 3090 + }, + { + "epoch": 0.496, + "grad_norm": 0.15771140158176422, + "learning_rate": 8.804657835561456e-05, + "loss": 0.4974, + "step": 3100 + }, + { + "epoch": 0.4976, + "grad_norm": 0.1374940276145935, + "learning_rate": 8.79623943985774e-05, + "loss": 0.4993, + "step": 3110 + }, + { + "epoch": 0.4992, + "grad_norm": 0.13341200351715088, + "learning_rate": 8.787795558862566e-05, + "loss": 0.4742, + "step": 3120 + }, + { + "epoch": 0.5008, + "grad_norm": 0.14264991879463196, + "learning_rate": 8.77932624926223e-05, + "loss": 0.4906, + "step": 3130 + }, + { + "epoch": 0.5024, + "grad_norm": 0.1297423094511032, + "learning_rate": 8.770831567913746e-05, + "loss": 0.4628, + "step": 3140 + }, + { + "epoch": 0.504, + "grad_norm": 0.18045248091220856, + "learning_rate": 8.762311571844451e-05, + "loss": 0.5279, + "step": 3150 + }, + { + "epoch": 0.5056, + "grad_norm": 0.15919335186481476, + "learning_rate": 8.753766318251628e-05, + "loss": 0.4792, + "step": 3160 + }, + { + "epoch": 0.5072, + "grad_norm": 0.1516360193490982, + "learning_rate": 8.745195864502122e-05, + "loss": 0.4979, + "step": 3170 + }, + { + "epoch": 0.5088, + "grad_norm": 0.1668049842119217, + "learning_rate": 8.736600268131953e-05, + "loss": 0.4881, + "step": 3180 + }, + { + "epoch": 0.5104, + "grad_norm": 0.15729983150959015, + "learning_rate": 8.727979586845931e-05, + "loss": 0.4888, + "step": 3190 + }, + { + "epoch": 0.512, + "grad_norm": 0.1290794163942337, + "learning_rate": 8.719333878517273e-05, + "loss": 0.481, + "step": 3200 + }, + { + "epoch": 0.5136, + "grad_norm": 0.13955941796302795, + "learning_rate": 8.710663201187203e-05, + "loss": 0.5035, + "step": 3210 + }, + { + "epoch": 0.5152, + "grad_norm": 0.16280502080917358, + "learning_rate": 8.701967613064575e-05, + "loss": 0.4696, + "step": 3220 + }, + { + "epoch": 0.5168, + "grad_norm": 0.14852701127529144, + "learning_rate": 8.693247172525471e-05, + "loss": 0.4743, + "step": 3230 + }, + { + "epoch": 0.5184, + "grad_norm": 0.15226416289806366, + "learning_rate": 8.684501938112821e-05, + "loss": 0.5162, + "step": 3240 + }, + { + "epoch": 0.52, + "grad_norm": 0.1534862518310547, + "learning_rate": 8.675731968536002e-05, + "loss": 0.4883, + "step": 3250 + }, + { + "epoch": 0.5216, + "grad_norm": 0.16827471554279327, + "learning_rate": 8.666937322670442e-05, + "loss": 0.4954, + "step": 3260 + }, + { + "epoch": 0.5232, + "grad_norm": 0.14639480412006378, + "learning_rate": 8.658118059557231e-05, + "loss": 0.4967, + "step": 3270 + }, + { + "epoch": 0.5248, + "grad_norm": 0.1429436057806015, + "learning_rate": 8.649274238402723e-05, + "loss": 0.4938, + "step": 3280 + }, + { + "epoch": 0.5264, + "grad_norm": 0.14089152216911316, + "learning_rate": 8.640405918578134e-05, + "loss": 0.4746, + "step": 3290 + }, + { + "epoch": 0.528, + "grad_norm": 0.13779769837856293, + "learning_rate": 8.631513159619151e-05, + "loss": 0.4827, + "step": 3300 + }, + { + "epoch": 0.5296, + "grad_norm": 0.13641296327114105, + "learning_rate": 8.622596021225524e-05, + "loss": 0.4951, + "step": 3310 + }, + { + "epoch": 0.5312, + "grad_norm": 0.1407901793718338, + "learning_rate": 8.613654563260674e-05, + "loss": 0.4972, + "step": 3320 + }, + { + "epoch": 0.5328, + "grad_norm": 
0.1497213989496231, + "learning_rate": 8.604688845751282e-05, + "loss": 0.4767, + "step": 3330 + }, + { + "epoch": 0.5344, + "grad_norm": 0.15180891752243042, + "learning_rate": 8.595698928886894e-05, + "loss": 0.5095, + "step": 3340 + }, + { + "epoch": 0.536, + "grad_norm": 0.14883661270141602, + "learning_rate": 8.586684873019513e-05, + "loss": 0.4699, + "step": 3350 + }, + { + "epoch": 0.5376, + "grad_norm": 0.15242557227611542, + "learning_rate": 8.577646738663192e-05, + "loss": 0.4972, + "step": 3360 + }, + { + "epoch": 0.5392, + "grad_norm": 0.14527393877506256, + "learning_rate": 8.568584586493634e-05, + "loss": 0.486, + "step": 3370 + }, + { + "epoch": 0.5408, + "grad_norm": 0.13982641696929932, + "learning_rate": 8.559498477347776e-05, + "loss": 0.4676, + "step": 3380 + }, + { + "epoch": 0.5424, + "grad_norm": 0.15055052936077118, + "learning_rate": 8.550388472223391e-05, + "loss": 0.4923, + "step": 3390 + }, + { + "epoch": 0.544, + "grad_norm": 0.14700070023536682, + "learning_rate": 8.541254632278665e-05, + "loss": 0.4961, + "step": 3400 + }, + { + "epoch": 0.5456, + "grad_norm": 0.13797661662101746, + "learning_rate": 8.532097018831805e-05, + "loss": 0.4658, + "step": 3410 + }, + { + "epoch": 0.5472, + "grad_norm": 0.16799747943878174, + "learning_rate": 8.522915693360606e-05, + "loss": 0.4676, + "step": 3420 + }, + { + "epoch": 0.5488, + "grad_norm": 0.14375045895576477, + "learning_rate": 8.513710717502056e-05, + "loss": 0.4924, + "step": 3430 + }, + { + "epoch": 0.5504, + "grad_norm": 0.16271580755710602, + "learning_rate": 8.504482153051912e-05, + "loss": 0.4839, + "step": 3440 + }, + { + "epoch": 0.552, + "grad_norm": 0.15062017738819122, + "learning_rate": 8.495230061964288e-05, + "loss": 0.4846, + "step": 3450 + }, + { + "epoch": 0.5536, + "grad_norm": 0.15407858788967133, + "learning_rate": 8.485954506351241e-05, + "loss": 0.4801, + "step": 3460 + }, + { + "epoch": 0.5552, + "grad_norm": 0.16920284926891327, + "learning_rate": 8.476655548482353e-05, + "loss": 0.4766, + "step": 3470 + }, + { + "epoch": 0.5568, + "grad_norm": 0.16469761729240417, + "learning_rate": 8.467333250784308e-05, + "loss": 0.4972, + "step": 3480 + }, + { + "epoch": 0.5584, + "grad_norm": 0.14731143414974213, + "learning_rate": 8.457987675840484e-05, + "loss": 0.4749, + "step": 3490 + }, + { + "epoch": 0.56, + "grad_norm": 0.17362773418426514, + "learning_rate": 8.448618886390522e-05, + "loss": 0.4804, + "step": 3500 + }, + { + "epoch": 0.5616, + "grad_norm": 0.13967199623584747, + "learning_rate": 8.439226945329907e-05, + "loss": 0.5027, + "step": 3510 + }, + { + "epoch": 0.5632, + "grad_norm": 0.15841037034988403, + "learning_rate": 8.42981191570955e-05, + "loss": 0.4982, + "step": 3520 + }, + { + "epoch": 0.5648, + "grad_norm": 0.1695510447025299, + "learning_rate": 8.420373860735366e-05, + "loss": 0.4914, + "step": 3530 + }, + { + "epoch": 0.5664, + "grad_norm": 0.17265760898590088, + "learning_rate": 8.410912843767837e-05, + "loss": 0.4863, + "step": 3540 + }, + { + "epoch": 0.568, + "grad_norm": 0.15655963122844696, + "learning_rate": 8.401428928321607e-05, + "loss": 0.4918, + "step": 3550 + }, + { + "epoch": 0.5696, + "grad_norm": 0.14493335783481598, + "learning_rate": 8.391922178065036e-05, + "loss": 0.4996, + "step": 3560 + }, + { + "epoch": 0.5712, + "grad_norm": 0.17128992080688477, + "learning_rate": 8.382392656819785e-05, + "loss": 0.4907, + "step": 3570 + }, + { + "epoch": 0.5728, + "grad_norm": 0.15563562512397766, + "learning_rate": 8.372840428560378e-05, + "loss": 0.4733, + 
"step": 3580 + }, + { + "epoch": 0.5744, + "grad_norm": 0.1321481317281723, + "learning_rate": 8.363265557413786e-05, + "loss": 0.4844, + "step": 3590 + }, + { + "epoch": 0.576, + "grad_norm": 0.15822237730026245, + "learning_rate": 8.353668107658984e-05, + "loss": 0.4769, + "step": 3600 + }, + { + "epoch": 0.5776, + "grad_norm": 0.14847378432750702, + "learning_rate": 8.344048143726523e-05, + "loss": 0.5, + "step": 3610 + }, + { + "epoch": 0.5792, + "grad_norm": 0.1387806236743927, + "learning_rate": 8.334405730198101e-05, + "loss": 0.4784, + "step": 3620 + }, + { + "epoch": 0.5808, + "grad_norm": 0.13971810042858124, + "learning_rate": 8.324740931806124e-05, + "loss": 0.4759, + "step": 3630 + }, + { + "epoch": 0.5824, + "grad_norm": 0.1577521711587906, + "learning_rate": 8.315053813433279e-05, + "loss": 0.4624, + "step": 3640 + }, + { + "epoch": 0.584, + "grad_norm": 0.14834187924861908, + "learning_rate": 8.305344440112089e-05, + "loss": 0.4561, + "step": 3650 + }, + { + "epoch": 0.5856, + "grad_norm": 0.132356658577919, + "learning_rate": 8.295612877024482e-05, + "loss": 0.4872, + "step": 3660 + }, + { + "epoch": 0.5872, + "grad_norm": 0.13213378190994263, + "learning_rate": 8.285859189501352e-05, + "loss": 0.4861, + "step": 3670 + }, + { + "epoch": 0.5888, + "grad_norm": 0.14484821259975433, + "learning_rate": 8.276083443022126e-05, + "loss": 0.4788, + "step": 3680 + }, + { + "epoch": 0.5904, + "grad_norm": 0.16415955126285553, + "learning_rate": 8.266285703214315e-05, + "loss": 0.4902, + "step": 3690 + }, + { + "epoch": 0.592, + "grad_norm": 0.15873567759990692, + "learning_rate": 8.256466035853076e-05, + "loss": 0.4728, + "step": 3700 + }, + { + "epoch": 0.5936, + "grad_norm": 0.16248160600662231, + "learning_rate": 8.246624506860779e-05, + "loss": 0.4827, + "step": 3710 + }, + { + "epoch": 0.5952, + "grad_norm": 0.14072096347808838, + "learning_rate": 8.23676118230655e-05, + "loss": 0.4471, + "step": 3720 + }, + { + "epoch": 0.5968, + "grad_norm": 0.157021626830101, + "learning_rate": 8.226876128405837e-05, + "loss": 0.4691, + "step": 3730 + }, + { + "epoch": 0.5984, + "grad_norm": 0.15769942104816437, + "learning_rate": 8.21696941151997e-05, + "loss": 0.4515, + "step": 3740 + }, + { + "epoch": 0.6, + "grad_norm": 0.14718876779079437, + "learning_rate": 8.2070410981557e-05, + "loss": 0.4865, + "step": 3750 + }, + { + "epoch": 0.6016, + "grad_norm": 0.1689719259738922, + "learning_rate": 8.197091254964768e-05, + "loss": 0.4936, + "step": 3760 + }, + { + "epoch": 0.6032, + "grad_norm": 0.14521871507167816, + "learning_rate": 8.18711994874345e-05, + "loss": 0.4768, + "step": 3770 + }, + { + "epoch": 0.6048, + "grad_norm": 0.15074457228183746, + "learning_rate": 8.177127246432106e-05, + "loss": 0.4701, + "step": 3780 + }, + { + "epoch": 0.6064, + "grad_norm": 0.14675575494766235, + "learning_rate": 8.167113215114738e-05, + "loss": 0.4782, + "step": 3790 + }, + { + "epoch": 0.608, + "grad_norm": 0.1510019302368164, + "learning_rate": 8.157077922018537e-05, + "loss": 0.465, + "step": 3800 + }, + { + "epoch": 0.6096, + "grad_norm": 0.1646314412355423, + "learning_rate": 8.147021434513426e-05, + "loss": 0.5005, + "step": 3810 + }, + { + "epoch": 0.6112, + "grad_norm": 0.14409051835536957, + "learning_rate": 8.136943820111615e-05, + "loss": 0.4736, + "step": 3820 + }, + { + "epoch": 0.6128, + "grad_norm": 0.12928612530231476, + "learning_rate": 8.12684514646715e-05, + "loss": 0.4635, + "step": 3830 + }, + { + "epoch": 0.6144, + "grad_norm": 0.14266784489154816, + "learning_rate": 
8.116725481375446e-05, + "loss": 0.4997, + "step": 3840 + }, + { + "epoch": 0.616, + "grad_norm": 0.1464674174785614, + "learning_rate": 8.106584892772844e-05, + "loss": 0.4972, + "step": 3850 + }, + { + "epoch": 0.6176, + "grad_norm": 0.14282941818237305, + "learning_rate": 8.096423448736149e-05, + "loss": 0.4855, + "step": 3860 + }, + { + "epoch": 0.6192, + "grad_norm": 0.1639098823070526, + "learning_rate": 8.086241217482177e-05, + "loss": 0.4934, + "step": 3870 + }, + { + "epoch": 0.6208, + "grad_norm": 0.13832098245620728, + "learning_rate": 8.076038267367292e-05, + "loss": 0.4652, + "step": 3880 + }, + { + "epoch": 0.6224, + "grad_norm": 0.11946088075637817, + "learning_rate": 8.065814666886954e-05, + "loss": 0.4737, + "step": 3890 + }, + { + "epoch": 0.624, + "grad_norm": 0.14873822033405304, + "learning_rate": 8.055570484675251e-05, + "loss": 0.4812, + "step": 3900 + }, + { + "epoch": 0.6256, + "grad_norm": 0.14197267591953278, + "learning_rate": 8.045305789504444e-05, + "loss": 0.4716, + "step": 3910 + }, + { + "epoch": 0.6272, + "grad_norm": 0.17008569836616516, + "learning_rate": 8.035020650284506e-05, + "loss": 0.4775, + "step": 3920 + }, + { + "epoch": 0.6288, + "grad_norm": 0.14778374135494232, + "learning_rate": 8.02471513606265e-05, + "loss": 0.486, + "step": 3930 + }, + { + "epoch": 0.6304, + "grad_norm": 0.13922783732414246, + "learning_rate": 8.014389316022881e-05, + "loss": 0.4749, + "step": 3940 + }, + { + "epoch": 0.632, + "grad_norm": 0.15005655586719513, + "learning_rate": 8.004043259485519e-05, + "loss": 0.4814, + "step": 3950 + }, + { + "epoch": 0.6336, + "grad_norm": 0.15039877593517303, + "learning_rate": 7.993677035906735e-05, + "loss": 0.4857, + "step": 3960 + }, + { + "epoch": 0.6352, + "grad_norm": 0.14078804850578308, + "learning_rate": 7.983290714878092e-05, + "loss": 0.4988, + "step": 3970 + }, + { + "epoch": 0.6368, + "grad_norm": 0.17190280556678772, + "learning_rate": 7.972884366126072e-05, + "loss": 0.4853, + "step": 3980 + }, + { + "epoch": 0.6384, + "grad_norm": 0.15270432829856873, + "learning_rate": 7.962458059511606e-05, + "loss": 0.4892, + "step": 3990 + }, + { + "epoch": 0.64, + "grad_norm": 0.16630981862545013, + "learning_rate": 7.952011865029614e-05, + "loss": 0.4972, + "step": 4000 + }, + { + "epoch": 0.6416, + "grad_norm": 0.15123958885669708, + "learning_rate": 7.941545852808522e-05, + "loss": 0.4531, + "step": 4010 + }, + { + "epoch": 0.6432, + "grad_norm": 0.14224116504192352, + "learning_rate": 7.931060093109807e-05, + "loss": 0.456, + "step": 4020 + }, + { + "epoch": 0.6448, + "grad_norm": 0.1724097728729248, + "learning_rate": 7.920554656327508e-05, + "loss": 0.471, + "step": 4030 + }, + { + "epoch": 0.6464, + "grad_norm": 0.14767609536647797, + "learning_rate": 7.910029612987766e-05, + "loss": 0.4738, + "step": 4040 + }, + { + "epoch": 0.648, + "grad_norm": 0.153291255235672, + "learning_rate": 7.89948503374835e-05, + "loss": 0.4594, + "step": 4050 + }, + { + "epoch": 0.6496, + "grad_norm": 0.15384319424629211, + "learning_rate": 7.888920989398174e-05, + "loss": 0.4906, + "step": 4060 + }, + { + "epoch": 0.6512, + "grad_norm": 0.16041874885559082, + "learning_rate": 7.878337550856829e-05, + "loss": 0.4907, + "step": 4070 + }, + { + "epoch": 0.6528, + "grad_norm": 0.15995880961418152, + "learning_rate": 7.867734789174104e-05, + "loss": 0.4875, + "step": 4080 + }, + { + "epoch": 0.6544, + "grad_norm": 0.16801008582115173, + "learning_rate": 7.857112775529514e-05, + "loss": 0.4655, + "step": 4090 + }, + { + "epoch": 0.656, + 
"grad_norm": 0.1646784543991089, + "learning_rate": 7.846471581231814e-05, + "loss": 0.4961, + "step": 4100 + }, + { + "epoch": 0.6576, + "grad_norm": 0.17190945148468018, + "learning_rate": 7.835811277718527e-05, + "loss": 0.4638, + "step": 4110 + }, + { + "epoch": 0.6592, + "grad_norm": 0.17251329123973846, + "learning_rate": 7.82513193655546e-05, + "loss": 0.469, + "step": 4120 + }, + { + "epoch": 0.6608, + "grad_norm": 0.14122401177883148, + "learning_rate": 7.814433629436225e-05, + "loss": 0.4842, + "step": 4130 + }, + { + "epoch": 0.6624, + "grad_norm": 0.13392092287540436, + "learning_rate": 7.803716428181763e-05, + "loss": 0.4775, + "step": 4140 + }, + { + "epoch": 0.664, + "grad_norm": 0.14145219326019287, + "learning_rate": 7.792980404739848e-05, + "loss": 0.4755, + "step": 4150 + }, + { + "epoch": 0.6656, + "grad_norm": 0.13244092464447021, + "learning_rate": 7.782225631184623e-05, + "loss": 0.4823, + "step": 4160 + }, + { + "epoch": 0.6672, + "grad_norm": 0.1631832867860794, + "learning_rate": 7.771452179716099e-05, + "loss": 0.4695, + "step": 4170 + }, + { + "epoch": 0.6688, + "grad_norm": 0.15262052416801453, + "learning_rate": 7.760660122659682e-05, + "loss": 0.4711, + "step": 4180 + }, + { + "epoch": 0.6704, + "grad_norm": 0.15986715257167816, + "learning_rate": 7.749849532465677e-05, + "loss": 0.4751, + "step": 4190 + }, + { + "epoch": 0.672, + "grad_norm": 0.16850730776786804, + "learning_rate": 7.739020481708815e-05, + "loss": 0.4836, + "step": 4200 + }, + { + "epoch": 0.6736, + "grad_norm": 0.1642541140317917, + "learning_rate": 7.728173043087755e-05, + "loss": 0.4622, + "step": 4210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.1534961462020874, + "learning_rate": 7.717307289424594e-05, + "loss": 0.4721, + "step": 4220 + }, + { + "epoch": 0.6768, + "grad_norm": 0.14704926311969757, + "learning_rate": 7.706423293664391e-05, + "loss": 0.491, + "step": 4230 + }, + { + "epoch": 0.6784, + "grad_norm": 0.13047677278518677, + "learning_rate": 7.695521128874668e-05, + "loss": 0.4511, + "step": 4240 + }, + { + "epoch": 0.68, + "grad_norm": 0.12621961534023285, + "learning_rate": 7.68460086824492e-05, + "loss": 0.4924, + "step": 4250 + }, + { + "epoch": 0.6816, + "grad_norm": 0.16819994151592255, + "learning_rate": 7.673662585086123e-05, + "loss": 0.4663, + "step": 4260 + }, + { + "epoch": 0.6832, + "grad_norm": 0.17435623705387115, + "learning_rate": 7.662706352830245e-05, + "loss": 0.4703, + "step": 4270 + }, + { + "epoch": 0.6848, + "grad_norm": 0.13229072093963623, + "learning_rate": 7.651732245029753e-05, + "loss": 0.4805, + "step": 4280 + }, + { + "epoch": 0.6864, + "grad_norm": 0.14928007125854492, + "learning_rate": 7.640740335357115e-05, + "loss": 0.4697, + "step": 4290 + }, + { + "epoch": 0.688, + "grad_norm": 0.1565156728029251, + "learning_rate": 7.629730697604314e-05, + "loss": 0.4744, + "step": 4300 + }, + { + "epoch": 0.6896, + "grad_norm": 0.16777831315994263, + "learning_rate": 7.618703405682341e-05, + "loss": 0.4627, + "step": 4310 + }, + { + "epoch": 0.6912, + "grad_norm": 0.14447970688343048, + "learning_rate": 7.607658533620707e-05, + "loss": 0.4576, + "step": 4320 + }, + { + "epoch": 0.6928, + "grad_norm": 0.1756659299135208, + "learning_rate": 7.596596155566942e-05, + "loss": 0.462, + "step": 4330 + }, + { + "epoch": 0.6944, + "grad_norm": 0.1511956751346588, + "learning_rate": 7.585516345786103e-05, + "loss": 0.487, + "step": 4340 + }, + { + "epoch": 0.696, + "grad_norm": 0.1530904918909073, + "learning_rate": 7.574419178660268e-05, + "loss": 0.4645, + 
"step": 4350 + }, + { + "epoch": 0.6976, + "grad_norm": 0.15749047696590424, + "learning_rate": 7.56330472868804e-05, + "loss": 0.4756, + "step": 4360 + }, + { + "epoch": 0.6992, + "grad_norm": 0.1609058678150177, + "learning_rate": 7.552173070484048e-05, + "loss": 0.4567, + "step": 4370 + }, + { + "epoch": 0.7008, + "grad_norm": 0.14234240353107452, + "learning_rate": 7.541024278778446e-05, + "loss": 0.4708, + "step": 4380 + }, + { + "epoch": 0.7024, + "grad_norm": 0.14794382452964783, + "learning_rate": 7.52985842841641e-05, + "loss": 0.4836, + "step": 4390 + }, + { + "epoch": 0.704, + "grad_norm": 0.14040929079055786, + "learning_rate": 7.518675594357633e-05, + "loss": 0.4698, + "step": 4400 + }, + { + "epoch": 0.7056, + "grad_norm": 0.16733860969543457, + "learning_rate": 7.507475851675827e-05, + "loss": 0.4795, + "step": 4410 + }, + { + "epoch": 0.7072, + "grad_norm": 0.1319868117570877, + "learning_rate": 7.496259275558217e-05, + "loss": 0.4934, + "step": 4420 + }, + { + "epoch": 0.7088, + "grad_norm": 0.12977701425552368, + "learning_rate": 7.485025941305036e-05, + "loss": 0.4674, + "step": 4430 + }, + { + "epoch": 0.7104, + "grad_norm": 0.1638069450855255, + "learning_rate": 7.473775924329017e-05, + "loss": 0.4672, + "step": 4440 + }, + { + "epoch": 0.712, + "grad_norm": 0.16847467422485352, + "learning_rate": 7.462509300154892e-05, + "loss": 0.4771, + "step": 4450 + }, + { + "epoch": 0.7136, + "grad_norm": 0.1610843539237976, + "learning_rate": 7.45122614441888e-05, + "loss": 0.4678, + "step": 4460 + }, + { + "epoch": 0.7152, + "grad_norm": 0.15362519025802612, + "learning_rate": 7.439926532868183e-05, + "loss": 0.4644, + "step": 4470 + }, + { + "epoch": 0.7168, + "grad_norm": 0.1431635618209839, + "learning_rate": 7.428610541360474e-05, + "loss": 0.4767, + "step": 4480 + }, + { + "epoch": 0.7184, + "grad_norm": 0.1660245805978775, + "learning_rate": 7.41727824586339e-05, + "loss": 0.5056, + "step": 4490 + }, + { + "epoch": 0.72, + "grad_norm": 0.1396573930978775, + "learning_rate": 7.405929722454026e-05, + "loss": 0.4542, + "step": 4500 + }, + { + "epoch": 0.7216, + "grad_norm": 0.1382286697626114, + "learning_rate": 7.39456504731841e-05, + "loss": 0.4813, + "step": 4510 + }, + { + "epoch": 0.7232, + "grad_norm": 0.13782280683517456, + "learning_rate": 7.383184296751013e-05, + "loss": 0.4497, + "step": 4520 + }, + { + "epoch": 0.7248, + "grad_norm": 0.14329640567302704, + "learning_rate": 7.371787547154216e-05, + "loss": 0.4683, + "step": 4530 + }, + { + "epoch": 0.7264, + "grad_norm": 0.1624203324317932, + "learning_rate": 7.36037487503781e-05, + "loss": 0.477, + "step": 4540 + }, + { + "epoch": 0.728, + "grad_norm": 0.1574970781803131, + "learning_rate": 7.348946357018479e-05, + "loss": 0.4756, + "step": 4550 + }, + { + "epoch": 0.7296, + "grad_norm": 0.15991832315921783, + "learning_rate": 7.337502069819286e-05, + "loss": 0.4802, + "step": 4560 + }, + { + "epoch": 0.7312, + "grad_norm": 0.13380931317806244, + "learning_rate": 7.326042090269151e-05, + "loss": 0.4727, + "step": 4570 + }, + { + "epoch": 0.7328, + "grad_norm": 0.14259736239910126, + "learning_rate": 7.314566495302353e-05, + "loss": 0.4536, + "step": 4580 + }, + { + "epoch": 0.7344, + "grad_norm": 0.16807004809379578, + "learning_rate": 7.303075361957992e-05, + "loss": 0.485, + "step": 4590 + }, + { + "epoch": 0.736, + "grad_norm": 0.15170413255691528, + "learning_rate": 7.291568767379484e-05, + "loss": 0.4826, + "step": 4600 + }, + { + "epoch": 0.7376, + "grad_norm": 0.14543898403644562, + "learning_rate": 
7.280046788814044e-05, + "loss": 0.4883, + "step": 4610 + }, + { + "epoch": 0.7392, + "grad_norm": 0.1284785270690918, + "learning_rate": 7.268509503612162e-05, + "loss": 0.4503, + "step": 4620 + }, + { + "epoch": 0.7408, + "grad_norm": 0.1725163757801056, + "learning_rate": 7.256956989227084e-05, + "loss": 0.4695, + "step": 4630 + }, + { + "epoch": 0.7424, + "grad_norm": 0.16216342151165009, + "learning_rate": 7.2453893232143e-05, + "loss": 0.4584, + "step": 4640 + }, + { + "epoch": 0.744, + "grad_norm": 0.16917464137077332, + "learning_rate": 7.233806583231011e-05, + "loss": 0.477, + "step": 4650 + }, + { + "epoch": 0.7456, + "grad_norm": 0.1455005705356598, + "learning_rate": 7.22220884703562e-05, + "loss": 0.4728, + "step": 4660 + }, + { + "epoch": 0.7472, + "grad_norm": 0.14221833646297455, + "learning_rate": 7.210596192487198e-05, + "loss": 0.4606, + "step": 4670 + }, + { + "epoch": 0.7488, + "grad_norm": 0.1557261198759079, + "learning_rate": 7.19896869754497e-05, + "loss": 0.4709, + "step": 4680 + }, + { + "epoch": 0.7504, + "grad_norm": 0.1663047969341278, + "learning_rate": 7.18732644026779e-05, + "loss": 0.4908, + "step": 4690 + }, + { + "epoch": 0.752, + "grad_norm": 0.169974684715271, + "learning_rate": 7.175669498813617e-05, + "loss": 0.5036, + "step": 4700 + }, + { + "epoch": 0.7536, + "grad_norm": 0.18103328347206116, + "learning_rate": 7.163997951438986e-05, + "loss": 0.4765, + "step": 4710 + }, + { + "epoch": 0.7552, + "grad_norm": 0.1532663255929947, + "learning_rate": 7.152311876498487e-05, + "loss": 0.4647, + "step": 4720 + }, + { + "epoch": 0.7568, + "grad_norm": 0.1575092375278473, + "learning_rate": 7.14061135244424e-05, + "loss": 0.494, + "step": 4730 + }, + { + "epoch": 0.7584, + "grad_norm": 0.17228181660175323, + "learning_rate": 7.128896457825364e-05, + "loss": 0.462, + "step": 4740 + }, + { + "epoch": 0.76, + "grad_norm": 0.1586621254682541, + "learning_rate": 7.117167271287453e-05, + "loss": 0.4739, + "step": 4750 + }, + { + "epoch": 0.7616, + "grad_norm": 0.14224156737327576, + "learning_rate": 7.105423871572044e-05, + "loss": 0.482, + "step": 4760 + }, + { + "epoch": 0.7632, + "grad_norm": 0.15228785574436188, + "learning_rate": 7.093666337516094e-05, + "loss": 0.4862, + "step": 4770 + }, + { + "epoch": 0.7648, + "grad_norm": 0.15366891026496887, + "learning_rate": 7.081894748051451e-05, + "loss": 0.4817, + "step": 4780 + }, + { + "epoch": 0.7664, + "grad_norm": 0.1466524302959442, + "learning_rate": 7.070109182204316e-05, + "loss": 0.4733, + "step": 4790 + }, + { + "epoch": 0.768, + "grad_norm": 0.14818432927131653, + "learning_rate": 7.05830971909472e-05, + "loss": 0.4731, + "step": 4800 + }, + { + "epoch": 0.7696, + "grad_norm": 0.16463568806648254, + "learning_rate": 7.046496437935989e-05, + "loss": 0.466, + "step": 4810 + }, + { + "epoch": 0.7712, + "grad_norm": 0.16732560098171234, + "learning_rate": 7.034669418034216e-05, + "loss": 0.4577, + "step": 4820 + }, + { + "epoch": 0.7728, + "grad_norm": 0.15019118785858154, + "learning_rate": 7.022828738787724e-05, + "loss": 0.4733, + "step": 4830 + }, + { + "epoch": 0.7744, + "grad_norm": 0.15039797127246857, + "learning_rate": 7.010974479686538e-05, + "loss": 0.4593, + "step": 4840 + }, + { + "epoch": 0.776, + "grad_norm": 0.1453026980161667, + "learning_rate": 6.999106720311845e-05, + "loss": 0.4535, + "step": 4850 + }, + { + "epoch": 0.7776, + "grad_norm": 0.15475086867809296, + "learning_rate": 6.987225540335467e-05, + "loss": 0.4812, + "step": 4860 + }, + { + "epoch": 0.7792, + "grad_norm": 
0.14673496782779694, + "learning_rate": 6.975331019519322e-05, + "loss": 0.4763, + "step": 4870 + }, + { + "epoch": 0.7808, + "grad_norm": 0.15719452500343323, + "learning_rate": 6.963423237714883e-05, + "loss": 0.4641, + "step": 4880 + }, + { + "epoch": 0.7824, + "grad_norm": 0.1439889371395111, + "learning_rate": 6.951502274862655e-05, + "loss": 0.4605, + "step": 4890 + }, + { + "epoch": 0.784, + "grad_norm": 0.1485874503850937, + "learning_rate": 6.939568210991633e-05, + "loss": 0.4898, + "step": 4900 + }, + { + "epoch": 0.7856, + "grad_norm": 0.17086032032966614, + "learning_rate": 6.927621126218755e-05, + "loss": 0.4797, + "step": 4910 + }, + { + "epoch": 0.7872, + "grad_norm": 0.15497176349163055, + "learning_rate": 6.915661100748378e-05, + "loss": 0.503, + "step": 4920 + }, + { + "epoch": 0.7888, + "grad_norm": 0.15688523650169373, + "learning_rate": 6.903688214871734e-05, + "loss": 0.4708, + "step": 4930 + }, + { + "epoch": 0.7904, + "grad_norm": 0.15009275078773499, + "learning_rate": 6.891702548966385e-05, + "loss": 0.4714, + "step": 4940 + }, + { + "epoch": 0.792, + "grad_norm": 0.15371015667915344, + "learning_rate": 6.879704183495695e-05, + "loss": 0.4541, + "step": 4950 + }, + { + "epoch": 0.7936, + "grad_norm": 0.13578854501247406, + "learning_rate": 6.867693199008285e-05, + "loss": 0.4715, + "step": 4960 + }, + { + "epoch": 0.7952, + "grad_norm": 0.1727481335401535, + "learning_rate": 6.855669676137483e-05, + "loss": 0.4631, + "step": 4970 + }, + { + "epoch": 0.7968, + "grad_norm": 0.15292920172214508, + "learning_rate": 6.843633695600802e-05, + "loss": 0.4895, + "step": 4980 + }, + { + "epoch": 0.7984, + "grad_norm": 0.16094942390918732, + "learning_rate": 6.831585338199376e-05, + "loss": 0.4796, + "step": 4990 + }, + { + "epoch": 0.8, + "grad_norm": 0.13955985009670258, + "learning_rate": 6.819524684817438e-05, + "loss": 0.478, + "step": 5000 + }, + { + "epoch": 0.8016, + "grad_norm": 0.15332014858722687, + "learning_rate": 6.807451816421761e-05, + "loss": 0.47, + "step": 5010 + }, + { + "epoch": 0.8032, + "grad_norm": 0.15463106334209442, + "learning_rate": 6.795366814061126e-05, + "loss": 0.4539, + "step": 5020 + }, + { + "epoch": 0.8048, + "grad_norm": 0.16516199707984924, + "learning_rate": 6.783269758865768e-05, + "loss": 0.4832, + "step": 5030 + }, + { + "epoch": 0.8064, + "grad_norm": 0.15162187814712524, + "learning_rate": 6.77116073204684e-05, + "loss": 0.4623, + "step": 5040 + }, + { + "epoch": 0.808, + "grad_norm": 0.15436415374279022, + "learning_rate": 6.759039814895862e-05, + "loss": 0.475, + "step": 5050 + }, + { + "epoch": 0.8096, + "grad_norm": 0.14143790304660797, + "learning_rate": 6.746907088784182e-05, + "loss": 0.4713, + "step": 5060 + }, + { + "epoch": 0.8112, + "grad_norm": 0.16152924299240112, + "learning_rate": 6.734762635162418e-05, + "loss": 0.4623, + "step": 5070 + }, + { + "epoch": 0.8128, + "grad_norm": 0.17818845808506012, + "learning_rate": 6.72260653555992e-05, + "loss": 0.4701, + "step": 5080 + }, + { + "epoch": 0.8144, + "grad_norm": 0.16604343056678772, + "learning_rate": 6.710438871584224e-05, + "loss": 0.4824, + "step": 5090 + }, + { + "epoch": 0.816, + "grad_norm": 0.1397034227848053, + "learning_rate": 6.698259724920502e-05, + "loss": 0.4584, + "step": 5100 + }, + { + "epoch": 0.8176, + "grad_norm": 0.15765774250030518, + "learning_rate": 6.68606917733101e-05, + "loss": 0.4649, + "step": 5110 + }, + { + "epoch": 0.8192, + "grad_norm": 0.1569841355085373, + "learning_rate": 6.673867310654538e-05, + "loss": 0.4856, + "step": 5120 + 
}, + { + "epoch": 0.8208, + "grad_norm": 0.14227326214313507, + "learning_rate": 6.661654206805873e-05, + "loss": 0.4537, + "step": 5130 + }, + { + "epoch": 0.8224, + "grad_norm": 0.1603560745716095, + "learning_rate": 6.649429947775236e-05, + "loss": 0.4884, + "step": 5140 + }, + { + "epoch": 0.824, + "grad_norm": 0.17131942510604858, + "learning_rate": 6.637194615627733e-05, + "loss": 0.4747, + "step": 5150 + }, + { + "epoch": 0.8256, + "grad_norm": 0.14724785089492798, + "learning_rate": 6.624948292502814e-05, + "loss": 0.4617, + "step": 5160 + }, + { + "epoch": 0.8272, + "grad_norm": 0.1343044489622116, + "learning_rate": 6.61269106061371e-05, + "loss": 0.4434, + "step": 5170 + }, + { + "epoch": 0.8288, + "grad_norm": 0.15071623027324677, + "learning_rate": 6.600423002246885e-05, + "loss": 0.4659, + "step": 5180 + }, + { + "epoch": 0.8304, + "grad_norm": 0.16061842441558838, + "learning_rate": 6.588144199761487e-05, + "loss": 0.4736, + "step": 5190 + }, + { + "epoch": 0.832, + "grad_norm": 0.1504916101694107, + "learning_rate": 6.575854735588794e-05, + "loss": 0.4694, + "step": 5200 + }, + { + "epoch": 0.8336, + "grad_norm": 0.14578945934772491, + "learning_rate": 6.563554692231655e-05, + "loss": 0.4583, + "step": 5210 + }, + { + "epoch": 0.8352, + "grad_norm": 0.1559389978647232, + "learning_rate": 6.55124415226394e-05, + "loss": 0.4703, + "step": 5220 + }, + { + "epoch": 0.8368, + "grad_norm": 0.15339642763137817, + "learning_rate": 6.538923198329993e-05, + "loss": 0.4568, + "step": 5230 + }, + { + "epoch": 0.8384, + "grad_norm": 0.16523930430412292, + "learning_rate": 6.526591913144061e-05, + "loss": 0.4813, + "step": 5240 + }, + { + "epoch": 0.84, + "grad_norm": 0.12947815656661987, + "learning_rate": 6.514250379489753e-05, + "loss": 0.4725, + "step": 5250 + }, + { + "epoch": 0.8416, + "grad_norm": 0.14968736469745636, + "learning_rate": 6.50189868021948e-05, + "loss": 0.468, + "step": 5260 + }, + { + "epoch": 0.8432, + "grad_norm": 0.1549982726573944, + "learning_rate": 6.489536898253893e-05, + "loss": 0.4557, + "step": 5270 + }, + { + "epoch": 0.8448, + "grad_norm": 0.14880022406578064, + "learning_rate": 6.477165116581333e-05, + "loss": 0.4569, + "step": 5280 + }, + { + "epoch": 0.8464, + "grad_norm": 0.1381607949733734, + "learning_rate": 6.464783418257277e-05, + "loss": 0.4798, + "step": 5290 + }, + { + "epoch": 0.848, + "grad_norm": 0.15493975579738617, + "learning_rate": 6.452391886403767e-05, + "loss": 0.4702, + "step": 5300 + }, + { + "epoch": 0.8496, + "grad_norm": 0.15621016919612885, + "learning_rate": 6.439990604208868e-05, + "loss": 0.4854, + "step": 5310 + }, + { + "epoch": 0.8512, + "grad_norm": 0.15215294063091278, + "learning_rate": 6.427579654926095e-05, + "loss": 0.4739, + "step": 5320 + }, + { + "epoch": 0.8528, + "grad_norm": 0.1679297685623169, + "learning_rate": 6.415159121873868e-05, + "loss": 0.4703, + "step": 5330 + }, + { + "epoch": 0.8544, + "grad_norm": 0.15487776696681976, + "learning_rate": 6.402729088434942e-05, + "loss": 0.4668, + "step": 5340 + }, + { + "epoch": 0.856, + "grad_norm": 0.13464778661727905, + "learning_rate": 6.390289638055851e-05, + "loss": 0.4454, + "step": 5350 + }, + { + "epoch": 0.8576, + "grad_norm": 0.17225205898284912, + "learning_rate": 6.377840854246348e-05, + "loss": 0.4665, + "step": 5360 + }, + { + "epoch": 0.8592, + "grad_norm": 0.15738821029663086, + "learning_rate": 6.365382820578845e-05, + "loss": 0.4743, + "step": 5370 + }, + { + "epoch": 0.8608, + "grad_norm": 0.15247198939323425, + "learning_rate": 
6.352915620687847e-05, + "loss": 0.4693, + "step": 5380 + }, + { + "epoch": 0.8624, + "grad_norm": 0.15310163795948029, + "learning_rate": 6.340439338269402e-05, + "loss": 0.4819, + "step": 5390 + }, + { + "epoch": 0.864, + "grad_norm": 0.1685699075460434, + "learning_rate": 6.327954057080526e-05, + "loss": 0.4662, + "step": 5400 + }, + { + "epoch": 0.8656, + "grad_norm": 0.15499986708164215, + "learning_rate": 6.315459860938649e-05, + "loss": 0.4794, + "step": 5410 + }, + { + "epoch": 0.8672, + "grad_norm": 0.14956675469875336, + "learning_rate": 6.302956833721047e-05, + "loss": 0.464, + "step": 5420 + }, + { + "epoch": 0.8688, + "grad_norm": 0.14995618164539337, + "learning_rate": 6.290445059364286e-05, + "loss": 0.4621, + "step": 5430 + }, + { + "epoch": 0.8704, + "grad_norm": 0.1638861447572708, + "learning_rate": 6.277924621863649e-05, + "loss": 0.475, + "step": 5440 + }, + { + "epoch": 0.872, + "grad_norm": 0.1652149260044098, + "learning_rate": 6.265395605272581e-05, + "loss": 0.4889, + "step": 5450 + }, + { + "epoch": 0.8736, + "grad_norm": 0.1676662415266037, + "learning_rate": 6.252858093702121e-05, + "loss": 0.4776, + "step": 5460 + }, + { + "epoch": 0.8752, + "grad_norm": 0.14927372336387634, + "learning_rate": 6.240312171320336e-05, + "loss": 0.4753, + "step": 5470 + }, + { + "epoch": 0.8768, + "grad_norm": 0.13902117311954498, + "learning_rate": 6.227757922351755e-05, + "loss": 0.4695, + "step": 5480 + }, + { + "epoch": 0.8784, + "grad_norm": 0.15907002985477448, + "learning_rate": 6.215195431076813e-05, + "loss": 0.4844, + "step": 5490 + }, + { + "epoch": 0.88, + "grad_norm": 0.1425957828760147, + "learning_rate": 6.202624781831268e-05, + "loss": 0.4507, + "step": 5500 + }, + { + "epoch": 0.8816, + "grad_norm": 0.1457531601190567, + "learning_rate": 6.190046059005655e-05, + "loss": 0.4652, + "step": 5510 + }, + { + "epoch": 0.8832, + "grad_norm": 0.15740548074245453, + "learning_rate": 6.177459347044703e-05, + "loss": 0.4645, + "step": 5520 + }, + { + "epoch": 0.8848, + "grad_norm": 0.15774016082286835, + "learning_rate": 6.164864730446776e-05, + "loss": 0.4551, + "step": 5530 + }, + { + "epoch": 0.8864, + "grad_norm": 0.16875840723514557, + "learning_rate": 6.152262293763305e-05, + "loss": 0.4795, + "step": 5540 + }, + { + "epoch": 0.888, + "grad_norm": 0.16172195971012115, + "learning_rate": 6.139652121598218e-05, + "loss": 0.4631, + "step": 5550 + }, + { + "epoch": 0.8896, + "grad_norm": 0.15295062959194183, + "learning_rate": 6.127034298607375e-05, + "loss": 0.4631, + "step": 5560 + }, + { + "epoch": 0.8912, + "grad_norm": 0.14107844233512878, + "learning_rate": 6.114408909497998e-05, + "loss": 0.4628, + "step": 5570 + }, + { + "epoch": 0.8928, + "grad_norm": 0.14286963641643524, + "learning_rate": 6.101776039028104e-05, + "loss": 0.4518, + "step": 5580 + }, + { + "epoch": 0.8944, + "grad_norm": 0.16553056240081787, + "learning_rate": 6.0891357720059314e-05, + "loss": 0.4793, + "step": 5590 + }, + { + "epoch": 0.896, + "grad_norm": 0.12574143707752228, + "learning_rate": 6.076488193289375e-05, + "loss": 0.4481, + "step": 5600 + }, + { + "epoch": 0.8976, + "grad_norm": 0.15329398214817047, + "learning_rate": 6.063833387785418e-05, + "loss": 0.4733, + "step": 5610 + }, + { + "epoch": 0.8992, + "grad_norm": 0.174740269780159, + "learning_rate": 6.0511714404495546e-05, + "loss": 0.4699, + "step": 5620 + }, + { + "epoch": 0.9008, + "grad_norm": 0.16872422397136688, + "learning_rate": 6.038502436285227e-05, + "loss": 0.4397, + "step": 5630 + }, + { + "epoch": 0.9024, + 
"grad_norm": 0.1316106766462326, + "learning_rate": 6.0258264603432526e-05, + "loss": 0.465, + "step": 5640 + }, + { + "epoch": 0.904, + "grad_norm": 0.15435636043548584, + "learning_rate": 6.013143597721251e-05, + "loss": 0.4681, + "step": 5650 + }, + { + "epoch": 0.9056, + "grad_norm": 0.15449753403663635, + "learning_rate": 6.000453933563075e-05, + "loss": 0.4642, + "step": 5660 + }, + { + "epoch": 0.9072, + "grad_norm": 0.15895044803619385, + "learning_rate": 5.987757553058236e-05, + "loss": 0.4414, + "step": 5670 + }, + { + "epoch": 0.9088, + "grad_norm": 0.16449132561683655, + "learning_rate": 5.9750545414413403e-05, + "loss": 0.4552, + "step": 5680 + }, + { + "epoch": 0.9104, + "grad_norm": 0.15979304909706116, + "learning_rate": 5.962344983991503e-05, + "loss": 0.4612, + "step": 5690 + }, + { + "epoch": 0.912, + "grad_norm": 0.13905392587184906, + "learning_rate": 5.949628966031785e-05, + "loss": 0.4649, + "step": 5700 + }, + { + "epoch": 0.9136, + "grad_norm": 0.1556766778230667, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.4714, + "step": 5710 + }, + { + "epoch": 0.9152, + "grad_norm": 0.14686590433120728, + "learning_rate": 5.92417789009125e-05, + "loss": 0.4825, + "step": 5720 + }, + { + "epoch": 0.9168, + "grad_norm": 0.13739673793315887, + "learning_rate": 5.911443002971122e-05, + "loss": 0.4496, + "step": 5730 + }, + { + "epoch": 0.9184, + "grad_norm": 0.15686804056167603, + "learning_rate": 5.898701997061349e-05, + "loss": 0.4551, + "step": 5740 + }, + { + "epoch": 0.92, + "grad_norm": 0.16496874392032623, + "learning_rate": 5.885954957896115e-05, + "loss": 0.4605, + "step": 5750 + }, + { + "epoch": 0.9216, + "grad_norm": 0.14325810968875885, + "learning_rate": 5.873201971050107e-05, + "loss": 0.4388, + "step": 5760 + }, + { + "epoch": 0.9232, + "grad_norm": 0.15459012985229492, + "learning_rate": 5.860443122137946e-05, + "loss": 0.457, + "step": 5770 + }, + { + "epoch": 0.9248, + "grad_norm": 0.1705654412508011, + "learning_rate": 5.847678496813601e-05, + "loss": 0.4556, + "step": 5780 + }, + { + "epoch": 0.9264, + "grad_norm": 0.15087370574474335, + "learning_rate": 5.8349081807698236e-05, + "loss": 0.4772, + "step": 5790 + }, + { + "epoch": 0.928, + "grad_norm": 0.14736686646938324, + "learning_rate": 5.822132259737565e-05, + "loss": 0.4503, + "step": 5800 + }, + { + "epoch": 0.9296, + "grad_norm": 0.1365375965833664, + "learning_rate": 5.809350819485407e-05, + "loss": 0.4562, + "step": 5810 + }, + { + "epoch": 0.9312, + "grad_norm": 0.1499178409576416, + "learning_rate": 5.7965639458189835e-05, + "loss": 0.4862, + "step": 5820 + }, + { + "epoch": 0.9328, + "grad_norm": 0.1491120159626007, + "learning_rate": 5.7837717245804045e-05, + "loss": 0.4397, + "step": 5830 + }, + { + "epoch": 0.9344, + "grad_norm": 0.13986369967460632, + "learning_rate": 5.7709742416476785e-05, + "loss": 0.4557, + "step": 5840 + }, + { + "epoch": 0.936, + "grad_norm": 0.14495722949504852, + "learning_rate": 5.7581715829341396e-05, + "loss": 0.4729, + "step": 5850 + }, + { + "epoch": 0.9376, + "grad_norm": 0.14987362921237946, + "learning_rate": 5.7453638343878665e-05, + "loss": 0.4531, + "step": 5860 + }, + { + "epoch": 0.9392, + "grad_norm": 0.14620471000671387, + "learning_rate": 5.732551081991109e-05, + "loss": 0.4637, + "step": 5870 + }, + { + "epoch": 0.9408, + "grad_norm": 0.15326456725597382, + "learning_rate": 5.719733411759707e-05, + "loss": 0.4611, + "step": 5880 + }, + { + "epoch": 0.9424, + "grad_norm": 0.15329191088676453, + "learning_rate": 5.706910909742518e-05, + "loss": 
0.4519, + "step": 5890 + }, + { + "epoch": 0.944, + "grad_norm": 0.1496082991361618, + "learning_rate": 5.6940836620208346e-05, + "loss": 0.4573, + "step": 5900 + }, + { + "epoch": 0.9456, + "grad_norm": 0.15664245188236237, + "learning_rate": 5.6812517547078094e-05, + "loss": 0.4684, + "step": 5910 + }, + { + "epoch": 0.9472, + "grad_norm": 0.13982662558555603, + "learning_rate": 5.668415273947876e-05, + "loss": 0.453, + "step": 5920 + }, + { + "epoch": 0.9488, + "grad_norm": 0.172904372215271, + "learning_rate": 5.655574305916172e-05, + "loss": 0.4758, + "step": 5930 + }, + { + "epoch": 0.9504, + "grad_norm": 0.15912851691246033, + "learning_rate": 5.6427289368179605e-05, + "loss": 0.4601, + "step": 5940 + }, + { + "epoch": 0.952, + "grad_norm": 0.14622333645820618, + "learning_rate": 5.629879252888046e-05, + "loss": 0.4596, + "step": 5950 + }, + { + "epoch": 0.9536, + "grad_norm": 0.16883589327335358, + "learning_rate": 5.617025340390203e-05, + "loss": 0.4559, + "step": 5960 + }, + { + "epoch": 0.9552, + "grad_norm": 0.16833968460559845, + "learning_rate": 5.604167285616593e-05, + "loss": 0.452, + "step": 5970 + }, + { + "epoch": 0.9568, + "grad_norm": 0.15931782126426697, + "learning_rate": 5.591305174887185e-05, + "loss": 0.464, + "step": 5980 + }, + { + "epoch": 0.9584, + "grad_norm": 0.13519136607646942, + "learning_rate": 5.578439094549178e-05, + "loss": 0.4726, + "step": 5990 + }, + { + "epoch": 0.96, + "grad_norm": 0.1617356389760971, + "learning_rate": 5.565569130976422e-05, + "loss": 0.4706, + "step": 6000 + }, + { + "epoch": 0.9616, + "grad_norm": 0.1612035483121872, + "learning_rate": 5.55269537056883e-05, + "loss": 0.4574, + "step": 6010 + }, + { + "epoch": 0.9632, + "grad_norm": 0.1506652981042862, + "learning_rate": 5.539817899751812e-05, + "loss": 0.4845, + "step": 6020 + }, + { + "epoch": 0.9648, + "grad_norm": 0.14405718445777893, + "learning_rate": 5.526936804975681e-05, + "loss": 0.4625, + "step": 6030 + }, + { + "epoch": 0.9664, + "grad_norm": 0.14939121901988983, + "learning_rate": 5.5140521727150805e-05, + "loss": 0.447, + "step": 6040 + }, + { + "epoch": 0.968, + "grad_norm": 0.1367347687482834, + "learning_rate": 5.5011640894684056e-05, + "loss": 0.4487, + "step": 6050 + }, + { + "epoch": 0.9696, + "grad_norm": 0.15077663958072662, + "learning_rate": 5.488272641757215e-05, + "loss": 0.4618, + "step": 6060 + }, + { + "epoch": 0.9712, + "grad_norm": 0.16199123859405518, + "learning_rate": 5.475377916125655e-05, + "loss": 0.4659, + "step": 6070 + }, + { + "epoch": 0.9728, + "grad_norm": 0.13599364459514618, + "learning_rate": 5.4624799991398766e-05, + "loss": 0.4601, + "step": 6080 + }, + { + "epoch": 0.9744, + "grad_norm": 0.18672606348991394, + "learning_rate": 5.4495789773874594e-05, + "loss": 0.457, + "step": 6090 + }, + { + "epoch": 0.976, + "grad_norm": 0.16099564731121063, + "learning_rate": 5.43667493747682e-05, + "loss": 0.4561, + "step": 6100 + }, + { + "epoch": 0.9776, + "grad_norm": 0.14696004986763, + "learning_rate": 5.4237679660366435e-05, + "loss": 0.4632, + "step": 6110 + }, + { + "epoch": 0.9792, + "grad_norm": 0.16183456778526306, + "learning_rate": 5.410858149715289e-05, + "loss": 0.4562, + "step": 6120 + }, + { + "epoch": 0.9808, + "grad_norm": 0.16029495000839233, + "learning_rate": 5.3979455751802175e-05, + "loss": 0.4599, + "step": 6130 + }, + { + "epoch": 0.9824, + "grad_norm": 0.15214140713214874, + "learning_rate": 5.385030329117408e-05, + "loss": 0.4527, + "step": 6140 + }, + { + "epoch": 0.984, + "grad_norm": 0.16601146757602692, + 
"learning_rate": 5.37211249823077e-05, + "loss": 0.4594, + "step": 6150 + }, + { + "epoch": 0.9856, + "grad_norm": 0.17921248078346252, + "learning_rate": 5.3591921692415704e-05, + "loss": 0.4625, + "step": 6160 + }, + { + "epoch": 0.9872, + "grad_norm": 0.13406029343605042, + "learning_rate": 5.346269428887842e-05, + "loss": 0.4416, + "step": 6170 + }, + { + "epoch": 0.9888, + "grad_norm": 0.1382516622543335, + "learning_rate": 5.3333443639238104e-05, + "loss": 0.449, + "step": 6180 + }, + { + "epoch": 0.9904, + "grad_norm": 0.1678723245859146, + "learning_rate": 5.320417061119303e-05, + "loss": 0.4747, + "step": 6190 + }, + { + "epoch": 0.992, + "grad_norm": 0.1683942973613739, + "learning_rate": 5.307487607259175e-05, + "loss": 0.4601, + "step": 6200 + }, + { + "epoch": 0.9936, + "grad_norm": 0.13359014689922333, + "learning_rate": 5.294556089142716e-05, + "loss": 0.4588, + "step": 6210 + }, + { + "epoch": 0.9952, + "grad_norm": 0.18495921790599823, + "learning_rate": 5.28162259358308e-05, + "loss": 0.4629, + "step": 6220 + }, + { + "epoch": 0.9968, + "grad_norm": 0.17101769149303436, + "learning_rate": 5.2686872074066914e-05, + "loss": 0.4562, + "step": 6230 + }, + { + "epoch": 0.9984, + "grad_norm": 0.16476212441921234, + "learning_rate": 5.25575001745267e-05, + "loss": 0.4672, + "step": 6240 + }, + { + "epoch": 1.0, + "grad_norm": 0.15620839595794678, + "learning_rate": 5.242811110572242e-05, + "loss": 0.4441, + "step": 6250 + } + ], + "logging_steps": 10, + "max_steps": 12500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.627710223619768e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}